Source code for sdata.blob

# -*-coding: utf-8-*-

import logging
import collections
import pandas as pd
import numpy as np
from sdata.timestamp import TimeStamp
from sdata.data import Data
from sdata.metadata import Metadata, Attribute
import json
import os
import hashlib


[docs]class Blob(Data): """Binary Large Object as reference .. warning:: highly experimental""" VAULT_TYPES = ["filesystem", "hdf5", "db", "www"] def __init__(self, **kwargs): """Binary Large Object :param name: :param url: :param filetype: :param vault: :param kwargs: """ Data.__init__(self, **kwargs) self.metadata.add("blob_name", kwargs.get("name", "")) self.metadata.add("blob_type", "unknown") self.url = kwargs.get("url", "") self.metadata.add("blob_sha1", self.sha1) self.metadata.add("blob_md5", self.md5) self.metadata.add("vault", kwargs.get("vault", "")) self.metadata.add("blob_filetype", kwargs.get("filetype", "unknown")) def _get_url(self): # return self._name return self.metadata.get("blob_url").value def _set_url(self, value): if isinstance(value, str): try: self.metadata.set_attr("blob_url", str(value)[:256]) except ValueError as exp: logging.warning("blob.url: %s" % exp) else: # self._name = str(value)[:256] self.metadata.set_attr("blob_url", str(value)[:256]) url = property(fget=_get_url, fset=_set_url, doc="url of the blob")
[docs] def exists(self, vault="filesystem"): """Test whether a object under the blob.url exists. :param vault: :return: """ if vault == "filesystem": if os.path.exists(self.url): return True else: return False else: return False
@property def sha1(self): """calculate the sha1 hash of the blob :return: sha1 """ hash = hashlib.sha1() if self.url and os.path.exists(self.url): with open(self.url, "rb") as fh: self.update_hash(fh, hash) return hash.hexdigest() else: logging.error("can't open external url '{}'".format(str(self.url))) @property def md5(self): """calculate the md5 hash of the blob :return: sha1 """ hash = hashlib.md5() if self.url and os.path.exists(self.url): with open(self.url, "rb") as fh: self.update_hash(fh, hash) return hash.hexdigest() else: logging.error("can't open external url '{}'".format(str(self.url)))
[docs] def update_hash(self, fh, hashobject, buffer_size = 65536): """A hash represents the object used to calculate a checksum of a string of information. .. code-block:: python hashobject = hashlib.md5() df = pd.DataFrame([1,2,3]) url = "/tmp/blob.csv" df.to_csv(url) blob = sdata.Blob(url=url) fh = open(url, "rb") blob.update_hash(fh, hashobject) hashobject.hexdigest() :param fh: file handle :param hashobject: hash object, e.g. hashlib.sha1() :param buffer_size: buffer size (default buffer_size=65536) :return: hashobject """ with fh as f: while True: data = f.read(buffer_size) if not data: break hash.update(data) return hash