################################################################################
#
# TRIQS: a Toolbox for Research in Interacting Quantum Systems
#
# Copyright (C) 2011 by M. Ferrero, O. Parcollet
#
# TRIQS is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# TRIQS is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# TRIQS. If not, see <http://www.gnu.org/licenses/>.
#
################################################################################

import numpy
from hdf_archive_basic_layer_h5py import HDFArchiveGroupBasicLayer

from pytriqs.archive.hdf_archive_schemes import hdf_scheme_access_for_write, hdf_scheme_access_for_read, register_class

# -------------------------------------------
#
#  Various wrappers for basic python types.
#
# --------------------------------------------
class PythonListWrap:
    def __init__(self,ob) :
        self.ob = ob
    def __reduce_to_dict__(self) :
        return {str(n):v for n,v in enumerate(self.ob)}
    @classmethod
    def __factory_from_dict__(cls, name, D) :
        return [x for n,x in sorted([(int(n), x) for n,x in D.items()])]

class PythonTupleWrap:
    def __init__(self,ob) :
        self.ob = ob
    def __reduce_to_dict__(self) :
        return {str(n):v for n,v in enumerate(self.ob)}
    @classmethod
    def __factory_from_dict__(cls, name, D) :
        return tuple(x for n,x in sorted([(int(n), x) for n,x in D.items()]))

class PythonDictWrap:
    def __init__(self,ob) :
        self.ob = ob
    def __reduce_to_dict__(self) :
        return {str(n):v for n,v in self.ob.items()}
    @classmethod
    def __factory_from_dict__(cls, name, D) :
        return {n:x for n,x in D.items()}

register_class (PythonListWrap)
register_class (PythonTupleWrap)
register_class (PythonDictWrap)

# -------------------------------------------
#
#  A view of a subgroup of the archive
#
# --------------------------------------------

class HDFArchiveGroup (HDFArchiveGroupBasicLayer) :
    """
    """
    _wrappedType = {list : PythonListWrap, tuple : PythonTupleWrap, dict : PythonDictWrap}
    _MaxLengthKey = 500

    def __init__(self, parent, subpath) :
        # We want to hold a reference to the parent group, if we are not at the root
        # This will prevent a premature destruction of the root HDFArchive object
        if self is not parent : self.parent = parent
        self.options = parent.options
        HDFArchiveGroupBasicLayer.__init__(self, parent, subpath)
        self.key_as_string_only = self.options['key_as_string_only']
        self._reconstruct_python_objects = self.options['reconstruct_python_object']
        self.is_top_level = False

    #-------------------------------------------------------------------------
    def _key_cipher(self,key) :
        if key in self.ignored_keys :
            raise KeyError, "key %s is reserved"%key
        if self.key_as_string_only : # for backward compatibility
            if type(key) not in (str,unicode):
                raise KeyError, "Key must be a string !"
            return key
        r = repr(key)
        if len(r) > self._MaxLengthKey :
            raise KeyError, "The key is too long !"
        # check that the key can be reconstructed from its repr
        try :
            if eval(r) != key : raise KeyError
        except :
            raise KeyError, "The key *%s* cannot be serialized properly by repr !"%key
        return r

    #-------------------------------------------------------------------------
    def _key_decipher(self,key) :
        return key if self.key_as_string_only else eval(key)
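
    # Example of the (de)ciphering above, used when key_as_string_only is False:
    # a non-string key is stored under its repr and recovered with eval, so the
    # tuple key (1, 'up') becomes the HDF group name "(1, 'up')" and back:
    #
    #   >>> repr((1, 'up'))
    #   "(1, 'up')"
    #   >>> eval(repr((1, 'up'))) == (1, 'up')
    #   True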

    #-------------------------------------------------------------------------
    def __contains__(self,key) :
        key= self._key_cipher(key)
        return key in self.keys()

    #-------------------------------------------------------------------------
    def values(self) :
        """
        Generator returning the values in the group
        """
        def res() :
            for name in self.keys() :
                yield self[name]
        return res()

    #-------------------------------------------------------------------------
    def items(self) :
        """
        Generator returning the (key, value) pairs in the group.
        """
        def res() :
            for name in self.keys():
                yield name, self[name]
        return res()

    #-------------------------------------------------------------------------
    def __iter__(self) :
        """Returns the keys, like a dictionary"""
        def res() :
            for name in self.keys() :
                yield name
        return res()

    #-------------------------------------------------------------------------
    def __len__(self) :
        """Returns the length of the keys list """
        return  len(self.keys())

    #-------------------------------------------------------------------------
    def update(self,object_with_dict_protocol):
        for k,v in object_with_dict_protocol.items() : self[k] = v

    #-------------------------------------------------------------------------
    def __delitem__(self,key) :
        key= self._key_cipher(key)
        self._clean_key(key,True)

    #-------------------------------------------------------------------------
    def __setitem__(self,key,val) :
        assert '/' not in key, "'/' cannot be part of a key"
        key = self._key_cipher(key) # cipher the key (non-string keys are stored under their repr)

        if key in self.keys() :
            if self.options['do_not_overwrite_entries'] : raise KeyError, "key %s already exists."%key
            self._clean_key(key) # clean things

        # Transform list, dict, etc... into a wrapped type that will allow HDF reduction
        if type(val) in self._wrappedType: val = self._wrappedType[type(val)](val)

        # write the attributes
        def write_attributes(g) :
            """Use the _hdf5_data_scheme_ if it exists, otherwise the class name"""
            ds = val._hdf5_data_scheme_ if hasattr(val,"_hdf5_data_scheme_") else val.__class__.__name__
            try :
                sch = hdf_scheme_access_for_write(ds) # just checks that the scheme is registered
            except :
                err = """
                  You are trying to store an object of type "%s", with the TRIQS_HDF5_data_scheme "%s".
                  But that data scheme is not registered, so you will not be able to reread the class.
                  Did you forget to register your class in pytriqs.archive.hdf_archive_schemes?
                  """ %(val.__class__.__name__,ds)
                raise IOError, err
            g.write_attr("TRIQS_HDF5_data_scheme", ds)

        if hasattr(val,'__write_hdf5__') : # simplest protocol
            val.__write_hdf5__(self._group,key)
            self.cached_keys.append(key) # I need to do this here
            # Should be done in the __write_hdf5__ function
            #SUB = HDFArchiveGroup(self,key)
            #write_attributes(SUB)
        elif hasattr(val,'__reduce_to_dict__') : # Is it a HDF_compliant object
            self.create_group(key) # create a new group
            d = val.__reduce_to_dict__()
            if not isinstance(d,dict) : raise ValueError, "__reduce_to_dict__ did not return a dict. See the documentation !"
            SUB = HDFArchiveGroup(self,key)
            for n,v in d.items() : SUB[n] = v
            write_attributes(SUB)
        elif isinstance(val,numpy.ndarray) : # it is a numpy array
            try :
               self._write_array( key, numpy.array(val,copy=True,order='C') )
            except RuntimeError:
               print "HDFArchive is in trouble with the array %s"%val
               raise
        elif isinstance(val, HDFArchiveGroup) : # will copy the group recursively
            # we could add this for any object that has .items() in fact...
            SUB = HDFArchiveGroup(self, key)
            for k,v in val.items() : SUB[k]=v
        else : # anything else... expected to be a scalar
            try :
               self._write_scalar( key, val)
            except:
               raise #ValueError, "Value %s\n is not of a type suitable to storage in HDF file"%val
        self._flush()
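
    # A minimal sketch of the reduction protocol handled above, for a custom
    # class (the names MyData and arch are illustrative, not part of the library):
    #
    #   >>> class MyData:
    #   ...     def __init__(self, a, b): self.a, self.b = a, b
    #   ...     def __reduce_to_dict__(self): return {'a': self.a, 'b': self.b}
    #   ...     @classmethod
    #   ...     def __factory_from_dict__(cls, name, D): return cls(D['a'], D['b'])
    #   >>> register_class(MyData)    # required, otherwise __setitem__ raises IOError
    #   >>> arch['d'] = MyData(1, 2)  # stored as a subgroup with entries 'a' and 'b'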

    #-------------------------------------------------------------------------
    def get_raw (self,key):
        """Similar to __getitem__ but it does NOT reconstruct the python object,
        it presents it as a subgroup"""
        return self.__getitem1__(key,False)
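
    # For example (illustrative): if arch['l'] was stored from a python list,
    # arch['l'] rebuilds the list, while arch.get_raw('l') returns the raw
    # HDFArchiveGroup with keys '0', '1', ...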

    #-------------------------------------------------------------------------
    def __getitem__(self,key) :
        """Return the object key, possibly reconstructed as a python object if
        it has been properly set up"""
        # If the key contains /, grabs the subgroups
        if '/' in key:
            a,l =self, key.split('/')
            for s in l[:-1]: a = a.get_raw(s)
            return a[l[-1]]
        return self.__getitem1__(key,self._reconstruct_python_objects)
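
    # Note (illustrative): arch['a/b/c'] is equivalent to
    # arch.get_raw('a').get_raw('b')['c'] : the intermediate path components are
    # traversed as raw subgroups and only the leaf is reconstructed.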

    #-------------------------------------------------------------------------
    def __getitem1__(self, key, reconstruct_python_object, scheme = None) :

        if key not in self :
            key = self._key_cipher(key)
            if key not in self  : raise KeyError, "Key %s does not exist."%key

        if self.is_group(key) :
            SUB = HDFArchiveGroup(self,key) # View of the subgroup
            bare_return = lambda: SUB
        elif self.is_data(key) :
            bare_return = lambda: self._read(key)
        else :
            raise KeyError, "Key %s is of unknown type !!"%Key

        if not reconstruct_python_object : return bare_return()

        # try to find the scheme
        try :
            hdf_data_scheme = scheme if scheme else self._group[key].attrs["TRIQS_HDF5_data_scheme"]
        except:
            return bare_return()
        try :
            sch, group_to_scheme = hdf_scheme_access_for_read(hdf_data_scheme)
        except KeyError:
            print "Warning : The TRIQS_HDF5_data_scheme %s is not recognized. Returning as a group. Hint : did you forgot to import this python class ?"%hdf_data_scheme
            return bare_return()

        r_class_name  = sch.classname
        r_module_name = sch.modulename
        r_readfun = sch.read_fun
        if not (r_class_name and r_module_name) : return bare_return()
        try :
            exec("from %s import %s as r_class" %(r_module_name,r_class_name)) in globals(), locals()
        except ImportError :
            raise RuntimeError, "I cannot find the class %s to reconstruct the object !"%r_class_name
        if r_readfun :
            return r_readfun(self._group,str(key)) # str transforms unicode string to regular python string
        if hasattr(r_class,"__factory_from_dict__"):
            assert self.is_group(key), "__factory_from_dict__ requires a subgroup"
            f = lambda K : SUB.__getitem1__(K, reconstruct_python_object, group_to_scheme.get(K, None) if group_to_scheme else None)
            values = {self._key_decipher(str(K)): f(K) for K in SUB}  # str transforms unicode string to regular python string
            return r_class.__factory_from_dict__(key,values)

        raise ValueError, "Impossible to reread the class %s for group %s and key %s"%(r_class_name,self, key)

    #---------------------------------------------------------------------------
    def __str__(self) :
        def pr(name) :
            if self.is_group(name) :
                return "%s : subgroup"%name
            elif self.is_data(name) : # can be an array or a number
                return "%s : data "%name
            else :
                raise ValueError, "oopps %s"%name

        s= "HDFArchive%s with the following content:\n"%(" (partial view)" if self.is_top_level else '')
        s+='\n'.join([ '  '+ pr(n) for n in self.keys() ])
        return s

    #-------------------------------------------------------------------------
    def __repr__(self) :
        return self.__str__()

    #-------------------------------------------------------------------------
    def apply_on_leaves (self,f) :
        """
           For each named leaf (name,value) of the tree, it calls f(name,value)
           f should return :
            - `None`                    : no action is taken
            - an `empty tuple` ()       : the leaf is removed from the tree
            - an hdf-compliant value    : the leaf is replaced by the value
        """
        def visit_tree(n,d):
          for k in d: # Loop over the subgroups in d
              if d.is_group(k) : visit_tree(k,d[k])
              else :
                  r = f(k,d[k])
                  if r == () : del d[k]         # test the empty tuple first: () is not None
                  elif r is not None : d[k] = r
        visit_tree('/',self['/'])
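
    # A sketch of apply_on_leaves usage (the names double_g and arch are illustrative):
    #
    #   >>> def double_g(name, value):
    #   ...     if name == 'g' : return 2*value   # replace the leaf 'g'
    #   ...     return None                       # leave every other leaf alone
    #   >>> arch.apply_on_leaves(double_g)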

    # These two methods are necessary for "with"
    def __enter__(self): return self
    def __exit__(self, type, value, traceback): pass

# -------------------------------------------
#
#  The main class
#
# --------------------------------------------

class HDFArchive(HDFArchiveGroup):
    """
    The main class: an HDF archive file, with a dict-like interface.
    """
    _class_version = "HDFArchive | 1.0"

    def __init__(self, url_name, open_flag = 'a', key_as_string_only = True,
                 reconstruct_python_object = True, init = {}):
        r"""
           Parameters
           -----------
           url_name : string
             The url of the hdf5 file.

               * If url is a simple string, it is interpreted as a local file name.

               * If url is a remote url (e.g. `http://ipht.cea.fr/triqs/data/single_site_bethe.output.h5` )
                 then the h5 file is downloaded to a temporary file and opened.
                 In that case, ``open_flag`` must be 'r', read-only mode.
                 The temporary file is deleted at exit.
           open_flag : string
             Legal modes : 'r', 'w', 'a' (default)
           key_as_string_only : True (default)
           init : any generator of (key, val) pairs, e.g. a dict.items().
             It will fill the archive with these values.

           Attributes
           ----------
           LocalFileName : string
             the name of the file or of the local downloaded copy
           url_name : string
             the name of the url

           Examples
           --------
           >>> # retrieve a remote archive (in read-only mode) :
           >>> h = HDFArchive( 'http://ipht.cea.fr/triqs/data/single_site_bethe.output.h5')
           >>>
           >>> # full copy of an archive
           >>> HDFArchive( f, 'w', init = HDFArchive(fmp,'r').items())
           >>>
           >>> # partial copy of the file of name fmp, with only the key 'G'
           >>> HDFArchive( f, 'w', init = [ (k,v) for (k,v) in HDFArchive(fmp,'r').items() if k in ['G'] ] )
           >>>
           >>> # faster version : the objects are only retrieved when needed
           >>> # (generator expression instead of a list comprehension)
           >>> HDFArchive( f, 'w', init = ( (k,v) for (k,v) in HDFArchive(fmp,'r').items() if k in ['G'] ) )
           >>>
           >>> # partial copy with processing on the fly with the P function
           >>> HDFArchive( f, 'w', init = ( (k,P(v)) for (k,v) in HDFArchive(fmp,'r').items() if k in ['G'] ) )
        """
        import os,os.path
        assert open_flag in ['r','w','a'], "Invalid mode"
        assert isinstance(url_name,str), "url_name must be a string"

        # If it is a url, retrieve it and check that the mode is read-only
        import urllib
        LocalFileName, http_message = urllib.urlretrieve(url_name) if open_flag == 'r' else (url_name, None)
        if LocalFileName != url_name : # this was not a local file, so it must be read-only
            assert open_flag == 'r', "The remote url %s is not a local file, so it must be opened read-only. Use the 'r' flag"%url_name

        if open_flag == 'w' :
            # destroys the file, ignoring errors
            try: os.remove(os.path.abspath(LocalFileName))
            except OSError: pass

        self._init_root( LocalFileName, open_flag)
        self.options = {'key_as_string_only' : key_as_string_only,
                        'do_not_overwrite_entries' : False,
                        'reconstruct_python_object': reconstruct_python_object,
                        'UseAlpsNotationForComplex' : True
                       }
        HDFArchiveGroup.__init__(self,self,"")
        self.is_top_level = True
        for k,v in init : self[k] = v

    def __del__(self):
        self._flush()
        self._close()

    # These two methods are necessary for "with"
    def __enter__(self): return self

    def __exit__(self, type, value, traceback):
        self._flush()
        self._close()
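
# The with-statement form (a sketch; 'results.h5' is an arbitrary file name):
#
#   >>> with HDFArchive('results.h5', 'w') as arch:
#   ...     arch['x'] = 1.5
#   >>> # the archive is flushed and closed on exit from the with block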
#--------------------------------------------------------------------------------

class HDFArchiveInert:
    """
    A fake archive for the slave nodes in MPI computations.
    It does nothing, but permits one to write simply :

        a = mpi.bcast(H['a'])   # runs on all nodes

    - __getitem__ returns self, so that H['a']['b'] is also valid
    - __setitem__ does nothing
    """
    def HDFArchive_Inert(self):
        pass
    def __getitem__(self,x)   : return self
    def __setitem__(self,k,v) : pass

#--------------------------------------------------------------------------------
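
# Sketch of the intended MPI usage pattern (assumes pytriqs.utility.mpi; the
# file name is illustrative):
#
#   >>> import pytriqs.utility.mpi as mpi
#   >>> H = HDFArchive('data.h5', 'r') if mpi.is_master_node() else HDFArchiveInert()
#   >>> a = mpi.bcast(H['a'])   # runs on all nodes: on slave nodes H['a'] is just H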