vds.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2013 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.
"""
High-level interface for creating HDF5 virtual datasets
"""
  12. from copy import deepcopy as copy
  13. from collections import namedtuple
  14. import numpy as np
  15. from .compat import filename_encode
  16. from .datatype import Datatype
  17. from .selections import SimpleSelection, select
  18. from .. import h5d, h5p, h5s, h5t, h5
  19. from .. import version
  20. class VDSmap(namedtuple('VDSmap', ('vspace', 'file_name',
  21. 'dset_name', 'src_space'))):
  22. '''Defines a region in a virtual dataset mapping to part of a source dataset
  23. '''
  24. vds_support = False
  25. hdf5_version = version.hdf5_version_tuple[0:3]
  26. if hdf5_version >= h5.get_config().vds_min_hdf5_version:
  27. vds_support = True
  28. def _convert_space_for_key(space, key):
  29. """
  30. Converts the space with the given key. Mainly used to allow unlimited
  31. dimensions in virtual space selection.
  32. """
  33. key = key if isinstance(key, tuple) else (key,)
  34. type_code = space.get_select_type()
  35. # check for unlimited selections in case where selection is regular
  36. # hyperslab, which is the only allowed case for h5s.UNLIMITED to be
  37. # in the selection
  38. if type_code == h5s.SEL_HYPERSLABS and space.is_regular_hyperslab():
  39. rank = space.get_simple_extent_ndims()
  40. nargs = len(key)
  41. idx_offset = 0
  42. start, stride, count, block = space.get_regular_hyperslab()
  43. # iterate through keys. we ignore numeral indices. if we get a
  44. # slice, we check for an h5s.UNLIMITED value as the stop
  45. # if we get an ellipsis, we offset index by (rank - nargs)
  46. for i, sl in enumerate(key):
  47. if isinstance(sl, slice):
  48. if sl.stop == h5s.UNLIMITED:
  49. counts = list(count)
  50. idx = i + idx_offset
  51. counts[idx] = h5s.UNLIMITED
  52. count = tuple(counts)
  53. elif sl is Ellipsis:
  54. idx_offset = rank - nargs
  55. space.select_hyperslab(start, count, stride, block)
  56. class VirtualSource(object):
  57. """Source definition for virtual data sets.
  58. Instantiate this class to represent an entire source dataset, and then
  59. slice it to indicate which regions should be used in the virtual dataset.
  60. path_or_dataset
  61. The path to a file, or an h5py dataset. If a dataset is given,
  62. no other parameters are allowed, as the relevant values are taken from
  63. the dataset instead.
  64. name
  65. The name of the source dataset within the file.
  66. shape
  67. A tuple giving the shape of the dataset.
  68. dtype
  69. Numpy dtype or string.
  70. maxshape
  71. The source dataset is resizable up to this shape. Use None for
  72. axes you want to be unlimited.
  73. """
  74. def __init__(self, path_or_dataset, name=None,
  75. shape=None, dtype=None, maxshape=None):
  76. from .dataset import Dataset
  77. if isinstance(path_or_dataset, Dataset):
  78. failed = {k: v
  79. for k, v in
  80. {'name': name, 'shape': shape,
  81. 'dtype': dtype, 'maxshape': maxshape}.items()
  82. if v is not None}
  83. if failed:
  84. raise TypeError("If a Dataset is passed as the first argument "
  85. "then no other arguments may be passed. You "
  86. "passed {failed}".format(failed=failed))
  87. ds = path_or_dataset
  88. path = ds.file.filename
  89. name = ds.name
  90. shape = ds.shape
  91. dtype = ds.dtype
  92. maxshape = ds.maxshape
  93. else:
  94. path = path_or_dataset
  95. if name is None:
  96. raise TypeError("The name parameter is required when "
  97. "specifying a source by path")
  98. if shape is None:
  99. raise TypeError("The shape parameter is required when "
  100. "specifying a source by path")
  101. elif isinstance(shape, int):
  102. shape = (shape,)
  103. if isinstance(maxshape, int):
  104. maxshape = (maxshape,)
  105. self.path = path
  106. self.name = name
  107. self.dtype = dtype
  108. if maxshape is None:
  109. self.maxshape = shape
  110. else:
  111. self.maxshape = tuple([h5s.UNLIMITED if ix is None else ix
  112. for ix in maxshape])
  113. self.sel = SimpleSelection(shape)
  114. @property
  115. def shape(self):
  116. return self.sel.array_shape
  117. def __getitem__(self, key):
  118. tmp = copy(self)
  119. tmp.sel = select(self.shape, key, dataset=None)
  120. _convert_space_for_key(tmp.sel.id, key)
  121. return tmp
  122. class VirtualLayout(object):
  123. """Object for building a virtual dataset.
  124. Instantiate this class to define a virtual dataset, assign to slices of it
  125. (using VirtualSource objects), and then pass it to
  126. group.create_virtual_dataset() to add the virtual dataset to a file.
  127. This class does not allow access to the data; the virtual dataset must
  128. be created in a file before it can be used.
  129. shape
  130. A tuple giving the shape of the dataset.
  131. dtype
  132. Numpy dtype or string.
  133. maxshape
  134. The virtual dataset is resizable up to this shape. Use None for
  135. axes you want to be unlimited.
  136. filename
  137. The name of the destination file, if known in advance. Mappings from
  138. data in the same file will be stored with filename '.', allowing the
  139. file to be renamed later.
  140. """
  141. def __init__(self, shape, dtype, maxshape=None, filename=None):
  142. self.shape = (shape,) if isinstance(shape, int) else shape
  143. self.dtype = dtype
  144. self.maxshape = (maxshape,) if isinstance(maxshape, int) else maxshape
  145. self._filename = filename
  146. self._src_filenames = set()
  147. self.dcpl = h5p.create(h5p.DATASET_CREATE)
  148. def __setitem__(self, key, source):
  149. sel = select(self.shape, key, dataset=None)
  150. _convert_space_for_key(sel.id, key)
  151. src_filename = self._source_file_name(source.path, self._filename)
  152. self.dcpl.set_virtual(
  153. sel.id, src_filename, source.name.encode('utf-8'), source.sel.id
  154. )
  155. if self._filename is None:
  156. self._src_filenames.add(src_filename)
  157. @staticmethod
  158. def _source_file_name(src_filename, dst_filename) -> bytes:
  159. src_filename = filename_encode(src_filename)
  160. if dst_filename and (src_filename == filename_encode(dst_filename)):
  161. # use relative path if the source dataset is in the same
  162. # file, in order to keep the virtual dataset valid in case
  163. # the file is renamed.
  164. return b'.'
  165. return filename_encode(src_filename)
  166. def _get_dcpl(self, dst_filename):
  167. """Get the property list containing virtual dataset mappings
  168. If the destination filename wasn't known when the VirtualLayout was
  169. created, it is handled here.
  170. """
  171. dst_filename = filename_encode(dst_filename)
  172. if self._filename is not None:
  173. # filename was known in advance; check dst_filename matches
  174. if dst_filename != filename_encode(self._filename):
  175. raise Exception(f"{dst_filename!r} != {self._filename!r}")
  176. return self.dcpl
  177. # destination file not known in advance
  178. if dst_filename in self._src_filenames:
  179. # At least 1 source file is the same as the destination file,
  180. # but we didn't know this when making the mapping. Copy the mappings
  181. # to a new property list, replacing the dest filename with '.'
  182. new_dcpl = h5p.create(h5p.DATASET_CREATE)
  183. for i in range(self.dcpl.get_virtual_count()):
  184. src_filename = self.dcpl.get_virtual_filename(i)
  185. new_dcpl.set_virtual(
  186. self.dcpl.get_virtual_vspace(i),
  187. self._source_file_name(src_filename, dst_filename),
  188. self.dcpl.get_virtual_dsetname(i).encode('utf-8'),
  189. self.dcpl.get_virtual_srcspace(i),
  190. )
  191. return new_dcpl
  192. else:
  193. return self.dcpl # Mappings are all from other files
  194. def make_dataset(self, parent, name, fillvalue=None):
  195. """ Return a new low-level dataset identifier for a virtual dataset """
  196. dcpl = self._get_dcpl(parent.file.filename)
  197. if fillvalue is not None:
  198. dcpl.set_fill_value(np.array([fillvalue]))
  199. maxshape = self.maxshape
  200. if maxshape is not None:
  201. maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)
  202. virt_dspace = h5s.create_simple(self.shape, maxshape)
  203. if isinstance(self.dtype, Datatype):
  204. # Named types are used as-is
  205. tid = self.dtype.id
  206. else:
  207. dtype = np.dtype(self.dtype)
  208. tid = h5t.py_create(dtype, logical=1)
  209. return h5d.create(parent.id, name=name, tid=tid, space=virt_dspace,
  210. dcpl=dcpl)