Skip to content

Commit df8ce23

Browse files
committed
updates for resizable datasets
1 parent 43133da commit df8ce23

File tree

11 files changed

+438
-174
lines changed

11 files changed

+438
-174
lines changed

src/h5json/array_util.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import binascii
1616
import numpy as np
1717

18-
from .hdf5dtype import isVlen
18+
from .hdf5dtype import isVlen, is_float16_dtype, guess_dtype
1919

2020
MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million
2121

@@ -474,6 +474,33 @@ def arrayToBytes(arr, encoding=None):
474474
return data
475475

476476

477+
def array_for_new_object(data, specified_dtype=None):
    """Prepare a C-ordered ndarray from *data* for a new dataset or attribute.

    HDF5 is mostly left to convert data as necessary when it is written,
    so conversion here is kept to the minimum needed for correctness.

    Parameters:
        data: array-like source values (ndarray, list, scalar, ...)
        specified_dtype: dtype the new object will use, or None

    Returns:
        numpy.ndarray ready to be stored in the new object
    """
    if is_float16_dtype(specified_dtype):
        # Going to float16: pre-convert in Python to work around a bug
        # in the HDF5-side conversion.
        # https://github.com/h5py/h5py/issues/819
        target_dtype = specified_dtype
    elif specified_dtype is not None and not isinstance(data, np.ndarray):
        # Converting e.g. a list to an array: use the dtype we already
        # know rather than letting numpy guess one.
        target_dtype = specified_dtype
    else:
        target_dtype = guess_dtype(data)

    arr = np.asarray(data, order="C", dtype=target_dtype)

    # Usually a no-op.  But if data was already an ndarray and
    # target_dtype is a tagged h5py dtype (e.g. for an object array of
    # strings), asarray() keeps the old dtype object; re-view the array
    # so it carries the tagged dtype.
    if target_dtype is not None:
        arr = arr.view(dtype=target_dtype)

    return arr
503+
477504
def bytesToArray(data, dt, shape, encoding=None):
478505
"""
479506
Create numpy array based on byte representation

src/h5json/dset_util.py

Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from .objid import isValidUuid
2020

2121
CHUNK_MIN = 512 * 1024 # Soft lower limit (512k)
22-
CHUNK_MAX = 8096 * 1024 # Hard upper limit (2M)
22+
CHUNK_MAX = 8096 * 1024 # Hard upper limit (8M)
2323

2424

2525
LAYOUT_CLASSES = (
@@ -87,30 +87,36 @@ def estimateDatasetSize(shape_json, item_size, chunk_min=CHUNK_MIN):
8787

8888
def resize_dataset(dset_json, shape):
    """Update shape dims to the given shape, provided the new shape is valid.

    Mutates dset_json["shape"]["dims"] in place and returns None.

    Raises:
        TypeError: dataset is not chunked, or its shape class is not
            H5S_SIMPLE
        ValueError: rank mismatch, dataset not resizable, a negative
            extent, or an extent larger than a finite maxdim
    """
    if getDatasetLayoutClass(dset_json) != "H5D_CHUNKED":
        raise TypeError("Only chunked datasets can be resized")
    shape_class = getShapeClass(dset_json)
    if shape_class != "H5S_SIMPLE":
        raise TypeError(f"dataset with shape class: {shape_class} cannot be resized")
    dims = getShapeDims(dset_json)
    if len(shape) != len(dims):
        raise ValueError("Resize shape parameter doesn't match dataset's rank")
    if not isExtensible(dset_json):
        raise ValueError("Dataset is not resizable")
    maxdims = getMaxDims(dset_json)

    # NOTE(review): this compares dims against tuple(shape) — assumes
    # getShapeDims() returns a tuple; if it returns a list the early-out
    # never fires.  TODO confirm against getShapeDims.
    if dims == tuple(shape):
        # no change requested, nothing to do
        return None

    for i in range(getRank(dset_json)):
        extent = shape[i]
        if extent < 0:
            raise ValueError("dimensions can't be negative")
        if maxdims[i] in (0, "H5S_UNLIMITED"):
            # unlimited dimension: any non-negative extent is ok
            continue
        if extent > maxdims[i]:
            raise ValueError(f"extent for dimension {i} can't be larger than {maxdims[i]}")

    # validation passed -- record the new dimensions in the object json
    dset_json["shape"]["dims"] = list(shape)
115121

116122

@@ -185,12 +191,12 @@ def getChunkSize(chunk_dims, type_size: int = 1):
185191
def getChunkDims(dset_json):
186192
"""Get chunk layout. Return shape dims for non-chunked layout"""
187193

188-
shape_json = dset_json["shape"]
189-
if shape_json["class"] == "H5S_NULL":
194+
shape_class = getShapeClass(dset_json)
195+
if shape_class == "H5S_NULL":
190196
return None
191-
if shape_json["class"] == "H5S_SCALAR":
197+
if shape_class == "H5S_SCALAR":
192198
return (1, )
193-
shape_dims = shape_json["dims"]
199+
shape_dims = getShapeDims(dset_json)
194200
layout_class = getDatasetLayoutClass(dset_json)
195201
if not layout_class:
196202
return tuple(shape_dims)
@@ -207,7 +213,7 @@ def getChunkDims(dset_json):
207213
return chunk_dims
208214

209215

210-
def validateChunkLayout(shape_json, type_json, layout):
216+
def validateLayout(shape_json, type_json, layout):
211217
"""
212218
Use chunk layout given in the creationPropertiesList (if defined and
213219
layout is valid).
@@ -218,6 +224,7 @@ def validateChunkLayout(shape_json, type_json, layout):
218224
space_dims = None
219225
chunk_dims = None
220226
max_dims = None
227+
221228
item_size = getItemSize(type_json)
222229

223230
if "dims" in shape_json:
@@ -250,7 +257,7 @@ def validateChunkLayout(shape_json, type_json, layout):
250257
if chunk_extent > dim_extent:
251258
msg = "Invalid layout value"
252259
raise ValueError(msg)
253-
elif max_dims[i] != 0:
260+
elif max_dims[i] not in (0, "H5S_UNLIMITED"):
254261
if chunk_extent > max_dims[i]:
255262
msg = "Invalid layout value for extensible dimension"
256263
raise ValueError(msg)
@@ -404,7 +411,7 @@ def validateDatasetCreationProps(creation_props, type_json=None, shape=None):
404411
layout_class = None
405412
if "layout" in creation_props:
406413
layout_json = creation_props["layout"]
407-
validateChunkLayout(shape, type_json, layout_json)
414+
validateLayout(shape, type_json, layout_json)
408415
layout_class = layout_json["class"]
409416

410417
if "filters" in creation_props:
@@ -436,7 +443,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN):
436443
if "maxdims" in shape_json:
437444
maxdims = shape_json["maxdims"]
438445
for n in range(rank):
439-
if maxdims[n] == 0 or maxdims[n] > dims[n]:
446+
if maxdims[n] in (0, "H5S_UNLIMITED") or maxdims[n] > dims[n]:
440447
extendable_dims += 1
441448

442449
dset_size = getDataSize(shape_json, typesize)
@@ -454,7 +461,7 @@ def expandChunk(layout, typesize, shape_json, chunk_min=CHUNK_MIN):
454461
dim = rank - n - 1 # start from last dim
455462

456463
if extendable_dims > 0:
457-
if maxdims[dim] == 0:
464+
if maxdims[dim] in (0, "H5S_UNLIMITED"):
458465
# infinitely extendable dimensions
459466
layout[dim] *= 2
460467
chunk_size = getChunkSize(layout, typesize)
@@ -553,7 +560,7 @@ def guessChunk(shape, typesize, chunk_min=None, chunk_max=None):
553560
typesize = 128 # just take a guess at the item size
554561

555562
# For unlimited dimensions we have to guess. use 1024
556-
shape = tuple((x if x != 0 else 1024) for i, x in enumerate(shape))
563+
shape = tuple((x if x not in (0, "H5S_UNLIMITED") else 1024) for i, x in enumerate(shape))
557564

558565
chunk_size = getChunkSize(shape, typesize)
559566
if chunk_min and chunk_size < chunk_min:
@@ -568,7 +575,7 @@ def guessChunk(shape, typesize, chunk_min=None, chunk_max=None):
568575

569576
def generateLayout(
570577
shape_json,
571-
item_size=0,
578+
type_json,
572579
chunks=None,
573580
chunk_min=CHUNK_MIN,
574581
chunk_max=CHUNK_MAX,
@@ -577,6 +584,9 @@ def generateLayout(
577584

578585
""" Create a dataset layout based on type and shape properties """
579586

587+
item_size = getItemSize(type_json)
588+
if item_size == "H5T_VARIABLE":
589+
item_size = 128 # take a guess
580590
if item_size < 0:
581591
raise ValueError("item_size is invalid")
582592

@@ -612,6 +622,13 @@ def generateLayout(
612622
chunk_dims = chunks
613623
if len(chunk_dims) != rank:
614624
raise ValueError("given chunk dims do not agree with dataset rank")
625+
for dim in range(rank):
626+
if max_dims[dim] in (0, "H5S_UNLIMITED"):
627+
pass # unlimited, so any chunk extent is ok
628+
elif chunk_dims[dim] > max_dims[dim]:
629+
msg = "Chunk shape must not be greater than data shape in any dimension. "
630+
msg += f"{chunk_dims} is not compatible with {max_dims}"
631+
raise ValueError(msg)
615632
else:
616633
pass # otherwise we'll guess a chunk shape below
617634
if not chunk_dims:
@@ -646,12 +663,14 @@ def generateLayout(
646663
layout["partition_count"] = partition_count
647664
else:
648665
pass # partition not needed
666+
667+
validateLayout(shape_json, type_json, layout)
649668
return layout
650669

651670

652671
def generate_dcpl(
653672
shape_json,
654-
dtype,
673+
type_json,
655674
chunks=None,
656675
filters=[],
657676
chunk_min=CHUNK_MIN,
@@ -678,12 +697,12 @@ def generate_dcpl(
678697

679698
# End argument validation
680699

681-
kwargs = {"item_size": dtype.itemsize, "has_filter": filters}
700+
kwargs = {"has_filter": filters}
682701
kwargs["chunks"] = chunks
683702
kwargs["chunk_min"] = chunk_min
684703
kwargs["chunk_max"] = chunk_max
685704
kwargs["max_chunks_per_folder"] = max_chunks_per_folder
686-
plist["layout"] = generateLayout(shape_json, **kwargs)
705+
plist["layout"] = generateLayout(shape_json, type_json, **kwargs)
687706

688707
if len(filters) > 0:
689708
plist["filters"] = filters
@@ -697,3 +716,16 @@ def generate_dcpl(
697716
plist["initializer"] = initializer
698717

699718
return plist
719+
720+
721+
def getFillValue(obj_json):
    """Return the fill value from a dataset json or cpl, or None if not set.

    Parameters:
        obj_json: either a full object json (with a "creationProperties"
            key) or a creation-property-list dict itself

    Returns:
        the "fillValue" entry of the cpl, or None when absent
    """
    if "creationProperties" in obj_json:
        cpl = obj_json["creationProperties"]
    else:
        cpl = obj_json  # assume we've been passed a cpl directly
    # BUG FIX: the lookup key was misspelled "filLValue", so a set
    # fill value was never returned.
    return cpl.get("fillValue")

src/h5json/filters.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
DEFAULT_GZIP = 4
1818
DEFAULT_SZIP = 4
19+
DEFAULT_LZ4 = 1
1920
SO_INT_MINBITS_DEFAULT = 0
2021

2122
# List of registered filters. Not all are supported by every reader and writer.
@@ -97,7 +98,7 @@ def getFilterItem(name, options={}):
9798
filter_json = None
9899

99100
if isinstance(name, dict):
100-
filter_json = name
101+
filter_json = name.copy()
101102
base_keys = ("class", "id", "name")
102103
for key in base_keys:
103104
if key not in filter_json:

0 commit comments

Comments
 (0)