-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Compatibility with Zarr v3b2 #9795
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
e18e650
Compatibility with Zarr v3b2
dcherian c0af9ee
More guards with mode="w"
dcherian a990970
refactoring
dcherian c07ed88
tweak expected requestsC
dcherian aa522e7
Merge branch 'main' into zarr-v3-b2
dcherian 4563efe
compat
dcherian ca30cd8
more compat
dcherian f2e91e9
fix
dcherian 12f577a
Merge branch 'main' into zarr-v3-b2
dcherian File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ | |
from xarray.namedarray.utils import module_available | ||
|
||
if TYPE_CHECKING: | ||
from zarr import Array as ZarrArray | ||
from zarr import Group as ZarrGroup | ||
|
||
from xarray.backends.common import AbstractDataStore | ||
|
@@ -443,7 +444,7 @@ def extract_zarr_variable_encoding( | |
shape = shape if shape else variable.shape | ||
encoding = variable.encoding.copy() | ||
|
||
safe_to_drop = {"source", "original_shape"} | ||
safe_to_drop = {"source", "original_shape", "preferred_chunks"} | ||
valid_encodings = { | ||
"codecs", | ||
"chunks", | ||
|
@@ -871,16 +872,27 @@ def store( | |
else: | ||
zarr = attempt_import("zarr") | ||
|
||
existing_keys = tuple(self.zarr_group.array_keys()) | ||
if self._mode == "w": | ||
# always overwrite, so we don't care about existing names, | ||
# and consistency of encoding | ||
new_variable_names = set(variables) | ||
existing_keys = {} | ||
existing_variable_names = {} | ||
else: | ||
existing_keys = tuple(self.zarr_group.array_keys()) | ||
existing_variable_names = { | ||
vn for vn in variables if _encode_variable_name(vn) in existing_keys | ||
} | ||
new_variable_names = set(variables) - existing_variable_names | ||
|
||
if self._mode == "r+": | ||
new_names = [k for k in variables if k not in existing_keys] | ||
if new_names: | ||
raise ValueError( | ||
f"dataset contains non-pre-existing variables {new_names}, " | ||
"which is not allowed in ``xarray.Dataset.to_zarr()`` with " | ||
"``mode='r+'``. To allow writing new variables, set ``mode='a'``." | ||
) | ||
if self._mode == "r+" and ( | ||
new_names := [k for k in variables if k not in existing_keys] | ||
): | ||
raise ValueError( | ||
f"dataset contains non-pre-existing variables {new_names!r}, " | ||
"which is not allowed in ``xarray.Dataset.to_zarr()`` with " | ||
"``mode='r+'``. To allow writing new variables, set ``mode='a'``." | ||
) | ||
|
||
if self._append_dim is not None and self._append_dim not in existing_keys: | ||
# For dimensions without coordinate values, we must parse | ||
|
@@ -895,10 +907,6 @@ def store( | |
f"dataset dimensions {existing_dims}" | ||
) | ||
|
||
existing_variable_names = { | ||
vn for vn in variables if _encode_variable_name(vn) in existing_keys | ||
} | ||
new_variable_names = set(variables) - existing_variable_names | ||
variables_encoded, attributes = self.encode( | ||
{vn: variables[vn] for vn in new_variable_names}, attributes | ||
) | ||
|
@@ -920,10 +928,9 @@ def store( | |
# Modified variables must use the same encoding as the store. | ||
vars_with_encoding = {} | ||
for vn in existing_variable_names: | ||
if self._mode in ["a", "a-", "r+"]: | ||
_validate_datatypes_for_zarr_append( | ||
vn, existing_vars[vn], variables[vn] | ||
) | ||
_validate_datatypes_for_zarr_append( | ||
vn, existing_vars[vn], variables[vn] | ||
) | ||
vars_with_encoding[vn] = variables[vn].copy(deep=False) | ||
vars_with_encoding[vn].encoding = existing_vars[vn].encoding | ||
vars_with_encoding, _ = self.encode(vars_with_encoding, {}) | ||
|
@@ -968,6 +975,69 @@ def store( | |
def sync(self): | ||
pass | ||
|
||
def _open_existing_array(self, *, name) -> ZarrArray: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just moved it around so |
||
import zarr | ||
|
||
# TODO: if mode="a", consider overriding the existing variable | ||
# metadata. This would need some case work properly with region | ||
# and append_dim. | ||
if self._write_empty is not None: | ||
# Write to zarr_group.chunk_store instead of zarr_group.store | ||
# See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation | ||
# The open_consolidated() enforces a mode of r or r+ | ||
# (and to_zarr with region provided enforces a read mode of r+), | ||
# and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore | ||
# and a 'normal Store subtype for chunk_store. | ||
# The exact type depends on if a local path was used, or a URL of some sort, | ||
# but the point is that it's not a read-only ConsolidatedMetadataStore. | ||
# It is safe to write chunk data to the chunk_store because no metadata would be changed by | ||
# to_zarr with the region parameter: | ||
# - Because the write mode is enforced to be r+, no new variables can be added to the store | ||
# (this is also checked and enforced in xarray.backends.api.py::to_zarr()). | ||
# - Existing variables already have their attrs included in the consolidated metadata file. | ||
# - The size of dimensions can not be expanded, that would require a call using `append_dim` | ||
# which is mutually exclusive with `region` | ||
zarr_array = zarr.open( | ||
store=( | ||
self.zarr_group.store if _zarr_v3() else self.zarr_group.chunk_store | ||
), | ||
# TODO: see if zarr should normalize these strings. | ||
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip("/"), | ||
write_empty_chunks=self._write_empty, | ||
) | ||
else: | ||
zarr_array = self.zarr_group[name] | ||
|
||
return zarr_array | ||
|
||
def _create_new_array( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just moved it around so |
||
self, *, name, shape, dtype, fill_value, encoding, attrs | ||
) -> ZarrArray: | ||
if coding.strings.check_vlen_dtype(dtype) is str: | ||
dtype = str | ||
|
||
if self._write_empty is not None: | ||
if ( | ||
"write_empty_chunks" in encoding | ||
and encoding["write_empty_chunks"] != self._write_empty | ||
): | ||
raise ValueError( | ||
'Differing "write_empty_chunks" values in encoding and parameters' | ||
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }' | ||
) | ||
else: | ||
encoding["write_empty_chunks"] = self._write_empty | ||
|
||
zarr_array = self.zarr_group.create( | ||
name, | ||
shape=shape, | ||
dtype=dtype, | ||
fill_value=fill_value, | ||
**encoding, | ||
) | ||
zarr_array = _put_attrs(zarr_array, attrs) | ||
return zarr_array | ||
|
||
def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None): | ||
""" | ||
This provides a centralized method to set the variables on the data | ||
|
@@ -986,8 +1056,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
dimensions. | ||
""" | ||
|
||
import zarr | ||
|
||
existing_keys = tuple(self.zarr_group.array_keys()) | ||
is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3 | ||
|
||
|
@@ -1016,47 +1084,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
else: | ||
del v.encoding["_FillValue"] | ||
|
||
zarr_array = None | ||
zarr_shape = None | ||
write_region = self._write_region if self._write_region is not None else {} | ||
write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} | ||
|
||
if name in existing_keys: | ||
if self._mode != "w" and name in existing_keys: | ||
# existing variable | ||
# TODO: if mode="a", consider overriding the existing variable | ||
# metadata. This would need some case work properly with region | ||
# and append_dim. | ||
if self._write_empty is not None: | ||
# Write to zarr_group.chunk_store instead of zarr_group.store | ||
# See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation | ||
# The open_consolidated() enforces a mode of r or r+ | ||
# (and to_zarr with region provided enforces a read mode of r+), | ||
# and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore | ||
# and a 'normal Store subtype for chunk_store. | ||
# The exact type depends on if a local path was used, or a URL of some sort, | ||
# but the point is that it's not a read-only ConsolidatedMetadataStore. | ||
# It is safe to write chunk data to the chunk_store because no metadata would be changed by | ||
# to_zarr with the region parameter: | ||
# - Because the write mode is enforced to be r+, no new variables can be added to the store | ||
# (this is also checked and enforced in xarray.backends.api.py::to_zarr()). | ||
# - Existing variables already have their attrs included in the consolidated metadata file. | ||
# - The size of dimensions can not be expanded, that would require a call using `append_dim` | ||
# which is mutually exclusive with `region` | ||
zarr_array = zarr.open( | ||
store=( | ||
self.zarr_group.store | ||
if _zarr_v3() | ||
else self.zarr_group.chunk_store | ||
), | ||
# TODO: see if zarr should normalize these strings. | ||
path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip( | ||
"/" | ||
), | ||
write_empty_chunks=self._write_empty, | ||
) | ||
else: | ||
zarr_array = self.zarr_group[name] | ||
|
||
zarr_array = self._open_existing_array(name=name) | ||
if self._append_dim is not None and self._append_dim in dims: | ||
# resize existing variable | ||
append_axis = dims.index(self._append_dim) | ||
|
@@ -1089,40 +1123,27 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
shape=zarr_shape, | ||
) | ||
|
||
if name not in existing_keys: | ||
if self._mode == "w" or name not in existing_keys: | ||
# new variable | ||
encoded_attrs = {} | ||
encoded_attrs = {k: self.encode_attribute(v) for k, v in attrs.items()} | ||
# the magic for storing the hidden dimension data | ||
if is_zarr_v3_format: | ||
encoding["dimension_names"] = dims | ||
else: | ||
encoded_attrs[DIMENSION_KEY] = dims | ||
for k2, v2 in attrs.items(): | ||
encoded_attrs[k2] = self.encode_attribute(v2) | ||
|
||
if coding.strings.check_vlen_dtype(dtype) is str: | ||
dtype = str | ||
|
||
if self._write_empty is not None: | ||
if ( | ||
"write_empty_chunks" in encoding | ||
and encoding["write_empty_chunks"] != self._write_empty | ||
): | ||
raise ValueError( | ||
'Differing "write_empty_chunks" values in encoding and parameters' | ||
f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }' | ||
) | ||
else: | ||
encoding["write_empty_chunks"] = self._write_empty | ||
|
||
zarr_array = self.zarr_group.create( | ||
name, | ||
shape=shape, | ||
|
||
encoding["exists_ok" if _zarr_v3() else "overwrite"] = ( | ||
True if self._mode == "w" else False | ||
) | ||
|
||
zarr_array = self._create_new_array( | ||
name=name, | ||
dtype=dtype, | ||
shape=shape, | ||
fill_value=fill_value, | ||
**encoding, | ||
encoding=encoding, | ||
attrs=encoded_attrs, | ||
) | ||
zarr_array = _put_attrs(zarr_array, encoded_attrs) | ||
|
||
writer.add(v.data, zarr_array, region) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
logic change. We now skip all these checks for mode="w". i think earlier, zarr would erase the group so all the logic below was not activated. Now that zarr does not erase the group, we must skip all the logic below.