From f9df44cf519abaeba737b5a1cd1c23301f39c21d Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 22 Jul 2024 09:06:17 -0700 Subject: [PATCH] Refactor sharded serialization to remove code duplication Similar to https://github.com/sigstore/model-transparency/pull/241, there is a duplication in the directory traversal between serializing to a digest and serializing to a manifest. This time, both supported parallelism, so there is really no need for the duplication. We make an abstract `ShardedFilesSerializer` class to contain the logic for the directory traversal and then create the better named `DigestSerializer` and `ManifestSerializer` for the two serializing classes. This time, instead of trying extremely hard to match the old behavior for digest serialization, we just update the goldens. This means that this depends on https://github.com/sigstore/model-transparency/pull/244. We still had to update some other tests: since the hashes are computed only for files, we no longer differentiate between a model with an empty directory and a model where that empty directory is completely removed. This is a corner case and it is ok to do this. In fact, ignoring empty directories is part of the optimization hinted at in https://github.com/sigstore/model-transparency/issues/197. 
Signed-off-by: Mihai Maruseac --- .../serialization/serialize_by_file_shard.py | 266 +++++++----------- .../serialize_by_file_shard_test.py | 84 +++--- .../deep_model_folder | 2 +- .../deep_model_folder_small_shards | 2 +- .../TestShardedDFSSerializer/empty_model_file | 2 +- .../empty_model_file_small_shards | 2 +- .../model_folder_with_empty_file | 2 +- .../model_folder_with_empty_file_small_shards | 2 +- .../sample_model_file | 2 +- .../sample_model_file_small_shards | 2 +- .../sample_model_folder | 2 +- .../sample_model_folder_small_shards | 2 +- 12 files changed, 148 insertions(+), 222 deletions(-) diff --git a/model_signing/serialization/serialize_by_file_shard.py b/model_signing/serialization/serialize_by_file_shard.py index 59aa07e8..4ebb318f 100644 --- a/model_signing/serialization/serialize_by_file_shard.py +++ b/model_signing/serialization/serialize_by_file_shard.py @@ -14,10 +14,11 @@ """Model serializers that operated at file shard level of granularity.""" +import abc import base64 import concurrent.futures import pathlib -from typing import Callable, Iterable, TypeAlias +from typing import Callable, Iterable, TypeAlias, cast from typing_extensions import override from model_signing.hashing import file @@ -27,21 +28,16 @@ from model_signing.serialization import serialize_by_file -_ShardSignTask: TypeAlias = tuple[pathlib.PurePath, str, int, int] - - def _build_header( *, - entry_name: str, - entry_type: str, + name: str, start: int, end: int, ) -> bytes: - """Builds a header to encode a path with given name and type. + """Builds a header to encode a path with given name and shard range. Args: entry_name: The name of the entry to build the header for. - entry_type: The type of the entry (file or directory). start: Offset for the start of the path shard. end: Offset for the end of the path shard. @@ -50,14 +46,11 @@ def _build_header( bytes. 
Each argument is separated by dots and the last byte is also a dot (so the file digest can be appended unambiguously). """ - # Note: This will get replaced in subsequent change, right now we're just - # moving existing code around. - encoded_type = entry_type.encode("utf-8") # Prevent confusion if name has a "." inside by encoding to base64. - encoded_name = base64.b64encode(entry_name.encode("utf-8")) + encoded_name = base64.b64encode(name.encode("utf-8")) encoded_range = f"{start}-{end}".encode("utf-8") # Note: empty string at the end, to terminate header with a "." - return b".".join([encoded_type, encoded_name, encoded_range, b""]) + return b".".join([encoded_name, encoded_range, b""]) def _endpoints(step: int, end: int) -> Iterable[int]: @@ -83,164 +76,15 @@ def _endpoints(step: int, end: int) -> Iterable[int]: yield end -class ShardedDFSSerializer(serialization.Serializer): - """DFSSerializer that uses a sharded hash engine to exploit parallelism.""" - - def __init__( - self, - file_hasher_factory: Callable[ - [pathlib.Path, int, int], file.ShardedFileHasher - ], - merge_hasher: hashing.StreamingHashEngine, - max_workers: int | None = None, - ): - """Initializes an instance to serialize a model with this serializer. - - Args: - hasher_factory: A callable to build the hash engine used to hash - every shard of the files in the model. Because each shard is - processed in parallel, every thread needs to call the factory to - start hashing. The arguments are the file, and the endpoints of - the shard. - merge_hasher: A `hashing.StreamingHashEngine` instance used to merge - individual file digests to compute an aggregate digest. - max_workers: Maximum number of workers to use in parallel. Default - is to defer to the `concurent.futures` library. - """ - self._file_hasher_factory = file_hasher_factory - self._merge_hasher = merge_hasher - self._max_workers = max_workers - - # Precompute some private values only once by using a mock file hasher. 
- # None of the arguments used to build the hasher are used. - hasher = file_hasher_factory(pathlib.Path(), 0, 1) - self._shard_size = hasher.shard_size - - @override - def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: - # Note: This function currently uses `pathlib.Path.glob` so the DFS - # expansion relies on the `glob` implementation performing a DFS. We - # will be truthful again when switching to `pathlib.Path.walk`, after - # Python 3.12 is the minimum version we support. - - # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks - # to exclude symlinks if desired. - serialize_by_file.check_file_or_directory(model_path) - - if model_path.is_file(): - entries = [model_path] - else: - # TODO: github.com/sigstore/model-transparency/issues/200 - When - # Python3.12 is the minimum supported version, this can be replaced - # with `pathlib.Path.walk` for a clearer interface, and some speed - # improvement. - entries = sorted(model_path.glob("**/*")) - - tasks = self._convert_paths_to_tasks(entries, model_path) - - digest_len = self._merge_hasher.digest_size - digests_buffer = bytearray(len(tasks) * digest_len) - - with concurrent.futures.ThreadPoolExecutor( - max_workers=self._max_workers - ) as tpe: - futures_dict = { - tpe.submit(self._perform_hash_task, model_path, task): i - for i, task in enumerate(tasks) - } - for future in concurrent.futures.as_completed(futures_dict): - i = futures_dict[future] - task_digest = future.result() - - task_path, task_type, task_start, task_end = tasks[i] - header = _build_header( - entry_name=task_path.name, - entry_type=task_type, - start=task_start, - end=task_end, - ) - self._merge_hasher.reset(header) - self._merge_hasher.update(task_digest) - digest = self._merge_hasher.compute().digest_value - - start = i * digest_len - end = start + digest_len - digests_buffer[start:end] = digest - - self._merge_hasher.reset(digests_buffer) - return 
manifest.DigestManifest(self._merge_hasher.compute()) - - def _convert_paths_to_tasks( - self, paths: Iterable[pathlib.Path], root_path: pathlib.Path - ) -> list[_ShardSignTask]: - """Returns the tasks that would hash shards of files in parallel. - - Every file in `paths` is replaced by a set of tasks. Each task computes - the digest over a shard of the file. Directories result in a single - task, just to compute a digest over a header. - - To differentiate between (empty) files and directories with the same - name, every task needs to also include a header. The header needs to - include relative path to the model root, as we want to obtain the same - digest if the model is moved. - - We don't construct an enum for the type of the entry, because these will - never escape this class. - - Note that the path component of the tasks is a `pathlib.PurePath`, so - operations on it cannot touch the filesystem. - """ - # TODO: github.com/sigstore/model-transparency/issues/196 - Add support - # for excluded files. - - tasks = [] - for path in paths: - serialize_by_file.check_file_or_directory(path) - relative_path = path.relative_to(root_path) - - if path.is_file(): - path_size = path.stat().st_size - start = 0 - for end in _endpoints(self._shard_size, path_size): - tasks.append((relative_path, "file", start, end)) - start = end - else: - tasks.append((relative_path, "dir", 0, 0)) - - return tasks - - def _perform_hash_task( - self, model_path: pathlib.Path, task: _ShardSignTask - ) -> bytes: - """Produces the hash of the file shard included in `task`.""" - task_path, task_type, task_start, task_end = task - - # TODO: github.com/sigstore/model-transparency/issues/197 - Directories - # don't need to use the file hasher. Rather than starting a process - # just for them, we should filter these ahead of time, and only use - # threading for file shards. For now, just return an empty result. 
- if task_type == "dir": - return b"" - - # TODO: github.com/sigstore/model-transparency/issues/197 - Similarly, - # empty files should be hashed outside of a parallel task, to not waste - # resources. - if task_start == task_end: - return b"" - - full_path = model_path.joinpath(task_path) - hasher = self._file_hasher_factory(full_path, task_start, task_end) - return hasher.compute().digest_value - - class ShardedFilesSerializer(serialization.Serializer): - """Model serializers that produces an itemized manifest, at shard level. + """Generic file shard serializer. Traverses the model directory and creates digests for every file found, sharding the file in equal shards and computing the digests in parallel. - Since the manifest lists each item individually, this will also enable - support for incremental updates (to be added later). + Subclasses can then create a manifest with these digests, either listing + them item by item, combining them into file digests, or combining all of + them into a single digest. """ def __init__( @@ -270,9 +114,7 @@ def __init__( self._shard_size = hasher.shard_size @override - def serialize( - self, model_path: pathlib.Path - ) -> manifest.ShardLevelManifest: + def serialize(self, model_path: pathlib.Path) -> manifest.Manifest: # TODO: github.com/sigstore/model-transparency/issues/196 - Add checks # to exclude symlinks if desired. serialize_by_file.check_file_or_directory(model_path) @@ -337,12 +179,96 @@ def _compute_hash( path=relative_path, digest=digest, start=start, end=end ) + @abc.abstractmethod def _build_manifest( self, items: Iterable[manifest.ShardedFileManifestItem] - ) -> manifest.ShardLevelManifest: + ) -> manifest.Manifest: """Builds an itemized manifest from a given list of items. Every subclass needs to implement this method to determine the format of the manifest. """ + pass + + +class ManifestSerializer(ShardedFilesSerializer): + """Model serializers that produces an itemized manifest, at shard level. 
+ + Since the manifest lists each item individually, this will also enable + support for incremental updates (to be added later). + """ + + @override + def serialize( + self, model_path: pathlib.Path + ) -> manifest.ShardLevelManifest: + """Serializes the model given by the `model_path` argument. + + The only reason for the override is to change the return type, to be + more restrictive. This is to signal that the only manifests that can be + returned are `manifest.ShardLevelManifest` instances. + """ + return cast(manifest.ShardLevelManifest, super().serialize(model_path)) + + @override + def _build_manifest( + self, items: Iterable[manifest.ShardedFileManifestItem] + ) -> manifest.ShardLevelManifest: return manifest.ShardLevelManifest(items) + + +class DigestSerializer(ShardedFilesSerializer): + """Serializer for a model that performs a traversal of the model directory. + + This serializer produces a single hash for the entire model. + """ + + def __init__( + self, + file_hasher_factory: Callable[ + [pathlib.Path, int, int], file.ShardedFileHasher + ], + merge_hasher: hashing.StreamingHashEngine, + max_workers: int | None = None, + ): + """Initializes an instance to serialize a model with this serializer. + + Args: + hasher_factory: A callable to build the hash engine used to hash + every shard of the files in the model. Because each shard is + processed in parallel, every thread needs to call the factory to + start hashing. The arguments are the file, and the endpoints of + the shard. + merge_hasher: A `hashing.StreamingHashEngine` instance used to merge + individual file shard digests to compute an aggregate digest. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library. 
+ """ + super().__init__(file_hasher_factory, max_workers) + self._merge_hasher = merge_hasher + + @override + def serialize(self, model_path: pathlib.Path) -> manifest.DigestManifest: + """Serializes the model given by the `model_path` argument. + + The only reason for the override is to change the return type, to be + more restrictive. This is to signal that the only manifests that can be + returned are `manifest.DigestManifest` instances. + """ + return cast(manifest.DigestManifest, super().serialize(model_path)) + + @override + def _build_manifest( + self, items: Iterable[manifest.ShardedFileManifestItem] + ) -> manifest.DigestManifest: + self._merge_hasher.reset() + + for item in sorted(items, key=lambda i: (i.path, i.start, i.end)): + header = _build_header( + name=item.path.name, start=item.start, end=item.end + ) + self._merge_hasher.update(header) + self._merge_hasher.update(item.digest.digest_value) + + digest = self._merge_hasher.compute() + return manifest.DigestManifest(digest) diff --git a/model_signing/serialization/serialize_by_file_shard_test.py b/model_signing/serialization/serialize_by_file_shard_test.py index eb394e3f..40a44d4c 100644 --- a/model_signing/serialization/serialize_by_file_shard_test.py +++ b/model_signing/serialization/serialize_by_file_shard_test.py @@ -65,7 +65,7 @@ def test_known_models(self, request, model_fixture_name): model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(model) @@ -101,7 +101,7 @@ def test_known_models_small_shards(self, request, model_fixture_name): model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory_small_shards, 
memory.SHA256() ) manifest = serializer.serialize(model) @@ -117,7 +117,7 @@ def test_known_models_small_shards(self, request, model_fixture_name): assert manifest.digest.digest_hex == expected_digest def test_file_hash_is_not_same_as_hash_of_content(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) @@ -127,7 +127,7 @@ def test_file_hash_is_not_same_as_hash_of_content(self, sample_model_file): assert manifest.digest.digest_hex != digest.digest_hex def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_file) @@ -141,7 +141,7 @@ def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): def test_file_model_hash_changes_if_content_changes( self, sample_model_file ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_file) @@ -153,7 +153,7 @@ def test_file_model_hash_changes_if_content_changes( assert manifest.digest.digest_value != new_manifest.digest.digest_value def test_directory_model_with_only_known_file(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest_file = serializer.serialize(sample_model_file) @@ -167,7 +167,7 @@ def test_directory_model_with_only_known_file(self, sample_model_file): def test_folder_model_hash_is_same_if_model_is_moved( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( 
self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -178,8 +178,8 @@ def test_folder_model_hash_is_same_if_model_is_moved( assert manifest == new_manifest - def test_folder_model_empty_folder_gets_included(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + def test_folder_model_empty_folder_not_included(self, sample_model_folder): + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -189,10 +189,10 @@ def test_folder_model_empty_folder_gets_included(self, sample_model_folder): new_empty_dir.mkdir() new_manifest = serializer.serialize(sample_model_folder) - assert manifest != new_manifest + assert manifest == new_manifest - def test_folder_model_empty_file_gets_included(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + def test_folder_model_empty_file_not_included(self, sample_model_folder): + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -202,10 +202,10 @@ def test_folder_model_empty_file_gets_included(self, sample_model_folder): new_empty_file.write_text("") new_manifest = serializer.serialize(sample_model_folder) - assert manifest != new_manifest + assert manifest == new_manifest def test_folder_model_rename_file(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -219,7 +219,7 @@ def test_folder_model_rename_file(self, sample_model_folder): assert manifest != new_manifest def test_folder_model_rename_dir(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( 
self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -232,7 +232,7 @@ def test_folder_model_rename_dir(self, sample_model_folder): assert manifest != new_manifest def test_folder_model_replace_file_empty_folder(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -246,7 +246,7 @@ def test_folder_model_replace_file_empty_folder(self, sample_model_folder): assert manifest != new_manifest def test_folder_model_change_file(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest = serializer.serialize(sample_model_folder) @@ -258,22 +258,22 @@ def test_folder_model_change_file(self, sample_model_folder): assert manifest != new_manifest - def test_empty_folder_hashes_differently_than_empty_file( + def test_empty_folder_hashes_same_as_empty_file( self, empty_model_file, empty_model_folder ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) folder_manifest = serializer.serialize(empty_model_folder) file_manifest = serializer.serialize(empty_model_file) - assert folder_manifest != file_manifest + assert folder_manifest == file_manifest - def test_model_with_empty_folder_hashes_differently_than_with_empty_file( + def test_model_with_empty_folder_hashes_same_as_with_empty_file( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedDFSSerializer( + serializer = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) @@ -289,15 +289,15 @@ def test_model_with_empty_folder_hashes_differently_than_with_empty_file( new_empty_file.write_text("") file_manifest = 
serializer.serialize(sample_model_folder) - assert folder_manifest != file_manifest + assert folder_manifest == file_manifest def test_max_workers_does_not_change_digest(self, sample_model_folder): - serializer1 = serialize_by_file_shard.ShardedDFSSerializer( + serializer1 = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest1 = serializer1.serialize(sample_model_folder) - serializer2 = serialize_by_file_shard.ShardedDFSSerializer( + serializer2 = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256(), max_workers=2 ) manifest2 = serializer2.serialize(sample_model_folder) @@ -305,12 +305,12 @@ def test_max_workers_does_not_change_digest(self, sample_model_folder): assert manifest1 == manifest2 def test_shard_size_changes_digests(self, sample_model_folder): - serializer1 = serialize_by_file_shard.ShardedDFSSerializer( + serializer1 = serialize_by_file_shard.DigestSerializer( self._hasher_factory, memory.SHA256() ) manifest1 = serializer1.serialize(sample_model_folder) - serializer2 = serialize_by_file_shard.ShardedDFSSerializer( + serializer2 = serialize_by_file_shard.DigestSerializer( self._hasher_factory_small_shards, memory.SHA256() ) manifest2 = serializer2.serialize(sample_model_folder) @@ -371,7 +371,7 @@ def test_known_models(self, request, model_fixture_name): model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(model) @@ -414,7 +414,7 @@ def test_known_models_small_shards(self, request, model_fixture_name): model = request.getfixturevalue(model_fixture_name) # Compute model manifest (act) - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory_small_shards ) manifest = serializer.serialize(model) @@ -437,7 
+437,7 @@ def test_known_models_small_shards(self, request, model_fixture_name): assert items == found_items def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_file) @@ -449,7 +449,7 @@ def test_file_manifest_unchanged_when_model_moved(self, sample_model_file): assert manifest == new_manifest def test_file_manifest_changes_if_content_changes(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_file) @@ -466,7 +466,7 @@ def test_file_manifest_changes_if_content_changes(self, sample_model_file): assert digests != new_digests def test_directory_model_with_only_known_file(self, sample_model_file): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest_file = serializer.serialize(sample_model_file) @@ -483,7 +483,7 @@ def test_directory_model_with_only_known_file(self, sample_model_file): def test_folder_model_hash_is_same_if_model_is_moved( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -495,7 +495,7 @@ def test_folder_model_hash_is_same_if_model_is_moved( assert manifest == new_manifest def test_folder_model_empty_folder_not_included(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -508,7 +508,7 @@ def 
test_folder_model_empty_folder_not_included(self, sample_model_folder): assert manifest == new_manifest def test_folder_model_empty_file_not_included(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -546,7 +546,7 @@ def _check_manifests_match_except_on_renamed_file( def test_folder_model_rename_file_only_changes_path_part( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -594,7 +594,7 @@ def _check_manifests_match_except_on_renamed_dir( def test_folder_model_rename_dir_only_changes_path_part( self, sample_model_folder ): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -610,7 +610,7 @@ def test_folder_model_rename_dir_only_changes_path_part( ) def test_folder_model_replace_file_empty_folder(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -649,7 +649,7 @@ def _check_manifests_match_except_on_entry( assert old_manifest._item_to_digest[shard] == digest def test_folder_model_change_file(self, sample_model_folder): - serializer = serialize_by_file_shard.ShardedFilesSerializer( + serializer = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) manifest = serializer.serialize(sample_model_folder) @@ -666,13 +666,13 @@ def test_folder_model_change_file(self, sample_model_folder): ) def test_max_workers_does_not_change_digest(self, sample_model_folder): - serializer1 = 
serialize_by_file_shard.ShardedFilesSerializer( + serializer1 = serialize_by_file_shard.ManifestSerializer( self._hasher_factory ) - serializer2 = serialize_by_file_shard.ShardedFilesSerializer( + serializer2 = serialize_by_file_shard.ManifestSerializer( self._hasher_factory, max_workers=1 ) - serializer3 = serialize_by_file_shard.ShardedFilesSerializer( + serializer3 = serialize_by_file_shard.ManifestSerializer( self._hasher_factory, max_workers=3 ) diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder index 528ab87c..b3a94824 100644 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder @@ -1 +1 @@ -52fa3c459aec58bc5f9702c73cb3c6b8fd19e9342aa3e4db851e1bde69ab1727 +6deb22c4330a8a9eb5a2d5faa73bf56c64a5c2888961f0f0df51912798fc4954 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder_small_shards index a4f2f81e..f826b95f 100644 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder_small_shards +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/deep_model_folder_small_shards @@ -1 +1 @@ -abd66cd0d8a01f3f552ac5af717f49dc6e6575f0849ec3bfb3c9051962314ce6 +f5203504bea9ec90a7b7453a53c0aaab98a5db5d038dc1fac3613b47f6018959 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file index 9ac3ea65..c3068040 100644 
--- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file @@ -1 +1 @@ -5f2d126b0d3540c17481fdf724e31cf03b4436a2ebabaa1d2e94fe09831be64d +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file_small_shards index 9ac3ea65..c3068040 100644 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file_small_shards +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/empty_model_file_small_shards @@ -1 +1 @@ -5f2d126b0d3540c17481fdf724e31cf03b4436a2ebabaa1d2e94fe09831be64d +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file index b6d24eaf..c3068040 100644 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file @@ -1 +1 @@ -230d217d5f4f388f5087ac4174dbc9b0ff358e3122a1267b0a56669a44f11ea1 +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file_small_shards index b6d24eaf..c3068040 100644 --- 
a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file_small_shards +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/model_folder_with_empty_file_small_shards @@ -1 +1 @@ -230d217d5f4f388f5087ac4174dbc9b0ff358e3122a1267b0a56669a44f11ea1 +e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file index a94a0fa0..8ec1d11f 100644 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file @@ -1 +1 @@ -2ca48c47d5311a9b2f9305519cd5f927dcef09404fc32ef7886abe8f11450eff +14aebf2e466ad30ef59ea6fce67de44dc133c673784bd543b45f75b8efc3d821 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file_small_shards index 5b6697c8..7b4ad705 100644 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file_small_shards +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_file_small_shards @@ -1 +1 @@ -284b613e2e1576d87e5e1c912c82da8d87b6350276f36940516404b2a35f1a74 +beb3cbbd9d73133e85a102a3cbda2ef1dc2bc61e9323e32e576e4adb0571bf86 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder index 7fa49a73..c94ba5d0 100644 --- 
a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder @@ -1 +1 @@ -d22e0441cfa5ac2bc09715ddd88c802a7f97e29c93dc50f5498bab2954958ebb +865a7da87d90b261ce99086bfc61986a6230e6914ad885912b4d22464a9fda13 diff --git a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder_small_shards b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder_small_shards index 161cafdf..6e6fd67f 100644 --- a/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder_small_shards +++ b/model_signing/serialization/testdata/serialize_by_file_shard/TestShardedDFSSerializer/sample_model_folder_small_shards @@ -1 +1 @@ -82bb608d88cf741730c5bcb75a7630f560643acafdd8fa02ad24be20f51c1250 +02be357fc0015ab3d15dbbd363a172f35d2cbd1a854b8e0a6c67fad2e2c3390f