From 59b45f30bef14b04dcfd2dc8522dbea44d09c370 Mon Sep 17 00:00:00 2001
From: Sagi Polaczek
Date: Mon, 16 Sep 2024 01:39:51 +0300
Subject: [PATCH 1/5] added changes to op. save, load and push_to_hub

---
 .../modular_tokenizer/modular_tokenizer.py   |  4 +-
 fuse/data/tokenizers/modular_tokenizer/op.py | 75 ++++++++++++++++++-
 setup.cfg                                    |  2 +-
 3 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py b/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py
index 2d57c107b..4f9c354e5 100644
--- a/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py
+++ b/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py
@@ -6,6 +6,7 @@
 from typing import Optional, List, Set, Union, Tuple, Any, Iterator
 import json
 import transformers
+from pathlib import Path
 import os
 from omegaconf import OmegaConf
 import collections
@@ -682,7 +683,7 @@ def save_jsons(self, tokenizers_info: Optional[List] = None) -> None:
                 os.makedirs(os.path.dirname(out_path))
             tokenizer_inst.save(out_path)
 
-    def save(self, path: str) -> None:
+    def save(self, path: Union[str, Path]) -> None:
         """Saves all information needed to reconstruct the modular tokenizer to path.
         After saving, path will contain the following:
         - json files: modular json files (i.e. that have common special tokens, and that all map to consistent ID space)
@@ -718,6 +719,7 @@ def set_field(tokenizers_info_cfg: List, name: str, key: str, val: Any) -> List:
                     return tokenizers_info_cfg
             raise Exception(f"name {name} not found")
 
+        path = str(path)
         if path.endswith(".json") or path.endswith(".yaml"):
             path = os.path.dirname(path)
 
diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index 445497b45..2f24ef353 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -4,11 +4,14 @@
 from fuse.data.tokenizers.modular_tokenizer.inject_utils import (
     InjectorToModularTokenizerLib,
 )
+from huggingface_hub import snapshot_download, HfApi
+from huggingface_hub.utils import validate_hf_hub_args, SoftTemporaryDirectory
 
 from warnings import warn
+from pathlib import Path
 from collections import defaultdict
-from typing import Tuple, Optional, Union, Any
+from typing import Any, Tuple, Dict, List, Optional, Union
 import os
 import re
 
@@ -506,3 +509,73 @@ def __call__(
         )
 
         return sample_dict
+
+    @classmethod
+    def from_pretrained(
+        cls, identifier: str, pad_token: str = "<PAD>", max_size: Optional[int] = None
+    ) -> "ModularTokenizerOp":
+        if not os.path.isdir(identifier):
+            # Try to download from hub
+            try:
+                # Download the entire repo
+                identifier = snapshot_download(
+                    repo_id=str(identifier),
+                    # revision=revision,
+                    # cache_dir=cache_dir,
+                    # force_download=force_download,
+                    # proxies=proxies,
+                    # resume_download=resume_download,
+                    # token=token,
+                    # local_files_only=local_files_only,
+                    allow_patterns="tokenizer/",
+                )
+                identifier = os.path.join(identifier, "tokenizer")
+            except Exception as e:
+                raise Exception(
+                    f"Couldn't find the checkpoint path nor download from HF hub! {identifier}"
+                ) from e
+
+        tokenizer_op = cls(
+            tokenizer_path=identifier, pad_token=pad_token, max_size=max_size
+        )
+        return tokenizer_op
+
+    def save_pretrained(self, save_directory: Union[str, Path]) -> None:
+        print(f"Saving @ {save_directory=}")
+        self._tokenizer.save(path=save_directory)
+
+    @validate_hf_hub_args
+    def push_to_hub(
+        self,
+        repo_id: str,
+        *,
+        commit_message: str = "Push model using huggingface_hub.",
+        private: bool = False,
+        token: Optional[str] = None,
+        branch: Optional[str] = None,
+        create_pr: Optional[bool] = None,
+        allow_patterns: Optional[Union[List[str], str]] = None,
+        ignore_patterns: Optional[Union[List[str], str]] = None,
+        delete_patterns: Optional[Union[List[str], str]] = None,
+        model_card_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        api = HfApi(token=token)
+        repo_id = api.create_repo(
+            repo_id=repo_id, private=private, exist_ok=True
+        ).repo_id
+        # Push the files to the repo in a single commit
+        with SoftTemporaryDirectory() as tmp:
+            saved_path = Path(tmp) / repo_id
+            tokenizer_dirpath = saved_path / "tokenizer"
+            self.save_pretrained(tokenizer_dirpath)
+            return api.upload_folder(
+                repo_id=repo_id,
+                repo_type="model",
+                folder_path=saved_path,
+                commit_message=commit_message,
+                revision=branch,
+                create_pr=create_pr,
+                allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
+                delete_patterns=delete_patterns,
+            )
diff --git a/setup.cfg b/setup.cfg
index aa58f1453..fc9b9cc65 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -58,7 +58,7 @@ exclude =
 
 
 [mypy]
-python_version = 3.7
+python_version = 3.9
 warn_return_any = True
 warn_unused_configs = True
 disallow_untyped_defs = True
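Taken together, this first patch gives ModularTokenizerOp a hub-style round
trip. A minimal usage sketch under the patch's own API; the repo id
"my-org/my-tokenizer" is a placeholder, and an HF token is assumed to be
available to huggingface_hub:

    from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp

    # Load from a local checkpoint dir, or fall back to downloading the
    # repo's tokenizer/ folder from the HF hub.
    tokenizer_op = ModularTokenizerOp.from_pretrained("my-org/my-tokenizer")

    # Save into a temporary directory under tokenizer/ and upload it to the
    # hub in a single commit.
    tokenizer_op.push_to_hub(repo_id="my-org/my-tokenizer")

Note that at this stage from_pretrained hard-codes the download behaviour
(revision, caching, etc. are still commented out); patch 3 wires those
arguments through.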
{identifier}" + ) from e + + tokenizer_op = cls( + tokenizer_path=identifier, pad_token=pad_token, max_size=max_size + ) + return tokenizer_op + + def save_pretrained(self, save_directory: Union[str, Path]) -> None: + print(f"Saving @ {save_directory=}") + self._tokenizer.save(path=save_directory) + + @validate_hf_hub_args + def push_to_hub( + self, + repo_id: str, + *, + commit_message: str = "Push model using huggingface_hub.", + private: bool = False, + token: Optional[str] = None, + branch: Optional[str] = None, + create_pr: Optional[bool] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + delete_patterns: Optional[Union[List[str], str]] = None, + model_card_kwargs: Optional[Dict[str, Any]] = None, + ) -> None: + api = HfApi(token=token) + repo_id = api.create_repo( + repo_id=repo_id, private=private, exist_ok=True + ).repo_id + # Push the files to the repo in a single commit + with SoftTemporaryDirectory() as tmp: + saved_path = Path(tmp) / repo_id + tokenzier_dirpath = saved_path / "tokenizer" + self.save_pretrained(tokenzier_dirpath) + return api.upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=saved_path, + commit_message=commit_message, + revision=branch, + create_pr=create_pr, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + delete_patterns=delete_patterns, + ) diff --git a/setup.cfg b/setup.cfg index aa58f1453..fc9b9cc65 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,7 +58,7 @@ exclude = [mypy] -python_version = 3.7 +python_version = 3.9 warn_return_any = True warn_unused_configs = True disallow_untyped_defs = True From 37fe63d9b67e27e8785439507e1996f97336c6da Mon Sep 17 00:00:00 2001 From: Sagi Polaczek Date: Tue, 17 Sep 2024 00:08:48 +0300 Subject: [PATCH 2/5] reverted changes in modular_tokenizer.py --- fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py | 4 +--- fuse/data/tokenizers/modular_tokenizer/op.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py b/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py index 4f9c354e5..2d57c107b 100644 --- a/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py +++ b/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py @@ -6,7 +6,6 @@ from typing import Optional, List, Set, Union, Tuple, Any, Iterator import json import transformers -from pathlib import Path import os from omegaconf import OmegaConf import collections @@ -683,7 +682,7 @@ def save_jsons(self, tokenizers_info: Optional[List] = None) -> None: os.makedirs(os.path.dirname(out_path)) tokenizer_inst.save(out_path) - def save(self, path: Union[str, Path]) -> None: + def save(self, path: str) -> None: """Saves all information needed to reconstruct the modular tokenizer to path. After saving, path will contain the following: - json files: modular json files (i.e. 
From c60edb331edd44bbddbd9a25127e664eed74cd85 Mon Sep 17 00:00:00 2001
From: Sagi Polaczek
Date: Tue, 17 Sep 2024 00:20:11 +0300
Subject: [PATCH 3/5] cleaning code

---
 fuse/data/tokenizers/modular_tokenizer/op.py | 30 +++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index 21ccfc371..cd2ac0bc1 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -512,23 +512,34 @@ def __call__(
 
     @classmethod
     def from_pretrained(
-        cls, identifier: str, pad_token: str = "<PAD>", max_size: Optional[int] = None
+        cls,
+        identifier: str,
+        pad_token: str = "<PAD>",
+        max_size: Optional[int] = None,
+        force_download: bool = False,
+        resume_download: Optional[bool] = None,
+        proxies: Optional[Dict] = None,
+        token: Optional[Union[str, bool]] = None,
+        cache_dir: Optional[Union[str, Path]] = None,
+        local_files_only: bool = False,
+        revision: Optional[str] = None,
    ) -> "ModularTokenizerOp":
         if not os.path.isdir(identifier):
             # Try to download from hub
             try:
-                # Download the entire repo
+                # Download 'tokenizer' folder from repo
                 identifier = snapshot_download(
                     repo_id=str(identifier),
-                    # revision=revision,
-                    # cache_dir=cache_dir,
-                    # force_download=force_download,
-                    # proxies=proxies,
-                    # resume_download=resume_download,
-                    # token=token,
-                    # local_files_only=local_files_only,
+                    revision=revision,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    token=token,
+                    local_files_only=local_files_only,
                     allow_patterns="tokenizer/",
                 )
+                # Redirecting identifier to the downloaded folder
                 identifier = os.path.join(identifier, "tokenizer")
             except Exception as e:
                 raise Exception(
@@ -557,7 +568,6 @@ def push_to_hub(
         allow_patterns: Optional[Union[List[str], str]] = None,
         ignore_patterns: Optional[Union[List[str], str]] = None,
         delete_patterns: Optional[Union[List[str], str]] = None,
-        model_card_kwargs: Optional[Dict[str, Any]] = None,
     ) -> None:
         api = HfApi(token=token)
         repo_id = api.create_repo(

From 7ccb6d0d48d77a8673519311cdd262d02bfcbe2f Mon Sep 17 00:00:00 2001
From: Sagi Polaczek
Date: Wed, 18 Sep 2024 13:36:30 +0300
Subject: [PATCH 4/5] added comment

---
 fuse/data/tokenizers/modular_tokenizer/op.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index cd2ac0bc1..cc36e8dd3 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -524,6 +524,12 @@ def from_pretrained(
         local_files_only: bool = False,
         revision: Optional[str] = None,
     ) -> "ModularTokenizerOp":
+        """Load pre-trained tokenizer from HF repo_id or a local dirpath.
+
+        Args:
+            identifier (str): repo_id or local dirpath.
+            * For other args see `snapshot_download()`
+        """
         if not os.path.isdir(identifier):
             # Try to download from hub
             try:
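With the snapshot_download arguments exposed in patch 3 and documented in
patch 4, callers can pin a revision or insist on a cached copy. A sketch with
placeholder values:

    from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp

    tokenizer_op = ModularTokenizerOp.from_pretrained(
        "my-org/my-tokenizer",      # placeholder repo id
        revision="v1.0",            # pin a tag or commit instead of the default branch
        cache_dir="/tmp/hf-cache",  # keep downloads out of the default HF cache
        local_files_only=True,      # fail fast rather than hit the network
    )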
From e3bff42f67b7260e333fe07624d7d2b4c282cbc7 Mon Sep 17 00:00:00 2001
From: Sagi Polaczek
Date: Wed, 18 Sep 2024 13:39:51 +0300
Subject: [PATCH 5/5] improved docu

---
 fuse/data/tokenizers/modular_tokenizer/op.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index cc36e8dd3..0f2b85697 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -527,7 +527,9 @@ def from_pretrained(
         """Load pre-trained tokenizer from HF repo_id or a local dirpath.
 
         Args:
-            identifier (str): repo_id or local dirpath.
+            identifier (str): A repo_id or local dirpath.
+            pad_token (str, optional): A string of the pad token. Defaults to "<PAD>".
+            max_size (Optional[int], optional): Sequences below this size will be padded, and above this size will be truncated. Defaults to None.
             * For other args see `snapshot_download()`
         """
         if not os.path.isdir(identifier):
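Putting the documented arguments together, an explicit local round trip looks
like this (paths are placeholders; "<PAD>" and the max_size value are simply
the documented default and an arbitrary choice):

    from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp

    tokenizer_op = ModularTokenizerOp.from_pretrained(
        "/path/to/tokenizer",  # a local dirpath, so no hub download is attempted
        pad_token="<PAD>",
        max_size=512,          # pad shorter sequences, truncate longer ones
    )
    tokenizer_op.save_pretrained("/path/to/tokenizer_copy")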