Skip to content

Commit

Permalink
support from_pretrained
Browse files (browse the repository at this point in the history)
  • Loading branch information
Sagi Polaczek committed Oct 10, 2024
1 parent 5b237b4 commit 522a657
Showing 1 changed file with 13 additions and 3 deletions.
16 changes: 13 additions & 3 deletions fuse/data/tokenizers/modular_tokenizer/op.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(
validate_ends_with_eos: Optional[bool] = True,
eos: Optional[str] = "<EOS>",
verbose: Optional[bool] = False,
on_unknown_default_value: str = "warn",
on_unknown_default_value: Optional[str] = None,
**kwargs: Any,
) -> None:
"""
Expand Down Expand Up @@ -62,7 +62,13 @@ def __init__(

self._validate_ends_with_eos = validate_ends_with_eos
self._eos = eos
self._on_unknown_default_value = on_unknown_default_value
if on_unknown_default_value is not None:
self._on_unknown_default_value = on_unknown_default_value
else:
self._on_unknown_default_value = "warn"

if on_unknown_default_value not in ["warn", "raise"]:
raise ValueError(f"Doesn't support {on_unknown_default_value=}!")

if self._validate_ends_with_eos:
eos_id = self._tokenizer.token_to_id(self._eos)
Expand Down Expand Up @@ -538,6 +544,7 @@ def from_pretrained(
identifier: str,
pad_token: str = "<PAD>",
max_size: Optional[int] = None,
on_unknown_default_value: Optional[str] = None,
force_download: bool = False,
resume_download: Optional[bool] = None,
proxies: Optional[Dict] = None,
Expand Down Expand Up @@ -577,7 +584,10 @@ def from_pretrained(
) from e

tokenizer_op = cls(
tokenizer_path=identifier, pad_token=pad_token, max_size=max_size
tokenizer_path=identifier,
pad_token=pad_token,
max_size=max_size,
on_unknown_default_value=on_unknown_default_value,
)
return tokenizer_op

Expand Down

0 comments on commit 522a657

Please sign in to comment.