diff --git a/setup.cfg b/setup.cfg
index 3053b4e93..f4f75362f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ install_requires =
     accelerate>=0.27.2
     huggingface-hub>=0.11.1,<1.0.0
     tokenizers>=0.13.3
-    transformers==4.38.2  # if you change this, please also change version assert in petals/__init__.py
+    transformers==4.41.2  # if you change this, please also change version assert in petals/__init__.py
     speedtest-cli==2.1.3
     pydantic>=1.10,<2.0  # 2.0 is incompatible with hivemind yet
     hivemind==1.1.10.post2
@@ -50,6 +50,7 @@ install_requires =
     peft==0.5.0
    safetensors>=0.3.1
     Dijkstar>=2.6.0
+    numpy<2

 [options.extras_require]
 dev =
diff --git a/src/petals/__init__.py b/src/petals/__init__.py
index ccc560e0c..55e814e7f 100644
--- a/src/petals/__init__.py
+++ b/src/petals/__init__.py
@@ -22,8 +22,8 @@
 if not os.getenv("PETALS_IGNORE_DEPENDENCY_VERSION"):
     assert (
-        version.parse("4.38.2") <= version.parse(transformers.__version__) < version.parse("4.39.0")
-    ), "Please install a proper transformers version: pip install transformers>=4.37.1,<4.39.0"
+        version.parse("4.41.2") <= version.parse(transformers.__version__) < version.parse("4.42.0")
+    ), "Please install a proper transformers version: pip install transformers>=4.41.2,<4.42.0"


 def _override_bfloat16_mode_default():
diff --git a/src/petals/client/remote_generation.py b/src/petals/client/remote_generation.py
index 0060edeeb..4405121ec 100644
--- a/src/petals/client/remote_generation.py
+++ b/src/petals/client/remote_generation.py
@@ -22,20 +22,20 @@
 class RemotePastKeyValues(Cache):
     def __init__(self) -> None:
         super().__init__()
-        self.seen_tokens = 0
+        self._seen_tokens = 0
         self.hypo_ids: Optional[torch.LongTensor] = None

     def __getitem__(self, _index: int) -> List[torch.Tensor]:
         return [DUMMY]  # For compatibility with BloomForCausalLM.prepare_inputs_for_generation()

     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-        return self.seen_tokens
+        return self._seen_tokens

     def get_max_length(self) -> Optional[int]:
         return None

     def update_seen(self, new_seen: int) -> None:
-        self.seen_tokens += new_seen
+        self._seen_tokens += new_seen

     def reorder_cache(self, beam_idx):
         raise NotImplementedError("Beam search reordering is not implemented yet")
diff --git a/src/petals/models/bloom/model.py b/src/petals/models/bloom/model.py
index 67d2f35f7..98b6e84e5 100644
--- a/src/petals/models/bloom/model.py
+++ b/src/petals/models/bloom/model.py
@@ -131,7 +131,7 @@ def prepare_inputs_for_generation(
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
+                past_length = past_key_values._seen_tokens
                 max_cache_length = past_key_values.get_max_length()
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
diff --git a/src/petals/models/llama/block.py b/src/petals/models/llama/block.py
index 2eb8f731f..77f9a05f1 100644
--- a/src/petals/models/llama/block.py
+++ b/src/petals/models/llama/block.py
@@ -87,10 +87,7 @@ def forward(
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)

        if q_len == 1 and torch.is_inference_mode_enabled() and hidden_states.device.type == "cuda":