feat(llm): Add prompt caching for Anthropic Claude models
Add a prompt caching parameter for all Claude-3 series models: prompt text wrapped in
<prompt-cache> tags is cached to speed up responses. Each model can cache up to 4 text blocks.
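For example, a system prompt can mark its large, reusable portion for caching by wrapping it in the tags. The wording below is purely illustrative and not part of the change; each cached block must exceed 1024 tokens:

    You are a support assistant for ExampleCorp.
    <prompt-cache>
    ...a long, stable documentation excerpt that should be reused across requests...
    </prompt-cache>
    Answer the user's question using only the documentation above.

Untagged text is sent as ordinary text blocks; each tagged block is sent with an ephemeral cache_control marker, as implemented in llm.py below.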
赵旭阳 committed Dec 27, 2024
1 parent 55c327f commit 4fea270
Showing 6 changed files with 76 additions and 4 deletions.
@@ -33,6 +33,15 @@ parameter_rules:
     max: 8192
   - name: response_format
     use_template: response_format
+  - name: prompt_caching
+    label:
+      en_US: Prompt Caching
+      zh_Hans: 提示词缓存
+    type: boolean
+    required: false
+    help:
+      zh_Hans: 缓存提示词以加快响应速度,使用<prompt-cache></prompt-cache> 包裹的提示词会被缓存,最多缓存 4 个块,且每个块的 token 必须大于 1024 个。
+      en_US: Cache prompts for faster responses. Text wrapped in <prompt-cache></prompt-cache> is cached; up to 4 blocks can be cached, and each block must contain more than 1024 tokens.
 pricing:
   input: '3.00'
   output: '15.00'
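When the switch defined above is enabled, the runtime receives it as an ordinary boolean model parameter, roughly like the following (values illustrative only):

    model_parameters = {
        "max_tokens": 8192,
        "prompt_caching": True,
    }

The llm.py change further down pops prompt_caching back out of this dict before the request is built, so only the anthropic-beta header and the cache_control system blocks reach the API.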
@@ -33,6 +33,15 @@ parameter_rules:
     max: 8192
   - name: response_format
     use_template: response_format
+  - name: prompt_caching
+    label:
+      en_US: Prompt Caching
+      zh_Hans: 提示词缓存
+    type: boolean
+    required: false
+    help:
+      zh_Hans: 缓存提示词以加快响应速度,使用<prompt-cache></prompt-cache> 包裹的提示词会被缓存,最多缓存 4 个块,且每个块的 token 必须大于 1024 个。
+      en_US: Cache prompts for faster responses. Text wrapped in <prompt-cache></prompt-cache> is cached; up to 4 blocks can be cached, and each block must contain more than 1024 tokens.
 pricing:
   input: '3.00'
   output: '15.00'
@@ -32,6 +32,15 @@ parameter_rules:
     max: 4096
   - name: response_format
     use_template: response_format
+  - name: prompt_caching
+    label:
+      en_US: Prompt Caching
+      zh_Hans: 提示词缓存
+    type: boolean
+    required: false
+    help:
+      zh_Hans: 缓存提示词以加快响应速度,使用<prompt-cache></prompt-cache> 包裹的提示词会被缓存,最多缓存 4 个块,且每个块的 token 必须大于 1024 个。
+      en_US: Cache prompts for faster responses. Text wrapped in <prompt-cache></prompt-cache> is cached; up to 4 blocks can be cached, and each block must contain more than 1024 tokens.
 pricing:
   input: '0.25'
   output: '1.25'
@@ -32,6 +32,15 @@ parameter_rules:
     max: 4096
   - name: response_format
     use_template: response_format
+  - name: prompt_caching
+    label:
+      en_US: Prompt Caching
+      zh_Hans: 提示词缓存
+    type: boolean
+    required: false
+    help:
+      zh_Hans: 缓存提示词以加快响应速度,使用<prompt-cache></prompt-cache> 包裹的提示词会被缓存,最多缓存 4 个块,且每个块的 token 必须大于 1024 个。
+      en_US: Cache prompts for faster responses. Text wrapped in <prompt-cache></prompt-cache> is cached; up to 4 blocks can be cached, and each block must contain more than 1024 tokens.
 pricing:
   input: '15.00'
   output: '75.00'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ parameter_rules:
max: 4096
- name: response_format
use_template: response_format
- name: prompt_caching
label:
en_US: Prompt Caching
zh_Hans: 提示词缓存
type: boolean
required: false
help:
zh_Hans: 缓存提示词以加快响应速度,使用<prompt-cache></prompt-cache> 包裹的提示词会被缓存,最多缓存 4 个块,且每个块的 token 必须大于 1024 个。
en_US: Cache the prompt for faster response times. The prompts wrapped in <prompt-cache></prompt-cache> will be cached, and up to 4 blocks of prompts can be cached.
pricing:
input: '3.00'
output: '15.00'
35 changes: 31 additions & 4 deletions api/core/model_runtime/model_providers/anthropic/llm/llm.py
@@ -1,6 +1,7 @@
 import base64
 import json
-from collections.abc import Generator, Sequence
+import re
+from collections.abc import Generator, Iterable, Sequence
 from typing import Optional, Union, cast
 
 import anthropic
@@ -127,19 +128,31 @@ def _chat_generate(
             extra_model_kwargs["system"] = system
 
         # Add the new header for claude-3-5-sonnet-20240620 model
-        extra_headers = {}
+        beta_flags = []
         if model == "claude-3-5-sonnet-20240620":
             if model_parameters.get("max_tokens", 0) > 4096:
-                extra_headers["anthropic-beta"] = "max-tokens-3-5-sonnet-2024-07-15"
+                beta_flags.append("max-tokens-3-5-sonnet-2024-07-15")
 
         if any(
             isinstance(content, DocumentPromptMessageContent)
             for prompt_message in prompt_messages
             if isinstance(prompt_message.content, list)
             for content in prompt_message.content
         ):
-            extra_headers["anthropic-beta"] = "pdfs-2024-09-25"
+            beta_flags.append("pdfs-2024-09-25")
 
+        if (
+            any(s in model for s in ["claude-3-5-sonnet", "claude-3-haiku", "claude-3-opus"])
+            and model_parameters.get("prompt_caching") is True
+        ):
+            # remove prompt_caching parameter from model_parameters
+            model_parameters.pop("prompt_caching")
+            # append prompt-caching-2024-07-31
+            beta_flags.append("prompt-caching-2024-07-31")
+            extra_model_kwargs["system"] = self.parse_prompt_with_ephemeral_tags(system)
+        extra_headers = {}
+        if beta_flags:
+            extra_headers["anthropic-beta"] = ",".join(beta_flags)
         if tools:
             extra_model_kwargs["tools"] = [self._transform_tool_prompt(tool) for tool in tools]
             response = client.beta.tools.messages.create(
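The refactor above collects every enabled beta into beta_flags and joins them into a single comma-separated anthropic-beta header instead of overwriting the header for each feature. A minimal sketch of the resulting value, with an illustrative flag combination:

    beta_flags = ["pdfs-2024-09-25", "prompt-caching-2024-07-31"]
    extra_headers = {}
    if beta_flags:
        extra_headers["anthropic-beta"] = ",".join(beta_flags)
    # extra_headers == {"anthropic-beta": "pdfs-2024-09-25,prompt-caching-2024-07-31"}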
@@ -652,3 +665,17 @@ def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
                 anthropic.APIError,
             ],
         }
+
+    def parse_prompt_with_ephemeral_tags(self, system: str) -> Iterable[ToolsBetaMessage]:
+        parts = re.split(r"(<prompt-cache>.*?</prompt-cache>)", system, flags=re.DOTALL)
+
+        result: list[ToolsBetaMessage] = []
+        for part in parts:
+            if part.strip():  # skip whitespace-only fragments between tags
+                if part.startswith("<prompt-cache>") and part.endswith("</prompt-cache>"):
+                    text = part[14:-15].strip()  # drop the surrounding <prompt-cache> tags
+                    result.append({"text": text, "type": "text", "cache_control": {"type": "ephemeral"}})
+                else:
+                    result.append({"text": part.strip(), "type": "text"})
+
+        return result
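Given the regex split on the <prompt-cache> tags, an illustrative call such as parse_prompt_with_ephemeral_tags("Intro <prompt-cache>big reusable context</prompt-cache> outro") returns:

    [
        {"text": "Intro", "type": "text"},
        {"text": "big reusable context", "type": "text", "cache_control": {"type": "ephemeral"}},
        {"text": "outro", "type": "text"},
    ]

Only the tagged block carries cache_control, which is the block shape the prompt-caching-2024-07-31 beta expects in the system field.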
