BukeLy · Copilot · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025 · BukeLy
diff --git a/env.example b/env.example
@@ -31,6 +31,14 @@ LLM_TOKENS_PER_MINUTE=40000        # 每分钟最大令牌数（包含输入+输
 #                                  # 推荐：不设置此项，让系统自动计算以确保不超过 TPM/RPM 限制
 #                                  # 计算示例：min(800, 40000/3500) = min(800, 11) = 11 并发
 
+# --- LLM Token 估算配置（用于速率限制） ---
+# 估算输出 tokens 数量，用于速率限制计算
+# 如果估算过高，并发会受限；如果估算过低，可能触发 429 错误
+# LLM_ESTIMATED_OUTPUT_TOKENS=3000  # LLM 输出估算（实体提取约 3000 tokens，默认 3000）
+# LLM_VLM_ESTIMATED_OUTPUT_TOKENS=500  # VLM 输出估算（图片描述较短，默认 500）
+# LLM_VLM_MAX_TOKENS=500            # VLM API 请求的 max_tokens（输出硬上限，默认 500）；LLM_VLM_ESTIMATED_OUTPUT_TOKENS 仅用于速率限制估算，建议不超过此值
+# LLM_VLM_IMAGE_TOKENS_ESTIMATE=200 # VLM 图片输入估算 tokens（默认 200）
+
 # ====== Embedding 配置 ======
 # 用于向量化文本，支持语义检索
 EMBEDDING_BASE_URL="https://api.siliconflow.cn/v1"
@@ -142,6 +150,9 @@ DS_OCR_REQUESTS_PER_MINUTE=800     # 每分钟最大请求数（默认 800）
 DS_OCR_TOKENS_PER_MINUTE=40000     # 每分钟最大令牌数（默认 40000）
 # DS_OCR_MAX_ASYNC=8               # 【可选】全局默认并发数（未设置时使用硬编码默认值 8）
 
+# --- DeepSeek-OCR Token 估算配置（用于速率限制） ---
+# DS_OCR_IMAGE_TOKENS_ESTIMATE=1000  # 图片输入估算 tokens（默认 1000）
+
 # ====== 智能 Parser 选择器配置（v2.0） ======
 # 基于文档复杂度自动选择最优 Parser 和模式
 

diff --git a/src/config.py b/src/config.py
@@ -30,6 +30,26 @@ class LLMConfig(BaseSettings):
     tokens_per_minute: int = Field(default=40000, description="Maximum tokens per minute (input + output)")
     max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)")
 
+    # Token estimation for rate limiting (LLM)
+    estimated_output_tokens: int = Field(
+        default=3000,
+        description="Estimated output tokens for LLM calls (entity extraction typically outputs ~3000 tokens)"
+    )
+
+    # Token estimation for rate limiting (VLM)
+    vlm_estimated_output_tokens: int = Field(
+        default=500,
+        description="Estimated output tokens for VLM calls (image descriptions are typically shorter)"
+    )
+    vlm_max_tokens: int = Field(
+        default=500,
+        description="Maximum output tokens for VLM API calls"
+    )
+    vlm_image_tokens_estimate: int = Field(
+        default=200,
+        description="Estimated tokens for image input in VLM calls"
+    )
+
     class Config:
         env_prefix = "LLM_"
         env_file = ".env"
@@ -149,6 +169,12 @@ class DeepSeekOCRConfig(BaseSettings):
     tokens_per_minute: int = Field(default=40000, description="Maximum tokens per minute")
     max_async: Optional[int] = Field(default=None, description="Maximum concurrent requests (optional, auto-calculated if not set)")
 
+    # Token estimation for rate limiting
+    image_tokens_estimate: int = Field(
+        default=1000,
+        description="Estimated tokens for image input in OCR calls"
+    )
+
     class Config:
         env_prefix = "DS_OCR_"
         env_file = ".env"

diff --git a/src/deepseek_ocr_client.py b/src/deepseek_ocr_client.py
@@ -57,6 +57,9 @@ class DSSeekConfig:
     fallback_mode: str = field(default_factory=lambda: config.ds_ocr.fallback_mode)
     min_output_threshold: int = field(default_factory=lambda: config.ds_ocr.min_output_threshold)
 
+    # Token 估算配置
+    image_tokens_estimate: int = field(default_factory=lambda: config.ds_ocr.image_tokens_estimate)
+
     def __post_init__(self):
         """验证配置"""
         if not self.api_key:
@@ -281,8 +284,8 @@ async def _call_api(self, img_base64: str, prompt: str) -> str:
         Raises:
             Exception: API 调用失败时抛出异常
         """
-        # 估算 tokens（提示词 + 图片约 1000 tokens + 输出约 2000 tokens）
-        estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens
+        # 估算 tokens（提示词 + 图片 + 输出）
+        estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens
 
         # 获取速率限制许可
         await self.rate_limiter.rate_limiter.acquire(estimated_tokens)
@@ -348,8 +351,8 @@ def _call_api_sync(self, img_base64: str, prompt: str) -> str:
         """
         import asyncio
 
-        # 估算 tokens（提示词 + 图片约 1000 tokens + 输出约 2000 tokens）
-        estimated_tokens = len(prompt) // 3 + 1000 + self.config.max_tokens
+        # 估算 tokens（提示词 + 图片 + 输出）
+        estimated_tokens = len(prompt) // 3 + self.config.image_tokens_estimate + self.config.max_tokens
 
         # 在同步函数中调用异步速率限制器
         try:

diff --git a/src/multi_tenant.py b/src/multi_tenant.py
@@ -64,6 +64,12 @@ def __init__(
         self.max_async = config.llm.max_async
         self.vlm_timeout = config.llm.vlm_timeout
 
+        # Token 估算配置
+        self.llm_estimated_output_tokens = config.llm.estimated_output_tokens
+        self.vlm_estimated_output_tokens = config.llm.vlm_estimated_output_tokens
+        self.vlm_max_tokens = config.llm.vlm_max_tokens
+        self.vlm_image_tokens_estimate = config.llm.vlm_image_tokens_estimate
+
         # 存储配置
         self.use_external_storage = config.storage.use_external
         self.kv_storage = config.storage.kv_storage
@@ -106,11 +112,14 @@ def _create_llm_func(self, llm_config: Dict):
         # 获取 rate_limiter 实际使用的并发数（将用于 LightRAG）
         actual_max_concurrent = rate_limiter.max_concurrent
 
+        # 获取 token 估算配置（支持租户覆盖）
+        llm_estimated_output = llm_config.get("estimated_output_tokens", self.llm_estimated_output_tokens)
+
         def llm_model_func(prompt, **kwargs):
             # 精确计算输入 tokens（使用 tiktoken）
             input_tokens = count_tokens(prompt, model="cl100k_base")
             # 保守估算输出 tokens（实体提取通常输出较长）
-            estimated_output = 3000  # 50 entities + 46 relations ≈ 3000 tokens
+            estimated_output = llm_estimated_output  # 从配置读取
             estimated_tokens = input_tokens + estimated_output
 
             # Debug: 输出 token 计数
@@ -295,6 +304,11 @@ def _create_vision_model_func(self, llm_config: Dict):
             tokens_per_minute=tokens_per_minute
         )
 
+        # 获取 VLM token 估算配置（支持租户覆盖）
+        vlm_image_tokens = llm_config.get("vlm_image_tokens_estimate", self.vlm_image_tokens_estimate)
+        vlm_estimated_output = llm_config.get("vlm_estimated_output_tokens", self.vlm_estimated_output_tokens)
+        vlm_max_tokens = llm_config.get("vlm_max_tokens", self.vlm_max_tokens)
+
         async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: str) -> str:
             """
             使用 VLM 理解图片内容（带速率限制）
@@ -309,8 +323,8 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st
             """
             # 精确计算 tokens（使用 tiktoken）
             prompt_tokens = count_tokens(prompt, model="cl100k_base")
-            image_tokens = 200  # 图片约 200 tokens（固定估算）
-            estimated_output = 500  # VLM 输出通常较短
+            image_tokens = vlm_image_tokens  # 从配置读取
+            estimated_output = vlm_estimated_output  # 从配置读取
             estimated_tokens = prompt_tokens + image_tokens + estimated_output
 
             # Debug: 输出 token 计数
@@ -336,7 +350,7 @@ async def seed_vision_model_func(prompt: str, image_data: str, system_prompt: st
                             ]
                         }
                     ],
-                    "max_tokens": 500,
+                    "max_tokens": vlm_max_tokens,  # 从配置读取
                     "temperature": 0.1
                 }