feat: add context compress (#590)

* feat: Add context compress

* feat: Add unit test

* feat: add unit test for context manager

* feat: add postprocessor param && code format

* feat: add configuration guide

* fix: fix the configuration_guide

* fix: fix the unit test

* fix: fix the default value

* feat: add test and log for context_manager
This commit is contained in:
Fancy-hjyp
2025-09-27 06:42:22 -07:00
committed by GitHub
parent c214999606
commit 5f4eb38fdb
9 changed files with 1032 additions and 7 deletions

View File

@@ -0,0 +1,212 @@
# src/tools/search_postprocessor.py
import re
import base64
import logging
from typing import List, Dict, Any
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
class SearchResultPostProcessor:
    """Search result post-processor.

    Applies a cleanup pipeline to raw web-search result dicts:
      1. remove duplicates (keyed on ``url`` / ``image_url``),
      2. drop low-quality "page" results scoring below ``min_score_threshold``,
      3. strip inline base64-encoded images from text fields,
      4. truncate overly long content (when ``max_content_length_per_page`` is set),
      5. sort by relevance ``score``, descending.

    Input dicts are never mutated; processed results are copies.
    """

    # Matches inline data-URI images such as "data:image/png;base64,....".
    base64_pattern = r"data:image/[^;]+;base64,[a-zA-Z0-9+/=]+"
    # Compiled once at class-definition time so the per-result loop does not
    # re-enter re's pattern-cache lookup on every substitution.
    _base64_re = re.compile(base64_pattern)

    def __init__(self, min_score_threshold: float, max_content_length_per_page: int):
        """
        Initialize the post-processor

        Args:
            min_score_threshold: Minimum relevance score threshold. Falsy or
                non-positive values disable score filtering.
            max_content_length_per_page: Maximum content length. Falsy or
                non-positive values disable truncation.
        """
        self.min_score_threshold = min_score_threshold
        self.max_content_length_per_page = max_content_length_per_page

    def process_results(self, results: List[Dict]) -> List[Dict]:
        """
        Process search results

        Args:
            results: Original search result list
        Returns:
            Processed result list, sorted by descending score
        """
        if not results:
            return []
        # Combined processing in a single loop for efficiency
        cleaned_results = []
        seen_urls = set()
        for result in results:
            # 1. Remove duplicates
            cleaned_result = self._remove_duplicates(result, seen_urls)
            if not cleaned_result:
                continue
            # 2. Filter low quality results (score filter applies to pages only)
            if (
                "page" == cleaned_result.get("type")
                and self.min_score_threshold
                and self.min_score_threshold > 0
                and cleaned_result.get("score", 0) < self.min_score_threshold
            ):
                continue
            # 3. Clean base64 images from content
            cleaned_result = self._remove_base64_images(cleaned_result)
            if not cleaned_result:
                continue
            # 4. When max_content_length_per_page is set, truncate long content
            if (
                self.max_content_length_per_page
                and self.max_content_length_per_page > 0
            ):
                cleaned_result = self._truncate_long_content(cleaned_result)
            if cleaned_result:
                cleaned_results.append(cleaned_result)
        # 5. Sort (by score descending); results without a score sort as 0
        sorted_results = sorted(
            cleaned_results, key=lambda x: x.get("score", 0), reverse=True
        )
        logger.info(
            f"Search result post-processing: {len(results)} -> {len(sorted_results)}"
        )
        return sorted_results

    def _remove_base64_images(self, result: Dict) -> Dict:
        """Remove base64 encoded images from content.

        Dispatches on ``result["type"]``; may return ``{}`` (drop signal)
        for image results whose URL is nothing but base64 data.
        """
        if "page" == result.get("type"):
            cleaned_result = self.processPage(result)
        elif "image" == result.get("type"):
            cleaned_result = self.processImage(result)
        else:
            # For other types, keep as is
            cleaned_result = result.copy()
        return cleaned_result

    def processPage(self, result: Dict) -> Dict:
        """Process page type result: strip base64 images from ``content``
        and ``raw_content``. Returns a cleaned copy."""
        # Clean base64 images from content
        cleaned_result = result.copy()
        if "content" in result:
            original_content = result["content"]
            cleaned_content = self._base64_re.sub(" ", original_content)
            cleaned_result["content"] = cleaned_content
            # Log if significant content was removed
            if len(cleaned_content) < len(original_content) * 0.8:
                logger.debug(
                    f"Removed base64 images from search content: {result.get('url', 'unknown')}"
                )
        # Clean base64 images from raw content
        if "raw_content" in cleaned_result:
            original_raw_content = cleaned_result["raw_content"]
            cleaned_raw_content = self._base64_re.sub(" ", original_raw_content)
            cleaned_result["raw_content"] = cleaned_raw_content
            # Log if significant content was removed
            if len(cleaned_raw_content) < len(original_raw_content) * 0.8:
                logger.debug(
                    f"Removed base64 images from search raw content: {result.get('url', 'unknown')}"
                )
        return cleaned_result

    def processImage(self, result: Dict) -> Dict:
        """Process image type result - clean up base64 data and long fields.

        Returns ``{}`` when the cleaned ``image_url`` is empty or is not an
        http(s) URL, signalling the caller to drop the result.
        """
        cleaned_result = result.copy()
        # Remove base64 encoded data from image_url if present
        if "image_url" in cleaned_result and isinstance(
            cleaned_result["image_url"], str
        ):
            # Check if image_url contains base64 data
            if "data:image" in cleaned_result["image_url"]:
                original_image_url = cleaned_result["image_url"]
                cleaned_image_url = self._base64_re.sub(" ", original_image_url)
                if len(cleaned_image_url) == 0 or not cleaned_image_url.startswith(
                    "http"
                ):
                    logger.debug(
                        f"Removed base64 data from image_url and the cleaned_image_url is empty or not start with http, origin image_url: {result.get('image_url', 'unknown')}"
                    )
                    return {}
                cleaned_result["image_url"] = cleaned_image_url
                logger.debug(
                    f"Removed base64 data from image_url: {result.get('image_url', 'unknown')}"
                )
        # Truncate very long image descriptions
        if "image_description" in cleaned_result and isinstance(
            cleaned_result["image_description"], str
        ):
            if (
                self.max_content_length_per_page
                and len(cleaned_result["image_description"])
                > self.max_content_length_per_page
            ):
                cleaned_result["image_description"] = (
                    cleaned_result["image_description"][
                        : self.max_content_length_per_page
                    ]
                    + "..."
                )
                logger.info(
                    f"Truncated long image description from search result: {result.get('image_url', 'unknown')}"
                )
        return cleaned_result

    def _truncate_long_content(self, result: Dict) -> Dict:
        """Truncate long content; appends "..." when a field is cut."""
        truncated_result = result.copy()
        # Truncate content length
        if "content" in truncated_result:
            content = truncated_result["content"]
            if len(content) > self.max_content_length_per_page:
                truncated_result["content"] = (
                    content[: self.max_content_length_per_page] + "..."
                )
                logger.info(
                    f"Truncated long content from search result: {result.get('url', 'unknown')}"
                )
        # Truncate raw content length (can be slightly longer)
        if "raw_content" in truncated_result:
            raw_content = truncated_result["raw_content"]
            if len(raw_content) > self.max_content_length_per_page * 2:
                truncated_result["raw_content"] = (
                    raw_content[: self.max_content_length_per_page * 2] + "..."
                )
                logger.info(
                    f"Truncated long raw content from search result: {result.get('url', 'unknown')}"
                )
        return truncated_result

    def _remove_duplicates(self, result: Dict, seen_urls: set) -> Dict:
        """Remove duplicate results.

        Keys on ``url`` (falling back to ``image_url``); mutates ``seen_urls``
        as the caller's running registry. Results with no URL are always kept.
        """
        url = result.get("url", result.get("image_url", ""))
        if url and url not in seen_urls:
            seen_urls.add(url)
            return result.copy()  # Return a copy to avoid modifying original
        elif not url:
            # Keep results with empty URLs
            return result.copy()  # Return a copy to avoid modifying original
        return {}  # Return empty dict for duplicates

View File

@@ -11,6 +11,14 @@ from langchain_tavily._utilities import TAVILY_API_URL
from langchain_tavily.tavily_search import (
TavilySearchAPIWrapper as OriginalTavilySearchAPIWrapper,
)
from src.tools.search_postprocessor import SearchResultPostProcessor
from src.config import load_yaml_config
def get_search_config():
    """Return the SEARCH_ENGINE section of conf.yaml (empty dict if absent)."""
    return load_yaml_config("conf.yaml").get("SEARCH_ENGINE", {})
class EnhancedTavilySearchAPIWrapper(OriginalTavilySearchAPIWrapper):
@@ -110,4 +118,13 @@ class EnhancedTavilySearchAPIWrapper(OriginalTavilySearchAPIWrapper):
"image_description": image["description"],
}
clean_results.append(clean_result)
search_config = get_search_config()
clean_results = SearchResultPostProcessor(
min_score_threshold=search_config.get("min_score_threshold"),
max_content_length_per_page=search_config.get(
"max_content_length_per_page"
),
).process_results(clean_results)
return clean_results