feat: add Domain Control Features for Tavily Search Engine (#401)

* feat: add Domain Control Features for Tavily Search Engine * fixed * chore: update config.md
2026-04-03 06:12:14 +08:00 · 2025-07-12 08:53:51 +08:00
parent 859c6e3c5d
commit dfd4712d9f
3 changed files with 57 additions and 0 deletions
--- a/conf.yaml.example
+++ b/conf.yaml.example
@@ -20,3 +20,18 @@ BASIC_MODEL:
 #   base_url: https://ark-cn-beijing.bytedance.net/api/v3
 #   model: "doubao-1-5-thinking-pro-m-250428"
 #   api_key: xxxx
+
+# OTHER SETTINGS:
+# Search engine configuration (Only supports Tavily currently)
+# SEARCH_ENGINE:
+#   engine: tavily
+#   # Only include results from these domains
+#   include_domains:
+#     - example.com
+#     - trusted-news.com
+#     - reliable-source.org
+#     - gov.cn
+#     - edu.cn
+#   # Exclude results from these domains
+#   exclude_domains:
+#     - unreliable-site.com
--- a/docs/configuration_guide.md
+++ b/docs/configuration_guide.md
@@ -115,3 +115,25 @@ BASIC_MODEL:
  api_version: $AZURE_API_VERSION
  api_key: $AZURE_API_KEY
 ```
+## About Search Engine
+
+### How to control search domains for Tavily?
+
+DeerFlow allows you to control which domains are included in or excluded from Tavily search results through the configuration file. This helps improve search result quality and reduce hallucinations by focusing on trusted sources.
+
+**Tip**: Domain filtering is currently supported only for the Tavily search engine.
+
+You can configure domain filtering in your `conf.yaml` file as follows:
+
+```yaml
+SEARCH_ENGINE:
+  engine: tavily
+  # Only include results from these domains (whitelist)
+  include_domains:
+    - trusted-news.com
+    - gov.org
+    - reliable-source.edu
+  # Exclude results from these domains (blacklist)
+  exclude_domains:
+    - unreliable-site.com
+    - spam-domain.net
+```
--- a/src/tools/search.py
+++ b/src/tools/search.py
@@ -4,12 +4,14 @@
 import json
 import logging
 import os
+from typing import List, Optional

 from langchain_community.tools import BraveSearch, DuckDuckGoSearchResults
 from langchain_community.tools.arxiv import ArxivQueryRun
 from langchain_community.utilities import ArxivAPIWrapper, BraveSearchWrapper

 from src.config import SearchEngine, SELECTED_SEARCH_ENGINE
+from src.config import load_yaml_config
 from src.tools.tavily_search.tavily_search_results_with_images import (
    TavilySearchResultsWithImages,
 )
@@ -25,15 +27,33 @@ LoggedBraveSearch = create_logged_tool(BraveSearch)
 LoggedArxivSearch = create_logged_tool(ArxivQueryRun)


+def get_search_config():
+    config = load_yaml_config("conf.yaml")
+    search_config = config.get("SEARCH_ENGINE", {})
+    return search_config
+
+
 # Get the selected search tool
 def get_web_search_tool(max_search_results: int):
+    search_config = get_search_config()
+
    if SELECTED_SEARCH_ENGINE == SearchEngine.TAVILY.value:
+        # Only get and apply include/exclude domains for Tavily
+        include_domains: Optional[List[str]] = search_config.get("include_domains", [])
+        exclude_domains: Optional[List[str]] = search_config.get("exclude_domains", [])
+
+        logger.info(
+            f"Tavily search configuration loaded: include_domains={include_domains}, exclude_domains={exclude_domains}"
+        )
+
        return LoggedTavilySearch(
            name="web_search",
            max_results=max_search_results,
            include_raw_content=True,
            include_images=True,
            include_image_descriptions=True,
+            include_domains=include_domains,
+            exclude_domains=exclude_domains,
        )
    elif SELECTED_SEARCH_ENGINE == SearchEngine.DUCKDUCKGO.value:
        return LoggedDuckDuckGoSearch(