From dfd4712d9fda931a6b9583e151839154868cc829 Mon Sep 17 00:00:00 2001 From: HagonChan <77165006+HagonChan@users.noreply.github.com> Date: Sat, 12 Jul 2025 08:53:51 +0800 Subject: [PATCH] feat: add Domain Control Features for Tavily Search Engine (#401) * feat: add Domain Control Features for Tavily Search Engine * fixed * chore: update config.md --- conf.yaml.example | 15 +++++++++++++++ docs/configuration_guide.md | 22 ++++++++++++++++++++++ src/tools/search.py | 20 ++++++++++++++++++++ 3 files changed, 57 insertions(+) diff --git a/conf.yaml.example b/conf.yaml.example index eb9319c..9249961 100644 --- a/conf.yaml.example +++ b/conf.yaml.example @@ -20,3 +20,18 @@ BASIC_MODEL: # base_url: https://ark-cn-beijing.bytedance.net/api/v3 # model: "doubao-1-5-thinking-pro-m-250428" # api_key: xxxx + +# OTHER SETTINGS: +# Search engine configuration (Only supports Tavily currently) +# SEARCH_ENGINE: +# engine: tavily +# # Only include results from these domains +# include_domains: +# - example.com +# - trusted-news.com +# - reliable-source.org +# - gov.cn +# - edu.cn +# # Exclude results from these domains +# exclude_domains: +# - example.com diff --git a/docs/configuration_guide.md b/docs/configuration_guide.md index 1b16ed6..610e442 100644 --- a/docs/configuration_guide.md +++ b/docs/configuration_guide.md @@ -115,3 +115,25 @@ BASIC_MODEL: api_version: $AZURE_API_VERSION api_key: $AZURE_API_KEY ``` +## About Search Engine + +### How to control search domains for Tavily? + +DeerFlow allows you to control which domains are included or excluded in Tavily search results through the configuration file. This helps improve search result quality and reduce hallucinations by focusing on trusted sources. + +`Tips`: Domain filtering is currently supported only for the Tavily search engine. 
def get_search_config() -> dict:
    """Return the ``SEARCH_ENGINE`` section of ``conf.yaml`` as a dict.

    The section is read through ``load_yaml_config("conf.yaml")``; the path is
    passed through unchanged, so how it is resolved is up to that helper.

    Returns:
        The mapping stored under the ``SEARCH_ENGINE`` key, or an empty dict
        when the key is missing, empty, or not a mapping. Callers can
        therefore always call ``.get(...)`` on the result.
    """
    # NOTE(review): ``or {}`` guards against the loader returning None for an
    # empty file -- confirm against load_yaml_config's actual contract.
    config = load_yaml_config("conf.yaml") or {}
    search_config = config.get("SEARCH_ENGINE")
    # A bare ``SEARCH_ENGINE:`` key whose children are all commented out parses
    # to None (not a missing key), so a ``.get(key, {})`` default would not
    # apply and callers would crash on ``None.get``. Normalise any non-mapping
    # value to an empty dict instead.
    if not isinstance(search_config, dict):
        return {}
    return search_config
include_domains=include_domains, + exclude_domains=exclude_domains, ) elif SELECTED_SEARCH_ENGINE == SearchEngine.DUCKDUCKGO.value: return LoggedDuckDuckGoSearch(