feat: support dify in rag module (#550)

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
Chayton Bai
2025-09-16 20:30:45 +08:00
committed by GitHub
parent 5085bf8ee9
commit 7694bb5d72
19 changed files with 407 additions and 87 deletions

View File

@@ -2,6 +2,7 @@
# SPDX-License-Identifier: MIT
from .builder import build_retriever
from .dify import DifyProvider
from .ragflow import RAGFlowProvider
from .moi import MOIProvider
from .retriever import Chunk, Document, Resource, Retriever
@@ -11,6 +12,7 @@ __all__ = [
Retriever,
Document,
Resource,
DifyProvider,
RAGFlowProvider,
MOIProvider,
VikingDBKnowledgeBaseProvider,

View File

@@ -2,14 +2,17 @@
# SPDX-License-Identifier: MIT
from src.config.tools import SELECTED_RAG_PROVIDER, RAGProvider
from src.rag.dify import DifyProvider
from src.rag.milvus import MilvusProvider
from src.rag.ragflow import RAGFlowProvider
from src.rag.moi import MOIProvider
from src.rag.retriever import Retriever
from src.rag.vikingdb_knowledge_base import VikingDBKnowledgeBaseProvider
from src.rag.milvus import MilvusProvider
def build_retriever() -> Retriever | None:
if SELECTED_RAG_PROVIDER == RAGProvider.DIFY.value:
return DifyProvider()
if SELECTED_RAG_PROVIDER == RAGProvider.RAGFLOW.value:
return RAGFlowProvider()
elif SELECTED_RAG_PROVIDER == RAGProvider.MOI.value:

132
src/rag/dify.py Normal file
View File

@@ -0,0 +1,132 @@
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
import os
from urllib.parse import urlparse
import requests
from src.rag.retriever import Chunk, Document, Resource, Retriever
class DifyProvider(Retriever):
"""
DifyProvider is a provider that uses dify to retrieve documents.
"""
api_url: str
api_key: str
def __init__(self):
api_url = os.getenv("DIFY_API_URL")
if not api_url:
raise ValueError("DIFY_API_URL is not set")
self.api_url = api_url
api_key = os.getenv("DIFY_API_KEY")
if not api_key:
raise ValueError("DIFY_API_KEY is not set")
self.api_key = api_key
def query_relevant_documents(
self, query: str, resources: list[Resource] = []
) -> list[Document]:
if not resources:
return []
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
all_documents = {}
for resource in resources:
dataset_id, _ = parse_uri(resource.uri)
payload = {
"query": query,
"retrieval_model": {
"search_method": "hybrid_search",
"reranking_enable": False,
"weights": {
"weight_type": "customized",
"keyword_setting": {"keyword_weight": 0.3},
"vector_setting": {"vector_weight": 0.7},
},
"top_k": 3,
"score_threshold_enabled": True,
"score_threshold": 0.5,
},
}
response = requests.post(
f"{self.api_url}/datasets/{dataset_id}/retrieve",
headers=headers,
json=payload,
)
if response.status_code != 200:
raise Exception(f"Failed to query documents: {response.text}")
result = response.json()
records = result.get("records", {})
for record in records:
segment = record.get("segment")
if not segment:
continue
document_info = segment.get("document")
if not document_info:
continue
doc_id = document_info.get("id")
doc_name = document_info.get("name")
if not doc_id or not doc_name:
continue
if doc_id not in all_documents:
all_documents[doc_id] = Document(
id=doc_id, title=doc_name, chunks=[]
)
chunk = Chunk(
content=segment.get("content", ""),
similarity=record.get("score", 0.0),
)
all_documents[doc_id].chunks.append(chunk)
return list(all_documents.values())
def list_resources(self, query: str | None = None) -> list[Resource]:
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
params = {}
if query:
params["keyword"] = query
response = requests.get(
f"{self.api_url}/datasets", headers=headers, params=params
)
if response.status_code != 200:
raise Exception(f"Failed to list resources: {response.text}")
result = response.json()
resources = []
for item in result.get("data", []):
item = Resource(
uri=f"rag://dataset/{item.get('id')}",
title=item.get("name", ""),
description=item.get("description", ""),
)
resources.append(item)
return resources
def parse_uri(uri: str) -> tuple[str, str]:
    """Parse a "rag://dataset/<id>[#fragment]" URI.

    Args:
        uri: resource URI as produced by DifyProvider.list_resources.

    Returns:
        A (dataset_id, fragment) tuple.

    Raises:
        ValueError: if the scheme is not "rag" or no dataset id is present.
    """
    parsed = urlparse(uri)
    if parsed.scheme != "rag":
        raise ValueError(f"Invalid URI: {uri}")
    # path is "/<dataset_id>"; guard the split so a malformed URI raises
    # ValueError (consistent with the scheme check) instead of IndexError.
    parts = parsed.path.split("/")
    if len(parts) < 2 or not parts[1]:
        raise ValueError(f"Invalid URI: {uri}")
    return parts[1], parsed.fragment

View File

@@ -7,11 +7,12 @@ from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set
from langchain_milvus.vectorstores import Milvus as LangchainMilvus
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
from langchain_openai import OpenAIEmbeddings
from openai import OpenAI
from pymilvus import CollectionSchema, DataType, FieldSchema, MilvusClient
from src.config.loader import get_bool_env, get_int_env, get_str_env
from src.rag.retriever import Chunk, Document, Resource, Retriever
from src.config.loader import get_bool_env, get_str_env, get_int_env
logger = logging.getLogger(__name__)
@@ -466,7 +467,7 @@ class MilvusRetriever(Retriever):
resources.append(
Resource(
uri=r.get(self.url_field, "")
or f"milvus://{r.get(self.id_field,'')}",
or f"milvus://{r.get(self.id_field, '')}",
title=r.get(self.title_field, "")
or r.get(self.id_field, "Unnamed"),
description="Stored Milvus document",
@@ -476,21 +477,23 @@ class MilvusRetriever(Retriever):
# Use similarity_search_by_vector for lightweight listing.
# If a query is provided embed it; else use a zero vector.
docs: Iterable[Any] = self.client.similarity_search(
query, k=100, expr="source == 'examples'" # Limit to 100 results
query,
k=100,
expr="source == 'examples'", # Limit to 100 results
)
for d in docs:
meta = getattr(d, "metadata", {}) or {}
# check if the resource is in the list of resources
if resources and any(
r.uri == meta.get(self.url_field, "")
or r.uri == f"milvus://{meta.get(self.id_field,'')}"
or r.uri == f"milvus://{meta.get(self.id_field, '')}"
for r in resources
):
continue
resources.append(
Resource(
uri=meta.get(self.url_field, "")
or f"milvus://{meta.get(self.id_field,'')}",
or f"milvus://{meta.get(self.id_field, '')}",
title=meta.get(self.title_field, "")
or meta.get(self.id_field, "Unnamed"),
description="Stored Milvus document",