feat: RAG Integration (#238)

* feat: add rag provider and retriever

* feat: retriever tool

* feat: add retriever tool to the researcher node

* feat: add rag http apis

* feat: new message input supports resource mentions

* feat: new message input component support resource mentions

* refactor: need_web_search to need_search

* chore: RAG integration docs

* chore: change example api host

* fix: user message color in dark mode

* fix: mentions style

* feat: add local_search_tool to researcher prompt

* chore: research prompt

* fix: ragflow page size and reporter with

* docs: ragflow integration and add acknowledgment projects

* chore: format
This commit is contained in:
JeffJiang
2025-05-28 14:13:46 +08:00
committed by GitHub
parent 0565ab6d27
commit 462752b462
43 changed files with 1172 additions and 181 deletions

5
src/rag/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
from .retriever import Retriever, Document, Resource
from .ragflow import RAGFlowProvider
from .builder import build_retriever
__all__ = [Retriever, Document, Resource, RAGFlowProvider, build_retriever]

11
src/rag/builder.py Normal file
View File

@@ -0,0 +1,11 @@
from src.config.tools import SELECTED_RAG_PROVIDER, RAGProvider
from src.rag.ragflow import RAGFlowProvider
from src.rag.retriever import Retriever
def build_retriever() -> Retriever | None:
if SELECTED_RAG_PROVIDER == RAGProvider.RAGFLOW.value:
return RAGFlowProvider()
elif SELECTED_RAG_PROVIDER:
raise ValueError(f"Unsupported RAG provider: {SELECTED_RAG_PROVIDER}")
return None

130
src/rag/ragflow.py Normal file
View File

@@ -0,0 +1,130 @@
import os
import requests
from src.rag.retriever import Chunk, Document, Resource, Retriever
from urllib.parse import urlparse
class RAGFlowProvider(Retriever):
"""
RAGFlowProvider is a provider that uses RAGFlow to retrieve documents.
"""
api_url: str
api_key: str
page_size: int = 10
def __init__(self):
api_url = os.getenv("RAGFLOW_API_URL")
if not api_url:
raise ValueError("RAGFLOW_API_URL is not set")
self.api_url = api_url
api_key = os.getenv("RAGFLOW_API_KEY")
if not api_key:
raise ValueError("RAGFLOW_API_KEY is not set")
self.api_key = api_key
page_size = os.getenv("RAGFLOW_PAGE_SIZE")
if page_size:
self.page_size = int(page_size)
def query_relevant_documents(
self, query: str, resources: list[Resource] = []
) -> list[Document]:
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
dataset_ids: list[str] = []
document_ids: list[str] = []
for resource in resources:
dataset_id, document_id = parse_uri(resource.uri)
dataset_ids.append(dataset_id)
if document_id:
document_ids.append(document_id)
payload = {
"question": query,
"dataset_ids": dataset_ids,
"document_ids": document_ids,
"page_size": self.page_size,
}
response = requests.post(
f"{self.api_url}/api/v1/retrieval", headers=headers, json=payload
)
if response.status_code != 200:
raise Exception(f"Failed to query documents: {response.text}")
result = response.json()
data = result.get("data", {})
doc_aggs = data.get("doc_aggs", [])
docs: dict[str, Document] = {
doc.get("doc_id"): Document(
id=doc.get("doc_id"),
title=doc.get("doc_name"),
chunks=[],
)
for doc in doc_aggs
}
for chunk in data.get("chunks", []):
doc = docs.get(chunk.get("document_id"))
if doc:
doc.chunks.append(
Chunk(
content=chunk.get("content"),
similarity=chunk.get("similarity"),
)
)
return list(docs.values())
def list_resources(self, query: str | None = None) -> list[Resource]:
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
params = {}
if query:
params["name"] = query
response = requests.get(
f"{self.api_url}/api/v1/datasets", headers=headers, params=params
)
if response.status_code != 200:
raise Exception(f"Failed to list resources: {response.text}")
result = response.json()
resources = []
for item in result.get("data", []):
item = Resource(
uri=f"rag://dataset/{item.get('id')}",
title=item.get("name", ""),
description=item.get("description", ""),
)
resources.append(item)
return resources
def parse_uri(uri: str) -> tuple[str, str]:
parsed = urlparse(uri)
if parsed.scheme != "rag":
raise ValueError(f"Invalid URI: {uri}")
return parsed.path.split("/")[1], parsed.fragment
if __name__ == "__main__":
uri = "rag://dataset/123#abc"
parsed = urlparse(uri)
print(parsed.scheme)
print(parsed.netloc)
print(parsed.path)
print(parsed.fragment)

77
src/rag/retriever.py Normal file
View File

@@ -0,0 +1,77 @@
import abc
from pydantic import BaseModel, Field
class Chunk:
content: str
similarity: float
def __init__(self, content: str, similarity: float):
self.content = content
self.similarity = similarity
class Document:
"""
Document is a class that represents a document.
"""
id: str
url: str | None = None
title: str | None = None
chunks: list[Chunk] = []
def __init__(
self,
id: str,
url: str | None = None,
title: str | None = None,
chunks: list[Chunk] = [],
):
self.id = id
self.url = url
self.title = title
self.chunks = chunks
def to_dict(self) -> dict:
d = {
"id": self.id,
"content": "\n\n".join([chunk.content for chunk in self.chunks]),
}
if self.url:
d["url"] = self.url
if self.title:
d["title"] = self.title
return d
class Resource(BaseModel):
"""
Resource is a class that represents a resource.
"""
uri: str = Field(..., description="The URI of the resource")
title: str = Field(..., description="The title of the resource")
description: str | None = Field("", description="The description of the resource")
class Retriever(abc.ABC):
"""
Define a RAG provider, which can be used to query documents and resources.
"""
@abc.abstractmethod
def list_resources(self, query: str | None = None) -> list[Resource]:
"""
List resources from the rag provider.
"""
pass
@abc.abstractmethod
def query_relevant_documents(
self, query: str, resources: list[Resource] = []
) -> list[Document]:
"""
Query relevant documents from the resources.
"""
pass