# src/rag/retriever.py

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT

import abc

from pydantic import BaseModel, Field


class Chunk:
    """
    Chunk holds a text fragment (`content`) together with a `similarity` value.
    """

    content: str
    similarity: float

    def __init__(self, content: str, similarity: float):
        # Both values are stored exactly as given; nothing is validated or
        # converted here.
        self.content, self.similarity = content, similarity


class Document:
    """
    Document is a class that represents a document.

    A document carries an identifier, an optional URL and title, and the
    list of chunks attached to it.
    """

    id: str
    url: str | None = None
    title: str | None = None
    # Annotation only: a class-level ``[]`` would be one list object shared by
    # every instance that has not assigned its own ``chunks``.
    chunks: list[Chunk]

    def __init__(
        self,
        id: str,
        url: str | None = None,
        title: str | None = None,
        chunks: list[Chunk] | None = None,
    ):
        """
        Create a document.

        Args:
            id: Identifier of the document.
            url: Optional URL; left out of ``to_dict()`` when falsy.
            title: Optional title; left out of ``to_dict()`` when falsy.
            chunks: Chunks belonging to the document. A list passed in is
                stored by reference (not copied). When omitted or ``None``,
                a new empty list is created for this instance.
        """
        self.id = id
        self.url = url
        self.title = title
        # A ``[]`` default is evaluated once at function definition time, so
        # all documents built without ``chunks`` would share (and mutate) the
        # same list. Build a fresh list per instance instead.
        self.chunks = [] if chunks is None else chunks

    def to_dict(self) -> dict:
        """
        Return a plain-dict view of the document.

        Returns:
            A dict with ``"id"`` and ``"content"`` (the chunk contents joined
            by a blank line), plus ``"url"`` and ``"title"`` when they are
            truthy.
        """
        d = {
            "id": self.id,
            "content": "\n\n".join(chunk.content for chunk in self.chunks),
        }
        if self.url:
            d["url"] = self.url
        if self.title:
            d["title"] = self.title
        return d


class Resource(BaseModel):
    """
    Resource is a class that represents a resource.
    """

    # Required fields: ``...`` as the default marks a pydantic field as
    # mandatory.
    uri: str = Field(
        ...,
        description="The URI of the resource",
    )
    title: str = Field(
        ...,
        description="The title of the resource",
    )

    # Optional field: defaults to an empty string when not supplied; ``None``
    # is also accepted by the annotation.
    description: str | None = Field(
        default="",
        description="The description of the resource",
    )


class Retriever(abc.ABC):
    """
    Abstract interface of a RAG provider.

    A concrete provider implements both methods below: one to list
    resources and one to query documents. The base class itself cannot be
    instantiated.
    """

    @abc.abstractmethod
    def list_resources(self, query: str | None = None) -> list[Resource]:
        """
        Return resources from the RAG provider.

        Args:
            query: Optional query string; how it is interpreted is left to
                the implementation.
        """
        ...

    @abc.abstractmethod
    def query_relevant_documents(
        self, query: str, resources: list[Resource] = []
    ) -> list[Document]:
        """
        Return the documents relevant to ``query`` from ``resources``.

        Args:
            query: The query string.
            resources: Resources to query; defaults to an empty list.
                NOTE: the default is one shared list object, so
                implementations should treat it as read-only.
        """
        ...
feat: rag retrieving tool call result display (#263) * feat: local search tool call result display * chore: add file copyright * fix: miss edit plan interrupt feedback * feat: disable pasting html into input box 2025-05-29 19:52:34 +08:00			`# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates`
			`# SPDX-License-Identifier: MIT`

feat: RAG Integration (#238) * feat: add rag provider and retriever * feat: retriever tool * feat: add retriever tool to the researcher node * feat: add rag http apis * feat: new message input supports resource mentions * feat: new message input component support resource mentions * refactor: need_web_search to need_search * chore: RAG integration docs * chore: change example api host * fix: user message color in dark mode * fix: mentions style * feat: add local_search_tool to researcher prompt * chore: research prompt * fix: ragflow page size and reporter with * docs: ragflow integration and add acknowledgment projects * chore: format 2025-05-28 14:13:46 +08:00			`import abc`
feat: 1. replace black with ruff for fomatting and sort import (#489) 2. use tavily from`langchain-tavily` rather than the older one from `langchain-community` Co-authored-by: Willem Jiang <willem.jiang@gmail.com> 2025-08-17 22:57:23 +08:00
feat: RAG Integration (#238) * feat: add rag provider and retriever * feat: retriever tool * feat: add retriever tool to the researcher node * feat: add rag http apis * feat: new message input supports resource mentions * feat: new message input component support resource mentions * refactor: need_web_search to need_search * chore: RAG integration docs * chore: change example api host * fix: user message color in dark mode * fix: mentions style * feat: add local_search_tool to researcher prompt * chore: research prompt * fix: ragflow page size and reporter with * docs: ragflow integration and add acknowledgment projects * chore: format 2025-05-28 14:13:46 +08:00			`from pydantic import BaseModel, Field`


			`class Chunk:`
			`content: str`
			`similarity: float`

			`def __init__(self, content: str, similarity: float):`
			`self.content = content`
			`self.similarity = similarity`


			`class Document:`
			`"""`
			`Document is a class that represents a document.`
			`"""`

			`id: str`
			`url: str \| None = None`
			`title: str \| None = None`
			`chunks: list[Chunk] = []`

			`def __init__(`
			`self,`
			`id: str,`
			`url: str \| None = None,`
			`title: str \| None = None,`
			`chunks: list[Chunk] = [],`
			`):`
			`self.id = id`
			`self.url = url`
			`self.title = title`
			`self.chunks = chunks`

			`def to_dict(self) -> dict:`
			`d = {`
			`"id": self.id,`
			`"content": "\n\n".join([chunk.content for chunk in self.chunks]),`
			`}`
			`if self.url:`
			`d["url"] = self.url`
			`if self.title:`
			`d["title"] = self.title`
			`return d`


			`class Resource(BaseModel):`
			`"""`
			`Resource is a class that represents a resource.`
			`"""`

			`uri: str = Field(..., description="The URI of the resource")`
			`title: str = Field(..., description="The title of the resource")`
			`description: str \| None = Field("", description="The description of the resource")`


			`class Retriever(abc.ABC):`
			`"""`
			`Define a RAG provider, which can be used to query documents and resources.`
			`"""`

			`@abc.abstractmethod`
			`def list_resources(self, query: str \| None = None) -> list[Resource]:`
			`"""`
			`List resources from the rag provider.`
			`"""`
			`pass`

			`@abc.abstractmethod`
			`def query_relevant_documents(`
			`self, query: str, resources: list[Resource] = []`
			`) -> list[Document]:`
			`"""`
			`Query relevant documents from the resources.`
			`"""`
			`pass`