mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-08 16:24:45 +08:00
* feat: add resource upload support for RAG - Backend: Added ingest_file method to Retriever and MilvusRetriever - Backend: Added /api/rag/upload endpoint - Frontend: Added RAGTab in settings for uploading resources - Frontend: Updated translations and settings registration * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestions from code review * Apply suggestions from code review of src/rag/milvus.py --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
131 lines
4.0 KiB
Python
131 lines
4.0 KiB
Python
# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
|
# SPDX-License-Identifier: MIT
|
|
|
|
import abc
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
class Chunk:
    """A text fragment together with a similarity score."""

    content: str  # text of the fragment
    similarity: float  # score supplied by the caller; scale is not defined here

    def __init__(self, content: str, similarity: float):
        """Store the given text and similarity score unchanged."""
        self.content, self.similarity = content, similarity
|
|
|
|
|
|
class Document:
    """
    Document is a class that represents a document.

    A document has an identifier, an optional URL and title, and a list of
    chunks. The ``content`` attributes of those chunks make up the document
    text returned by :meth:`to_dict`.
    """

    id: str
    url: str | None = None
    title: str | None = None
    # Annotation only: a class-level list would be shared by every instance.
    # Each instance receives its own list in ``__init__``.
    chunks: list[Chunk]

    def __init__(
        self,
        id: str,
        url: str | None = None,
        title: str | None = None,
        chunks: list[Chunk] | None = None,
    ):
        """
        Create a document.

        Parameters
        ----------
        id:
            Identifier of the document.
        url:
            Optional URL of the document.
        title:
            Optional title of the document.
        chunks:
            Chunks belonging to the document. When omitted (or ``None``) the
            document starts with a new, empty list of its own.
        """
        self.id = id
        self.url = url
        self.title = title
        # A ``[]`` default would be created once and shared across all
        # documents constructed without ``chunks``; build a fresh list instead.
        self.chunks = [] if chunks is None else chunks

    def to_dict(self) -> dict:
        """
        Return a dictionary representation of the document.

        Returns
        -------
        dict
            Always contains ``"id"`` and ``"content"`` (the chunk contents
            joined by a blank line). ``"url"`` and ``"title"`` are included
            only when they are truthy.
        """
        d = {
            "id": self.id,
            "content": "\n\n".join(chunk.content for chunk in self.chunks),
        }
        if self.url:
            d["url"] = self.url
        if self.title:
            d["title"] = self.title
        return d
|
|
|
|
|
|
class Resource(BaseModel):
    """
    Pydantic model that represents a resource.
    """

    # Required field: the URI of the resource.
    uri: str = Field(..., description="The URI of the resource")
    # Required field: the title of the resource.
    title: str = Field(..., description="The title of the resource")
    # Optional field: defaults to an empty string and may also be None.
    description: str | None = Field(
        default="", description="The description of the resource"
    )
|
|
|
|
|
|
class Retriever(abc.ABC):
    """
    Abstract interface of a RAG provider.

    Concrete providers must implement :meth:`list_resources` and
    :meth:`query_relevant_documents`; :meth:`ingest_file` is optional and
    raises :class:`NotImplementedError` unless overridden.
    """

    @abc.abstractmethod
    def list_resources(self, query: str | None = None) -> list[Resource]:
        """
        Return the resources known to the RAG provider.

        Parameters
        ----------
        query:
            Optional query string; how it is used is up to the concrete
            implementation.
        """

    @abc.abstractmethod
    def query_relevant_documents(
        self, query: str, resources: list[Resource] = []
    ) -> list[Document]:
        """
        Return the documents from ``resources`` that are relevant to ``query``.

        Parameters
        ----------
        query:
            The query string.
        resources:
            Resources to search. Defaults to an empty list; implementations
            should not mutate this argument.
        """

    def ingest_file(self, file_content: bytes, filename: str, **kwargs) -> Resource:
        """
        Add a file to the RAG provider and describe it as a :class:`Resource`.

        The base class offers no ingestion support: calling this method on a
        retriever that does not override it always raises
        :class:`NotImplementedError`.

        Parameters
        ----------
        file_content:
            The file's raw bytes. Text formats are typically treated as UTF-8
            by implementations unless they document otherwise; binary formats
            (PDF, images, office documents, ...) are passed through as their
            original bytes.
        filename:
            Original file name with its extension (e.g. ``"report.pdf"``).
            Implementations may use it to infer the file or MIME type, or as
            the title of the resulting resource.
        **kwargs:
            Implementation-specific options, for example:

            - an explicit MIME type or file type hint,
            - extra metadata to attach to the resource,
            - chunking, indexing, or preprocessing parameters.

            A concrete implementation may raise an exception for keyword
            arguments it does not support or considers invalid.

        Returns
        -------
        Resource
            Description of the ingested file, including its URI and title.
            The URI scheme and the storage mechanism are defined by the
            implementation.

        Raises
        ------
        NotImplementedError
            Always, in this base implementation. Override the method in a
            concrete retriever to provide ingestion.
        ValueError
            May be raised by implementations when the bytes, the filename, or
            the supplied options are invalid.
        RuntimeError
            May be raised by implementations on unexpected ingestion or
            storage failures (e.g. backend service errors).
        """
        raise NotImplementedError("ingest_file is not implemented")
|