feat: support for moi in RAG module (#571)

* feat: add support for moi * small adjust * small adjust * according 2 comments * add more intro * add more intro
2026-04-22 05:34:45 +08:00 · 2025-09-16 20:25:59 +08:00
parent ea0fe62971
commit 5085bf8ee9
6 changed files with 176 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@@ -41,6 +41,12 @@ TAVILY_API_KEY=tvly-xxx
 # RAGFLOW_RETRIEVAL_SIZE=10
 # RAGFLOW_CROSS_LANGUAGES=English,Chinese,Spanish,French,German,Japanese,Korean # Optional. To use RAGFlow's cross-language search, please separate each language with a single comma
 # MOI is a hybrid database that mainly serves enterprise users (https://www.matrixorigin.io/matrixone-intelligence)
 # RAG_PROVIDER=moi
 # MOI_API_URL="https://freetier-01.cn-hangzhou.cluster.matrixonecloud.cn"
 # MOI_API_KEY="xxx-xxx-xxx-xxx"
 # MOI_RETRIEVAL_SIZE=10
 # MOI_LIST_LIMIT=10
 # RAG_PROVIDER: milvus  (using free milvus instance on zilliz cloud: https://docs.zilliz.com/docs/quick-start )
 # RAG_PROVIDER=milvus
--- a/README_zh.md
+++ b/README_zh.md
@@ -183,6 +183,16 @@ DeerFlow 支持基于私有域知识的检索，您可以将文档上传到多
   RAGFLOW_RETRIEVAL_SIZE=10
   ```
 - **[MOI]**：AI 原生多模态数据智能平台
   ```
   # 参照 .env.example 中的示例进行配置
   RAG_PROVIDER=moi
   MOI_API_URL="https://freetier-01.cn-hangzhou.cluster.matrixonecloud.cn"
   MOI_API_KEY="xxx-xxx-xxx-xxx"
   MOI_RETRIEVAL_SIZE=10
   MOI_LIST_LIMIT=10
   ```
 - **[VikingDB 知识库](https://www.volcengine.com/docs/84313/1254457)**：火山引擎提供的公有云知识库引擎
   > 注意先从 [火山引擎](https://www.volcengine.com/docs/84313/1254485) 获取账号 AK/SK
   ```
--- a/src/config/tools.py
+++ b/src/config/tools.py
@@ -24,6 +24,7 @@ SELECTED_SEARCH_ENGINE = os.getenv("SEARCH_API", SearchEngine.TAVILY.value)
 class RAGProvider(enum.Enum):
    """Identifiers of the supported RAG (retrieval) backends.

    Each member's string value is what the retriever builder compares the
    selected provider against in order to pick the provider class to
    instantiate (presumably the value of the ``RAG_PROVIDER`` setting shown
    in ``.env.example`` -- confirm against where the selection is read).
    """
    RAGFLOW = "ragflow"
    VIKINGDB_KNOWLEDGE_BASE = "vikingdb_knowledge_base"
    # MatrixOne Intelligence (MOI).
    MOI = "moi"
    MILVUS = "milvus"
--- a/src/rag/init.py
+++ b/src/rag/init.py
@@ -3,6 +3,7 @@
 from .builder import build_retriever
 from .ragflow import RAGFlowProvider
 from .moi import MOIProvider
 from .retriever import Chunk, Document, Resource, Retriever
 from .vikingdb_knowledge_base import VikingDBKnowledgeBaseProvider
@@ -11,6 +12,7 @@ __all__ = [
    Document,
    Resource,
    RAGFlowProvider,
    MOIProvider,
    VikingDBKnowledgeBaseProvider,
    Chunk,
    build_retriever,
--- a/src/rag/builder.py
+++ b/src/rag/builder.py
@@ -3,6 +3,7 @@
 from src.config.tools import SELECTED_RAG_PROVIDER, RAGProvider
 from src.rag.ragflow import RAGFlowProvider
 from src.rag.moi import MOIProvider
 from src.rag.retriever import Retriever
 from src.rag.vikingdb_knowledge_base import VikingDBKnowledgeBaseProvider
 from src.rag.milvus import MilvusProvider
@@ -11,6 +12,8 @@ from src.rag.milvus import MilvusProvider
 def build_retriever() -> Retriever | None:
    if SELECTED_RAG_PROVIDER == RAGProvider.RAGFLOW.value:
        return RAGFlowProvider()
    elif SELECTED_RAG_PROVIDER == RAGProvider.MOI.value:
        return MOIProvider()
    elif SELECTED_RAG_PROVIDER == RAGProvider.VIKINGDB_KNOWLEDGE_BASE.value:
        return VikingDBKnowledgeBaseProvider()
    elif SELECTED_RAG_PROVIDER == RAGProvider.MILVUS.value:
--- a/src/rag/moi.py
+++ b/src/rag/moi.py
@@ -0,0 +1,154 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 import os
 from urllib.parse import urlparse
 import requests
 from src.rag.retriever import Chunk, Document, Resource, Retriever
 class MOIProvider(Retriever):
    """
    Retriever backed by MatrixOne Intelligence (MOI).

    MatrixOne Intelligence (MOI) is a multimodal data AI processing platform.
    It supports connecting, processing, managing, and using both structured and unstructured data.
    Through steps such as parsing, extraction, segmentation, cleaning, and enhancement,
    it transforms raw data like documents, images, and audio/video into AI-ready application data.
    With its self-developed data service layer (the MatrixOne database),
    it can directly provide retrieval services for the processed data.

    The open-source repository is available at: https://github.com/matrixorigin/matrixone
    For more information, please visit the website: https://www.matrixorigin.io/matrixone-intelligence
    Documentation: https://docs.matrixorigin.cn/zh/m1intelligence/MatrixOne-Intelligence/Workspace-Mgmt/overview/
    Online Demo: https://www.matrixorigin.io/demo

    Configuration is read from environment variables:
        MOI_API_URL: base URL of the MOI service (required).
        MOI_API_KEY: API key sent with every request (required).
        MOI_RETRIEVAL_SIZE: ``page_size`` sent with retrieval requests
            (optional, defaults to 10).
        MOI_LIST_LIMIT: ``limit`` sent when listing datasets (optional).
    """

    # Seconds to wait for the MOI service. Without a timeout, ``requests``
    # blocks forever when the remote end stops responding.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """
        Load the MOI configuration from the environment.

        Raises:
            ValueError: if MOI_API_URL or MOI_API_KEY is unset or empty, or
                if MOI_RETRIEVAL_SIZE / MOI_LIST_LIMIT is not an integer.
        """
        api_url = os.getenv("MOI_API_URL")
        if not api_url:
            raise ValueError("MOI_API_URL is not set")
        # Drop trailing slashes before adding the suffix so that
        # "https://host/" does not become "https://host//byoa" and
        # "https://host/byoa/" is not suffixed a second time.
        api_url = api_url.rstrip("/")
        # Add /byoa suffix to the API URL for MOI compatibility
        if not api_url.endswith("/byoa"):
            api_url = api_url + "/byoa"
        self.api_url = api_url

        self.api_key = os.getenv("MOI_API_KEY")
        if not self.api_key:
            raise ValueError("MOI_API_KEY is not set")

        # Set page size for document retrieval
        self.page_size = 10
        moi_size = os.getenv("MOI_RETRIEVAL_SIZE")
        if moi_size:
            self.page_size = int(moi_size)

        # Set MOI-specific list limit parameter
        self.moi_list_limit = None
        moi_list_limit = os.getenv("MOI_LIST_LIMIT")
        if moi_list_limit:
            self.moi_list_limit = int(moi_list_limit)

    def _headers(self) -> dict[str, str]:
        """Return the HTTP headers shared by every MOI request."""
        # Both endpoints belong to the same API, so they must authenticate
        # the same way; building the headers in one place keeps them in sync.
        return {
            "moi-key": f"{self.api_key}",
            "Content-Type": "application/json",
        }

    def query_relevant_documents(
        self, query: str, resources: list[Resource] | None = None
    ) -> list[Document]:
        """
        Query relevant documents from MOI API using the provided resources.

        Args:
            query: the question to retrieve content for.
            resources: resources whose ``rag://`` URIs select the datasets
                (and optionally documents) to search. ``None`` is treated
                as an empty list.

        Returns:
            One Document per entry of the response's ``doc_aggs``, each
            holding the returned chunks that reference it.

        Raises:
            ValueError: if a resource URI is not a valid ``rag://`` URI.
            Exception: if the service answers with a non-200 status.
        """
        dataset_ids: list[str] = []
        document_ids: list[str] = []
        for resource in resources or []:
            dataset_id, document_id = self._parse_uri(resource.uri)
            dataset_ids.append(dataset_id)
            if document_id:
                document_ids.append(document_id)

        payload = {
            "question": query,
            "dataset_ids": dataset_ids,
            "document_ids": document_ids,
            "page_size": self.page_size,
        }
        response = requests.post(
            f"{self.api_url}/api/v1/retrieval",
            headers=self._headers(),
            json=payload,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to query documents: {response.text}")

        result = response.json()
        # "data" may be present but null; treat that like a missing key.
        data = result.get("data") or {}
        docs: dict[str, Document] = {
            doc.get("doc_id"): Document(
                id=doc.get("doc_id"),
                title=doc.get("doc_name"),
                chunks=[],
            )
            for doc in data.get("doc_aggs") or []
        }
        for chunk in data.get("chunks") or []:
            # Chunks whose document is not listed in doc_aggs are skipped.
            doc = docs.get(chunk.get("document_id"))
            if doc:
                doc.chunks.append(
                    Chunk(
                        content=chunk.get("content"),
                        similarity=chunk.get("similarity"),
                    )
                )
        return list(docs.values())

    def list_resources(self, query: str | None = None) -> list[Resource]:
        """
        List resources from MOI API with optional query filtering and limit support.

        Args:
            query: optional value sent as the ``name`` request parameter.

        Returns:
            One Resource per dataset in the response, addressed as
            ``rag://dataset/<id>``.

        Raises:
            Exception: if the service answers with a non-200 status.
        """
        params = {}
        if query:
            params["name"] = query
        if self.moi_list_limit:
            params["limit"] = self.moi_list_limit

        response = requests.get(
            f"{self.api_url}/api/v1/datasets",
            headers=self._headers(),
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            raise Exception(f"Failed to list resources: {response.text}")

        result = response.json()
        # "data" may be present but null; treat that like an empty listing.
        return [
            Resource(
                uri=f"rag://dataset/{item.get('id')}",
                title=item.get("name", ""),
                description=item.get("description", ""),
            )
            for item in result.get("data") or []
        ]

    def _parse_uri(self, uri: str) -> tuple[str, str]:
        """
        Parse URI to extract dataset ID and document ID.

        The expected form is ``rag://dataset/<dataset_id>[#<document_id>]``;
        the document ID is the URI fragment and is ``""`` when absent.

        Raises:
            ValueError: if the scheme is not ``rag`` or the path carries no
                dataset ID.
        """
        parsed = urlparse(uri)
        if parsed.scheme != "rag":
            raise ValueError(f"Invalid URI: {uri}")
        # The path looks like "/<dataset_id>", so the ID is the second piece.
        # A missing or empty piece is reported as an invalid URI instead of
        # surfacing as an IndexError or an empty dataset ID.
        segments = parsed.path.split("/")
        if len(segments) < 2 or not segments[1]:
            raise ValueError(f"Invalid URI: {uri}")
        return segments[1], parsed.fragment