fix(client): Harden upload validation and conversion flow (#989)

* fix(client): Harden upload validation and conversion flow * test(client): cover event-loop upload conversion reuse * test(client): remove unused import in upload regression coverage * fix(client): load optional shared checkpointer fallback Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai> * docs(backend): document config preflight and IM channels Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai> --------- Co-authored-by: Willem Jiang <willem.jiang@gmail.com> Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-12 01:54:45 +08:00 · 2026-03-11 15:17:31 +08:00
parent a0c38a5cf3
commit 4bae3c724c
4 changed files with 174 additions and 49 deletions
--- a/backend/src/client.py
+++ b/backend/src/client.py
@@ -19,6 +19,7 @@ import asyncio
 import json
 import logging
 import mimetypes
+import os
 import re
 import shutil
 import tempfile
@@ -723,52 +724,79 @@ class DeerFlowClient:

        Raises:
            FileNotFoundError: If any file does not exist.
+            ValueError: If any supplied path exists but is not a regular file.
        """
        from src.gateway.routers.uploads import CONVERTIBLE_EXTENSIONS, convert_file_to_markdown

        # Validate all files upfront to avoid partial uploads.
        resolved_files = []
+        convertible_extensions = {ext.lower() for ext in CONVERTIBLE_EXTENSIONS}
+        has_convertible_file = False
        for f in files:
            p = Path(f)
            if not p.exists():
                raise FileNotFoundError(f"File not found: {f}")
+            if not p.is_file():
+                raise ValueError(f"Path is not a file: {f}")
            resolved_files.append(p)
+            if not has_convertible_file and p.suffix.lower() in convertible_extensions:
+                has_convertible_file = True

        uploads_dir = self._get_uploads_dir(thread_id)
        uploaded_files: list[dict] = []

-        for src_path in resolved_files:
-            dest = uploads_dir / src_path.name
-            shutil.copy2(src_path, dest)
+        conversion_pool = None
+        if has_convertible_file:
+            try:
+                asyncio.get_running_loop()
+            except RuntimeError:
+                conversion_pool = None
+            else:
+                import concurrent.futures

-            info: dict[str, Any] = {
-                "filename": src_path.name,
-                "size": str(dest.stat().st_size),
-                "path": str(dest),
-                "virtual_path": f"/mnt/user-data/uploads/{src_path.name}",
-                "artifact_url": f"/api/threads/{thread_id}/artifacts/mnt/user-data/uploads/{src_path.name}",
-            }
+                # Reuse one worker when already inside an event loop to avoid
+                # creating a new ThreadPoolExecutor per converted file.
+                conversion_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)

-            if src_path.suffix.lower() in CONVERTIBLE_EXTENSIONS:
-                try:
+        def _convert_in_thread(path: Path):
+            return asyncio.run(convert_file_to_markdown(path))
+
+        try:
+            for src_path in resolved_files:
+                dest = uploads_dir / src_path.name
+                shutil.copy2(src_path, dest)
+
+                info: dict[str, Any] = {
+                    "filename": src_path.name,
+                    "size": str(dest.stat().st_size),
+                    "path": str(dest),
+                    "virtual_path": f"/mnt/user-data/uploads/{src_path.name}",
+                    "artifact_url": f"/api/threads/{thread_id}/artifacts/mnt/user-data/uploads/{src_path.name}",
+                }
+
+                if src_path.suffix.lower() in convertible_extensions:
                    try:
-                        asyncio.get_running_loop()
-                        import concurrent.futures
+                        if conversion_pool is not None:
+                            md_path = conversion_pool.submit(_convert_in_thread, dest).result()
+                        else:
+                            md_path = asyncio.run(convert_file_to_markdown(dest))
+                    except Exception:
+                        logger.warning(
+                            "Failed to convert %s to markdown",
+                            src_path.name,
+                            exc_info=True,
+                        )
+                        md_path = None

-                        with concurrent.futures.ThreadPoolExecutor() as pool:
-                            md_path = pool.submit(lambda: asyncio.run(convert_file_to_markdown(dest))).result()
-                    except RuntimeError:
-                        md_path = asyncio.run(convert_file_to_markdown(dest))
-                except Exception:
-                    logger.warning("Failed to convert %s to markdown", src_path.name, exc_info=True)
-                    md_path = None
+                    if md_path is not None:
+                        info["markdown_file"] = md_path.name
+                        info["markdown_virtual_path"] = f"/mnt/user-data/uploads/{md_path.name}"
+                        info["markdown_artifact_url"] = f"/api/threads/{thread_id}/artifacts/mnt/user-data/uploads/{md_path.name}"

-                if md_path is not None:
-                    info["markdown_file"] = md_path.name
-                    info["markdown_virtual_path"] = f"/mnt/user-data/uploads/{md_path.name}"
-                    info["markdown_artifact_url"] = f"/api/threads/{thread_id}/artifacts/mnt/user-data/uploads/{md_path.name}"
-
-            uploaded_files.append(info)
+                uploaded_files.append(info)
+        finally:
+            if conversion_pool is not None:
+                conversion_pool.shutdown(wait=True)

        return {
            "success": True,
@@ -791,20 +819,23 @@ class DeerFlowClient:
            return {"files": [], "count": 0}

        files = []
-        for fp in sorted(uploads_dir.iterdir()):
-            if fp.is_file():
-                stat = fp.stat()
-                files.append(
-                    {
-                        "filename": fp.name,
-                        "size": str(stat.st_size),
-                        "path": str(fp),
-                        "virtual_path": f"/mnt/user-data/uploads/{fp.name}",
-                        "artifact_url": f"/api/threads/{thread_id}/artifacts/mnt/user-data/uploads/{fp.name}",
-                        "extension": fp.suffix,
-                        "modified": stat.st_mtime,
-                    }
-                )
+        with os.scandir(uploads_dir) as entries:
+            file_entries = [entry for entry in entries if entry.is_file()]
+
+        for entry in sorted(file_entries, key=lambda item: item.name):
+            stat = entry.stat()
+            filename = entry.name
+            files.append(
+                {
+                    "filename": filename,
+                    "size": str(stat.st_size),
+                    "path": str(Path(entry.path)),
+                    "virtual_path": f"/mnt/user-data/uploads/{filename}",
+                    "artifact_url": f"/api/threads/{thread_id}/artifacts/mnt/user-data/uploads/{filename}",
+                    "extension": Path(filename).suffix,
+                    "modified": stat.st_mtime,
+                }
+            )
        return {"files": files, "count": len(files)}

    def delete_upload(self, thread_id: str, filename: str) -> dict: