From 191b60a326f3031298d109dc4e8117ac40a00b23 Mon Sep 17 00:00:00 2001 From: -Astraia- <91442300+Doge2077@users.noreply.github.com> Date: Mon, 16 Mar 2026 16:53:12 +0800 Subject: [PATCH] fix: issue 1138 windows encoding (#1139) * fix(windows): use utf-8 for text file operations * fix(windows): normalize sandbox path masking * fix(windows): preserve utf-8 handling after backend split --- backend/app/gateway/routers/artifacts.py | 8 ++--- backend/app/gateway/routers/mcp.py | 2 +- backend/app/gateway/routers/skills.py | 2 +- .../aio_sandbox/aio_sandbox_provider.py | 2 +- .../deerflow/sandbox/local/local_sandbox.py | 4 +-- .../harness/deerflow/sandbox/tools.py | 12 ++++--- .../harness/deerflow/skills/validation.py | 2 +- backend/tests/test_artifacts_router.py | 27 +++++++++++++++ backend/tests/test_local_sandbox_encoding.py | 33 +++++++++++++++++++ backend/tests/test_sandbox_tools_security.py | 4 +-- backend/tests/test_skills_router.py | 28 ++++++++++++++++ .../image-generation/scripts/generate.py | 2 +- .../public/ppt-generation/scripts/generate.py | 2 +- .../scripts/aggregate_benchmark.py | 10 +++--- .../video-generation/scripts/generate.py | 2 +- 15 files changed, 116 insertions(+), 24 deletions(-) create mode 100644 backend/tests/test_artifacts_router.py create mode 100644 backend/tests/test_local_sandbox_encoding.py diff --git a/backend/app/gateway/routers/artifacts.py b/backend/app/gateway/routers/artifacts.py index b2312bc..b9e8afb 100644 --- a/backend/app/gateway/routers/artifacts.py +++ b/backend/app/gateway/routers/artifacts.py @@ -63,7 +63,7 @@ def _extract_file_from_skill_archive(zip_path: Path, internal_path: str) -> byte summary="Get Artifact File", description="Retrieve an artifact file generated by the AI agent. 
Supports text, HTML, and binary files.", ) -async def get_artifact(thread_id: str, path: str, request: Request) -> FileResponse: +async def get_artifact(thread_id: str, path: str, request: Request) -> Response: """Get an artifact file by its path. The endpoint automatically detects file types and returns appropriate content types. @@ -147,12 +147,12 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo return FileResponse(path=actual_path, filename=actual_path.name, media_type=mime_type, headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"}) if mime_type and mime_type == "text/html": - return HTMLResponse(content=actual_path.read_text()) + return HTMLResponse(content=actual_path.read_text(encoding="utf-8")) if mime_type and mime_type.startswith("text/"): - return PlainTextResponse(content=actual_path.read_text(), media_type=mime_type) + return PlainTextResponse(content=actual_path.read_text(encoding="utf-8"), media_type=mime_type) if is_text_file_by_content(actual_path): - return PlainTextResponse(content=actual_path.read_text(), media_type=mime_type) + return PlainTextResponse(content=actual_path.read_text(encoding="utf-8"), media_type=mime_type) return Response(content=actual_path.read_bytes(), media_type=mime_type, headers={"Content-Disposition": f"inline; filename*=UTF-8''{encoded_filename}"}) diff --git a/backend/app/gateway/routers/mcp.py b/backend/app/gateway/routers/mcp.py index 09133ea..386fc13 100644 --- a/backend/app/gateway/routers/mcp.py +++ b/backend/app/gateway/routers/mcp.py @@ -152,7 +152,7 @@ async def update_mcp_configuration(request: McpConfigUpdateRequest) -> McpConfig } # Write the configuration to file - with open(config_path, "w") as f: + with open(config_path, "w", encoding="utf-8") as f: json.dump(config_data, f, indent=2) logger.info(f"MCP configuration updated and saved to: {config_path}") diff --git a/backend/app/gateway/routers/skills.py b/backend/app/gateway/routers/skills.py 
index c208dba..8214fb7 100644 --- a/backend/app/gateway/routers/skills.py +++ b/backend/app/gateway/routers/skills.py @@ -307,7 +307,7 @@ async def update_skill(skill_name: str, request: SkillUpdateRequest) -> SkillRes } # Write the configuration to file - with open(config_path, "w") as f: + with open(config_path, "w", encoding="utf-8") as f: json.dump(config_data, f, indent=2) logger.info(f"Skills configuration updated and saved to: {config_path}") diff --git a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py index 0acb552..72cae86 100644 --- a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py +++ b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py @@ -401,7 +401,7 @@ class AioSandboxProvider(SandboxProvider): paths.ensure_thread_dirs(thread_id) lock_path = paths.thread_dir(thread_id) / f"{sandbox_id}.lock" - with open(lock_path, "a") as lock_file: + with open(lock_path, "a", encoding="utf-8") as lock_file: try: fcntl.flock(lock_file, fcntl.LOCK_EX) # Re-check in-process caches under the file lock in case another diff --git a/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py b/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py index 70655c8..6abe0a1 100644 --- a/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py +++ b/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py @@ -180,7 +180,7 @@ class LocalSandbox(Sandbox): def read_file(self, path: str) -> str: resolved_path = self._resolve_path(path) try: - with open(resolved_path) as f: + with open(resolved_path, encoding="utf-8") as f: return f.read() except OSError as e: # Re-raise with the original path for clearer error messages, hiding internal resolved paths @@ -193,7 +193,7 @@ class LocalSandbox(Sandbox): if dir_path: os.makedirs(dir_path, exist_ok=True) mode = "a" if append else "w" - 
with open(resolved_path, mode) as f: + with open(resolved_path, mode, encoding="utf-8") as f: f.write(content) except OSError as e: # Re-raise with the original path for clearer error messages, hiding internal resolved paths diff --git a/backend/packages/harness/deerflow/sandbox/tools.py b/backend/packages/harness/deerflow/sandbox/tools.py index ab1879c..0d34d86 100644 --- a/backend/packages/harness/deerflow/sandbox/tools.py +++ b/backend/packages/harness/deerflow/sandbox/tools.py @@ -25,6 +25,10 @@ _LOCAL_BASH_SYSTEM_PATH_PREFIXES = ( ) +def _path_variants(path: str) -> set[str]: + return {path, path.replace("\\", "/"), path.replace("/", "\\")} + + def replace_virtual_path(path: str, thread_data: ThreadDataState | None) -> str: """Replace virtual /mnt/user-data paths with actual thread data paths. @@ -101,15 +105,15 @@ def mask_local_paths_in_output(output: str, thread_data: ThreadDataState | None) for actual_base, virtual_base in sorted(mappings.items(), key=lambda item: len(item[0]), reverse=True): raw_base = str(Path(actual_base)) resolved_base = str(Path(actual_base).resolve()) - for base in {raw_base, resolved_base}: - escaped_actual = re.escape(base) - pattern = re.compile(escaped_actual + r"(?:/[^\s\"';&|<>()]*)?") + for base in _path_variants(raw_base) | _path_variants(resolved_base): + escaped_actual = re.escape(base).replace(r"\\", r"[/\\]") + pattern = re.compile(escaped_actual + r"(?:[/\\][^\s\"';&|<>()]*)?") def replace_match(match: re.Match) -> str: matched_path = match.group(0) if matched_path == base: return virtual_base - relative = matched_path[len(base) :].lstrip("/") + relative = matched_path[len(base) :].lstrip("/\\") return f"{virtual_base}/{relative}" if relative else virtual_base result = pattern.sub(replace_match, result) diff --git a/backend/packages/harness/deerflow/skills/validation.py b/backend/packages/harness/deerflow/skills/validation.py index 648f2f6..4c0f808 100644 --- a/backend/packages/harness/deerflow/skills/validation.py +++ 
b/backend/packages/harness/deerflow/skills/validation.py @@ -25,7 +25,7 @@ def _validate_skill_frontmatter(skill_dir: Path) -> tuple[bool, str, str | None] if not skill_md.exists(): return False, "SKILL.md not found", None - content = skill_md.read_text() + content = skill_md.read_text(encoding="utf-8") if not content.startswith("---"): return False, "No YAML frontmatter found", None diff --git a/backend/tests/test_artifacts_router.py b/backend/tests/test_artifacts_router.py new file mode 100644 index 0000000..9a4e5dd --- /dev/null +++ b/backend/tests/test_artifacts_router.py @@ -0,0 +1,27 @@ +import asyncio +from pathlib import Path + +from starlette.requests import Request + +import app.gateway.routers.artifacts as artifacts_router + + +def test_get_artifact_reads_utf8_text_file_on_windows_locale(tmp_path, monkeypatch) -> None: + artifact_path = tmp_path / "note.txt" + text = "Curly quotes: \u201cutf8\u201d" + artifact_path.write_text(text, encoding="utf-8") + + original_read_text = Path.read_text + + def read_text_with_gbk_default(self, *args, **kwargs): + kwargs.setdefault("encoding", "gbk") + return original_read_text(self, *args, **kwargs) + + monkeypatch.setattr(Path, "read_text", read_text_with_gbk_default) + monkeypatch.setattr(artifacts_router, "resolve_thread_virtual_path", lambda _thread_id, _path: artifact_path) + + request = Request({"type": "http", "method": "GET", "path": "/", "headers": [], "query_string": b""}) + response = asyncio.run(artifacts_router.get_artifact("thread-1", "mnt/user-data/outputs/note.txt", request)) + + assert bytes(response.body).decode("utf-8") == text + assert response.media_type == "text/plain" diff --git a/backend/tests/test_local_sandbox_encoding.py b/backend/tests/test_local_sandbox_encoding.py new file mode 100644 index 0000000..6040e73 --- /dev/null +++ b/backend/tests/test_local_sandbox_encoding.py @@ -0,0 +1,33 @@ +import builtins + +import deerflow.sandbox.local.local_sandbox as local_sandbox +from 
deerflow.sandbox.local.local_sandbox import LocalSandbox + + +def _open(base, file, mode="r", *args, **kwargs): + if "b" in mode: + return base(file, mode, *args, **kwargs) + return base(file, mode, *args, encoding=kwargs.pop("encoding", "gbk"), **kwargs) + + +def test_read_file_uses_utf8_on_windows_locale(tmp_path, monkeypatch): + path = tmp_path / "utf8.txt" + text = "\u201cutf8\u201d" + path.write_text(text, encoding="utf-8") + base = builtins.open + + monkeypatch.setattr(local_sandbox, "open", lambda file, mode="r", *args, **kwargs: _open(base, file, mode, *args, **kwargs), raising=False) + + assert LocalSandbox("t").read_file(str(path)) == text + + +def test_write_file_uses_utf8_on_windows_locale(tmp_path, monkeypatch): + path = tmp_path / "utf8.txt" + text = "emoji \U0001F600" + base = builtins.open + + monkeypatch.setattr(local_sandbox, "open", lambda file, mode="r", *args, **kwargs: _open(base, file, mode, *args, **kwargs), raising=False) + + LocalSandbox("t").write_file(str(path), text) + + assert path.read_text(encoding="utf-8") == text diff --git a/backend/tests/test_sandbox_tools_security.py b/backend/tests/test_sandbox_tools_security.py index b50e563..ea079e9 100644 --- a/backend/tests/test_sandbox_tools_security.py +++ b/backend/tests/test_sandbox_tools_security.py @@ -18,8 +18,8 @@ def test_replace_virtual_path_maps_virtual_root_and_subpaths() -> None: "outputs_path": "/tmp/deer-flow/threads/t1/user-data/outputs", } - assert replace_virtual_path("/mnt/user-data/workspace/a.txt", thread_data) == "/tmp/deer-flow/threads/t1/user-data/workspace/a.txt" - assert replace_virtual_path("/mnt/user-data", thread_data) == "/tmp/deer-flow/threads/t1/user-data" + assert Path(replace_virtual_path("/mnt/user-data/workspace/a.txt", thread_data)).as_posix() == "/tmp/deer-flow/threads/t1/user-data/workspace/a.txt" + assert Path(replace_virtual_path("/mnt/user-data", thread_data)).as_posix() == "/tmp/deer-flow/threads/t1/user-data" def 
test_mask_local_paths_in_output_hides_host_paths() -> None: diff --git a/backend/tests/test_skills_router.py b/backend/tests/test_skills_router.py index e4cf993..470ad69 100644 --- a/backend/tests/test_skills_router.py +++ b/backend/tests/test_skills_router.py @@ -58,3 +58,31 @@ unsupported: true assert valid is False assert "unsupported" in message assert skill_name is None + + +def test_validate_skill_frontmatter_reads_utf8_on_windows_locale(tmp_path, monkeypatch) -> None: + skill_dir = tmp_path / "demo-skill" + _write_skill( + skill_dir, + """--- +name: demo-skill +description: "Curly quotes: \u201cutf8\u201d" +--- + +# Demo Skill +""", + ) + + original_read_text = Path.read_text + + def read_text_with_gbk_default(self, *args, **kwargs): + kwargs.setdefault("encoding", "gbk") + return original_read_text(self, *args, **kwargs) + + monkeypatch.setattr(Path, "read_text", read_text_with_gbk_default) + + valid, message, skill_name = _validate_skill_frontmatter(skill_dir) + + assert valid is True + assert message == "Skill is valid!" 
+ assert skill_name == "demo-skill" diff --git a/skills/public/image-generation/scripts/generate.py b/skills/public/image-generation/scripts/generate.py index 9665faf..7670176 100644 --- a/skills/public/image-generation/scripts/generate.py +++ b/skills/public/image-generation/scripts/generate.py @@ -33,7 +33,7 @@ def generate_image( output_file: str, aspect_ratio: str = "16:9", ) -> str: - with open(prompt_file, "r") as f: + with open(prompt_file, "r", encoding="utf-8") as f: prompt = f.read() parts = [] i = 0 diff --git a/skills/public/ppt-generation/scripts/generate.py b/skills/public/ppt-generation/scripts/generate.py index dc277d1..a80676d 100644 --- a/skills/public/ppt-generation/scripts/generate.py +++ b/skills/public/ppt-generation/scripts/generate.py @@ -24,7 +24,7 @@ def generate_ppt( Status message """ # Load presentation plan - with open(plan_file, "r") as f: + with open(plan_file, "r", encoding="utf-8") as f: plan = json.load(f) # Determine slide dimensions based on aspect ratio diff --git a/skills/public/skill-creator/scripts/aggregate_benchmark.py b/skills/public/skill-creator/scripts/aggregate_benchmark.py index 3e66e8c..fa44b38 100755 --- a/skills/public/skill-creator/scripts/aggregate_benchmark.py +++ b/skills/public/skill-creator/scripts/aggregate_benchmark.py @@ -87,7 +87,7 @@ def load_run_results(benchmark_dir: Path) -> dict: metadata_path = eval_dir / "eval_metadata.json" if metadata_path.exists(): try: - with open(metadata_path) as mf: + with open(metadata_path, encoding="utf-8") as mf: eval_id = json.load(mf).get("eval_id", eval_idx) except (json.JSONDecodeError, OSError): eval_id = eval_idx @@ -117,7 +117,7 @@ def load_run_results(benchmark_dir: Path) -> dict: continue try: - with open(grading_file) as f: + with open(grading_file, encoding="utf-8") as f: grading = json.load(f) except json.JSONDecodeError as e: print(f"Warning: Invalid JSON in {grading_file}: {e}") @@ -139,7 +139,7 @@ def load_run_results(benchmark_dir: Path) -> dict: 
timing_file = run_dir / "timing.json" if result["time_seconds"] == 0.0 and timing_file.exists(): try: - with open(timing_file) as tf: + with open(timing_file, encoding="utf-8") as tf: timing_data = json.load(tf) result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) result["tokens"] = timing_data.get("total_tokens", 0) @@ -374,13 +374,13 @@ def main(): output_md = output_json.with_suffix(".md") # Write benchmark.json - with open(output_json, "w") as f: + with open(output_json, "w", encoding="utf-8") as f: json.dump(benchmark, f, indent=2) print(f"Generated: {output_json}") # Write benchmark.md markdown = generate_markdown(benchmark) - with open(output_md, "w") as f: + with open(output_md, "w", encoding="utf-8") as f: f.write(markdown) print(f"Generated: {output_md}") diff --git a/skills/public/video-generation/scripts/generate.py b/skills/public/video-generation/scripts/generate.py index e01ebb3..6f28f57 100644 --- a/skills/public/video-generation/scripts/generate.py +++ b/skills/public/video-generation/scripts/generate.py @@ -11,7 +11,7 @@ def generate_video( output_file: str, aspect_ratio: str = "16:9", ) -> str: - with open(prompt_file, "r") as f: + with open(prompt_file, "r", encoding="utf-8") as f: prompt = f.read() referenceImages = [] i = 0