From 191b60a326f3031298d109dc4e8117ac40a00b23 Mon Sep 17 00:00:00 2001 From: -Astraia- <91442300+Doge2077@users.noreply.github.com> Date: Mon, 16 Mar 2026 16:53:12 +0800 Subject: [PATCH] fix: issue 1138 windows encoding (#1139) * fix(windows): use utf-8 for text file operations * fix(windows): normalize sandbox path masking * fix(windows): preserve utf-8 handling after backend split --- backend/app/gateway/routers/artifacts.py | 8 ++--- backend/app/gateway/routers/mcp.py | 2 +- backend/app/gateway/routers/skills.py | 2 +- .../aio_sandbox/aio_sandbox_provider.py | 2 +- .../deerflow/sandbox/local/local_sandbox.py | 4 +-- .../harness/deerflow/sandbox/tools.py | 12 ++++--- .../harness/deerflow/skills/validation.py | 2 +- backend/tests/test_artifacts_router.py | 27 +++++++++++++++ backend/tests/test_local_sandbox_encoding.py | 33 +++++++++++++++++++ backend/tests/test_sandbox_tools_security.py | 4 +-- backend/tests/test_skills_router.py | 28 ++++++++++++++++ .../image-generation/scripts/generate.py | 2 +- .../public/ppt-generation/scripts/generate.py | 2 +- .../scripts/aggregate_benchmark.py | 10 +++--- .../video-generation/scripts/generate.py | 2 +- 15 files changed, 116 insertions(+), 24 deletions(-) create mode 100644 backend/tests/test_artifacts_router.py create mode 100644 backend/tests/test_local_sandbox_encoding.py diff --git a/backend/app/gateway/routers/artifacts.py b/backend/app/gateway/routers/artifacts.py index b2312bc..b9e8afb 100644 --- a/backend/app/gateway/routers/artifacts.py +++ b/backend/app/gateway/routers/artifacts.py @@ -63,7 +63,7 @@ def _extract_file_from_skill_archive(zip_path: Path, internal_path: str) -> byte summary="Get Artifact File", description="Retrieve an artifact file generated by the AI agent. 
Supports text, HTML, and binary files.", ) -async def get_artifact(thread_id: str, path: str, request: Request) -> FileResponse: +async def get_artifact(thread_id: str, path: str, request: Request) -> Response: """Get an artifact file by its path. The endpoint automatically detects file types and returns appropriate content types. @@ -147,12 +147,12 @@ async def get_artifact(thread_id: str, path: str, request: Request) -> FileRespo return FileResponse(path=actual_path, filename=actual_path.name, media_type=mime_type, headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"}) if mime_type and mime_type == "text/html": - return HTMLResponse(content=actual_path.read_text()) + return HTMLResponse(content=actual_path.read_text(encoding="utf-8")) if mime_type and mime_type.startswith("text/"): - return PlainTextResponse(content=actual_path.read_text(), media_type=mime_type) + return PlainTextResponse(content=actual_path.read_text(encoding="utf-8"), media_type=mime_type) if is_text_file_by_content(actual_path): - return PlainTextResponse(content=actual_path.read_text(), media_type=mime_type) + return PlainTextResponse(content=actual_path.read_text(encoding="utf-8"), media_type=mime_type) return Response(content=actual_path.read_bytes(), media_type=mime_type, headers={"Content-Disposition": f"inline; filename*=UTF-8''{encoded_filename}"}) diff --git a/backend/app/gateway/routers/mcp.py b/backend/app/gateway/routers/mcp.py index 09133ea..386fc13 100644 --- a/backend/app/gateway/routers/mcp.py +++ b/backend/app/gateway/routers/mcp.py @@ -152,7 +152,7 @@ async def update_mcp_configuration(request: McpConfigUpdateRequest) -> McpConfig } # Write the configuration to file - with open(config_path, "w") as f: + with open(config_path, "w", encoding="utf-8") as f: json.dump(config_data, f, indent=2) logger.info(f"MCP configuration updated and saved to: {config_path}") diff --git a/backend/app/gateway/routers/skills.py b/backend/app/gateway/routers/skills.py 
index c208dba..8214fb7 100644 --- a/backend/app/gateway/routers/skills.py +++ b/backend/app/gateway/routers/skills.py @@ -307,7 +307,7 @@ async def update_skill(skill_name: str, request: SkillUpdateRequest) -> SkillRes } # Write the configuration to file - with open(config_path, "w") as f: + with open(config_path, "w", encoding="utf-8") as f: json.dump(config_data, f, indent=2) logger.info(f"Skills configuration updated and saved to: {config_path}") diff --git a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py index 0acb552..72cae86 100644 --- a/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py +++ b/backend/packages/harness/deerflow/community/aio_sandbox/aio_sandbox_provider.py @@ -401,7 +401,7 @@ class AioSandboxProvider(SandboxProvider): paths.ensure_thread_dirs(thread_id) lock_path = paths.thread_dir(thread_id) / f"{sandbox_id}.lock" - with open(lock_path, "a") as lock_file: + with open(lock_path, "a", encoding="utf-8") as lock_file: try: fcntl.flock(lock_file, fcntl.LOCK_EX) # Re-check in-process caches under the file lock in case another diff --git a/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py b/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py index 70655c8..6abe0a1 100644 --- a/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py +++ b/backend/packages/harness/deerflow/sandbox/local/local_sandbox.py @@ -180,7 +180,7 @@ class LocalSandbox(Sandbox): def read_file(self, path: str) -> str: resolved_path = self._resolve_path(path) try: - with open(resolved_path) as f: + with open(resolved_path, encoding="utf-8") as f: return f.read() except OSError as e: # Re-raise with the original path for clearer error messages, hiding internal resolved paths @@ -193,7 +193,7 @@ class LocalSandbox(Sandbox): if dir_path: os.makedirs(dir_path, exist_ok=True) mode = "a" if append else "w" - 
with open(resolved_path, mode) as f: + with open(resolved_path, mode, encoding="utf-8") as f: f.write(content) except OSError as e: # Re-raise with the original path for clearer error messages, hiding internal resolved paths diff --git a/backend/packages/harness/deerflow/sandbox/tools.py b/backend/packages/harness/deerflow/sandbox/tools.py index ab1879c..0d34d86 100644 --- a/backend/packages/harness/deerflow/sandbox/tools.py +++ b/backend/packages/harness/deerflow/sandbox/tools.py @@ -25,6 +25,10 @@ _LOCAL_BASH_SYSTEM_PATH_PREFIXES = ( ) +def _path_variants(path: str) -> set[str]: + return {path, path.replace("\\", "/"), path.replace("/", "\\")} + + def replace_virtual_path(path: str, thread_data: ThreadDataState | None) -> str: """Replace virtual /mnt/user-data paths with actual thread data paths. @@ -101,15 +105,15 @@ def mask_local_paths_in_output(output: str, thread_data: ThreadDataState | None) for actual_base, virtual_base in sorted(mappings.items(), key=lambda item: len(item[0]), reverse=True): raw_base = str(Path(actual_base)) resolved_base = str(Path(actual_base).resolve()) - for base in {raw_base, resolved_base}: - escaped_actual = re.escape(base) - pattern = re.compile(escaped_actual + r"(?:/[^\s\"';&|<>()]*)?") + for base in _path_variants(raw_base) | _path_variants(resolved_base): + escaped_actual = re.escape(base).replace(r"\\", r"[/\\]") + pattern = re.compile(escaped_actual + r"(?:[/\\][^\s\"';&|<>()]*)?") def replace_match(match: re.Match) -> str: matched_path = match.group(0) if matched_path == base: return virtual_base - relative = matched_path[len(base) :].lstrip("/") + relative = matched_path[len(base) :].lstrip("/\\") return f"{virtual_base}/{relative}" if relative else virtual_base result = pattern.sub(replace_match, result) diff --git a/backend/packages/harness/deerflow/skills/validation.py b/backend/packages/harness/deerflow/skills/validation.py index 648f2f6..4c0f808 100644 --- a/backend/packages/harness/deerflow/skills/validation.py +++ 
b/backend/packages/harness/deerflow/skills/validation.py @@ -25,7 +25,7 @@ def _validate_skill_frontmatter(skill_dir: Path) -> tuple[bool, str, str | None] if not skill_md.exists(): return False, "SKILL.md not found", None - content = skill_md.read_text() + content = skill_md.read_text(encoding="utf-8") if not content.startswith("---"): return False, "No YAML frontmatter found", None diff --git a/backend/tests/test_artifacts_router.py b/backend/tests/test_artifacts_router.py new file mode 100644 index 0000000..9a4e5dd --- /dev/null +++ b/backend/tests/test_artifacts_router.py @@ -0,0 +1,27 @@ +import asyncio +from pathlib import Path + +from starlette.requests import Request + +import app.gateway.routers.artifacts as artifacts_router + + +def test_get_artifact_reads_utf8_text_file_on_windows_locale(tmp_path, monkeypatch) -> None: + artifact_path = tmp_path / "note.txt" + text = "Curly quotes: \u201cutf8\u201d" + artifact_path.write_text(text, encoding="utf-8") + + original_read_text = Path.read_text + + def read_text_with_gbk_default(self, *args, **kwargs): + kwargs.setdefault("encoding", "gbk") + return original_read_text(self, *args, **kwargs) + + monkeypatch.setattr(Path, "read_text", read_text_with_gbk_default) + monkeypatch.setattr(artifacts_router, "resolve_thread_virtual_path", lambda _thread_id, _path: artifact_path) + + request = Request({"type": "http", "method": "GET", "path": "/", "headers": [], "query_string": b""}) + response = asyncio.run(artifacts_router.get_artifact("thread-1", "mnt/user-data/outputs/note.txt", request)) + + assert bytes(response.body).decode("utf-8") == text + assert response.media_type == "text/plain" diff --git a/backend/tests/test_local_sandbox_encoding.py b/backend/tests/test_local_sandbox_encoding.py new file mode 100644 index 0000000..6040e73 --- /dev/null +++ b/backend/tests/test_local_sandbox_encoding.py @@ -0,0 +1,33 @@ +import builtins + +import deerflow.sandbox.local.local_sandbox as local_sandbox +from 
deerflow.sandbox.local.local_sandbox import LocalSandbox + + +def _open(base, file, mode="r", *args, **kwargs): + if "b" in mode: + return base(file, mode, *args, **kwargs) + return base(file, mode, *args, encoding=kwargs.pop("encoding", "gbk"), **kwargs) + + +def test_read_file_uses_utf8_on_windows_locale(tmp_path, monkeypatch): + path = tmp_path / "utf8.txt" + text = "\u201cutf8\u201d" + path.write_text(text, encoding="utf-8") + base = builtins.open + + monkeypatch.setattr(local_sandbox, "open", lambda file, mode="r", *args, **kwargs: _open(base, file, mode, *args, **kwargs), raising=False) + + assert LocalSandbox("t").read_file(str(path)) == text + + +def test_write_file_uses_utf8_on_windows_locale(tmp_path, monkeypatch): + path = tmp_path / "utf8.txt" + text = "emoji \U0001F600" + base = builtins.open + + monkeypatch.setattr(local_sandbox, "open", lambda file, mode="r", *args, **kwargs: _open(base, file, mode, *args, **kwargs), raising=False) + + LocalSandbox("t").write_file(str(path), text) + + assert path.read_text(encoding="utf-8") == text diff --git a/backend/tests/test_sandbox_tools_security.py b/backend/tests/test_sandbox_tools_security.py index b50e563..ea079e9 100644 --- a/backend/tests/test_sandbox_tools_security.py +++ b/backend/tests/test_sandbox_tools_security.py @@ -18,8 +18,8 @@ def test_replace_virtual_path_maps_virtual_root_and_subpaths() -> None: "outputs_path": "/tmp/deer-flow/threads/t1/user-data/outputs", } - assert replace_virtual_path("/mnt/user-data/workspace/a.txt", thread_data) == "/tmp/deer-flow/threads/t1/user-data/workspace/a.txt" - assert replace_virtual_path("/mnt/user-data", thread_data) == "/tmp/deer-flow/threads/t1/user-data" + assert Path(replace_virtual_path("/mnt/user-data/workspace/a.txt", thread_data)).as_posix() == "/tmp/deer-flow/threads/t1/user-data/workspace/a.txt" + assert Path(replace_virtual_path("/mnt/user-data", thread_data)).as_posix() == "/tmp/deer-flow/threads/t1/user-data" def 
test_mask_local_paths_in_output_hides_host_paths() -> None: diff --git a/backend/tests/test_skills_router.py b/backend/tests/test_skills_router.py index e4cf993..470ad69 100644 --- a/backend/tests/test_skills_router.py +++ b/backend/tests/test_skills_router.py @@ -58,3 +58,31 @@ unsupported: true assert valid is False assert "unsupported" in message assert skill_name is None + + +def test_validate_skill_frontmatter_reads_utf8_on_windows_locale(tmp_path, monkeypatch) -> None: + skill_dir = tmp_path / "demo-skill" + _write_skill( + skill_dir, + """--- +name: demo-skill +description: "Curly quotes: \u201cutf8\u201d" +--- + +# Demo Skill +""", + ) + + original_read_text = Path.read_text + + def read_text_with_gbk_default(self, *args, **kwargs): + kwargs.setdefault("encoding", "gbk") + return original_read_text(self, *args, **kwargs) + + monkeypatch.setattr(Path, "read_text", read_text_with_gbk_default) + + valid, message, skill_name = _validate_skill_frontmatter(skill_dir) + + assert valid is True + assert message == "Skill is valid!" 
+ assert skill_name == "demo-skill" diff --git a/skills/public/image-generation/scripts/generate.py b/skills/public/image-generation/scripts/generate.py index 9665faf..7670176 100644 --- a/skills/public/image-generation/scripts/generate.py +++ b/skills/public/image-generation/scripts/generate.py @@ -33,7 +33,7 @@ def generate_image( output_file: str, aspect_ratio: str = "16:9", ) -> str: - with open(prompt_file, "r") as f: + with open(prompt_file, "r", encoding="utf-8") as f: prompt = f.read() parts = [] i = 0 diff --git a/skills/public/ppt-generation/scripts/generate.py b/skills/public/ppt-generation/scripts/generate.py index dc277d1..a80676d 100644 --- a/skills/public/ppt-generation/scripts/generate.py +++ b/skills/public/ppt-generation/scripts/generate.py @@ -24,7 +24,7 @@ def generate_ppt( Status message """ # Load presentation plan - with open(plan_file, "r") as f: + with open(plan_file, "r", encoding="utf-8") as f: plan = json.load(f) # Determine slide dimensions based on aspect ratio diff --git a/skills/public/skill-creator/scripts/aggregate_benchmark.py b/skills/public/skill-creator/scripts/aggregate_benchmark.py index 3e66e8c..fa44b38 100755 --- a/skills/public/skill-creator/scripts/aggregate_benchmark.py +++ b/skills/public/skill-creator/scripts/aggregate_benchmark.py @@ -87,7 +87,7 @@ def load_run_results(benchmark_dir: Path) -> dict: metadata_path = eval_dir / "eval_metadata.json" if metadata_path.exists(): try: - with open(metadata_path) as mf: + with open(metadata_path, encoding="utf-8") as mf: eval_id = json.load(mf).get("eval_id", eval_idx) except (json.JSONDecodeError, OSError): eval_id = eval_idx @@ -117,7 +117,7 @@ def load_run_results(benchmark_dir: Path) -> dict: continue try: - with open(grading_file) as f: + with open(grading_file, encoding="utf-8") as f: grading = json.load(f) except json.JSONDecodeError as e: print(f"Warning: Invalid JSON in {grading_file}: {e}") @@ -139,7 +139,7 @@ def load_run_results(benchmark_dir: Path) -> dict: 
timing_file = run_dir / "timing.json" if result["time_seconds"] == 0.0 and timing_file.exists(): try: - with open(timing_file) as tf: + with open(timing_file, encoding="utf-8") as tf: timing_data = json.load(tf) result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0) result["tokens"] = timing_data.get("total_tokens", 0) @@ -374,13 +374,13 @@ def main(): output_md = output_json.with_suffix(".md") # Write benchmark.json - with open(output_json, "w") as f: + with open(output_json, "w", encoding="utf-8") as f: json.dump(benchmark, f, indent=2) print(f"Generated: {output_json}") # Write benchmark.md markdown = generate_markdown(benchmark) - with open(output_md, "w") as f: + with open(output_md, "w", encoding="utf-8") as f: f.write(markdown) print(f"Generated: {output_md}") diff --git a/skills/public/video-generation/scripts/generate.py b/skills/public/video-generation/scripts/generate.py index e01ebb3..6f28f57 100644 --- a/skills/public/video-generation/scripts/generate.py +++ b/skills/public/video-generation/scripts/generate.py @@ -11,7 +11,7 @@ def generate_video( output_file: str, aspect_ratio: str = "16:9", ) -> str: - with open(prompt_file, "r") as f: + with open(prompt_file, "r", encoding="utf-8") as f: prompt = f.read() referenceImages = [] i = 0