mirror of
https://gitee.com/wanwujie/deer-flow
synced 2026-04-18 12:04:45 +08:00
48 lines
1.2 KiB
Python
48 lines
1.2 KiB
Python
|
"""File conversion utilities.

Converts document files (PDF, PPT, Excel, Word) to Markdown using markitdown.
No FastAPI or HTTP dependencies — pure utility functions.
"""
|
||
|
|
|
||
|
|
import asyncio
import logging
from pathlib import Path
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)

# File extensions that should be converted to markdown.
# Entries are lowercase and include the leading dot, i.e. the same form
# that ``Path.suffix`` produces.
CONVERTIBLE_EXTENSIONS = {
    ".pdf",
    ".ppt",
    ".pptx",
    ".xls",
    ".xlsx",
    ".doc",
    ".docx",
}
|
||
|
|
|
||
|
|
|
||
|
|
def _convert_to_markdown_sync(file_path: Path) -> Path:
    """Blocking worker: convert *file_path* and write the sibling ``.md`` file.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path of the markdown file that was written.
    """
    # Imported lazily so that importing this module does not itself require
    # markitdown; a missing package surfaces here as an ImportError, which
    # the async entry point handles like any other conversion failure.
    from markitdown import MarkItDown

    result = MarkItDown().convert(str(file_path))

    # Save as .md file with same name
    md_path = file_path.with_suffix(".md")
    md_path.write_text(result.text_content, encoding="utf-8")
    return md_path


async def convert_file_to_markdown(file_path: Path) -> Path | None:
    """Convert a file to markdown using markitdown.

    The conversion and the file write are blocking calls, so they are run in
    a worker thread via ``asyncio.to_thread`` instead of on the event loop.

    Args:
        file_path: Path to the file to convert.

    Returns:
        Path to the markdown file if conversion was successful, None otherwise.
        Failures (including markitdown not being installed) are logged and
        never raised to the caller.
    """
    try:
        md_path = await asyncio.to_thread(_convert_to_markdown_sync, file_path)
    except Exception as e:
        # Best-effort contract: report the failure with its traceback and
        # signal it to the caller through the None return value.
        logger.exception("Failed to convert %s to markdown: %s", file_path.name, e)
        return None

    logger.info("Converted %s to markdown: %s", file_path.name, md_path.name)
    return md_path
|