Files
deer-flow/src/crawler/readability_extractor.py
2025-04-17 11:34:42 +08:00

16 lines
454 B
Python

# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
# SPDX-License-Identifier: MIT
from readabilipy import simple_json_from_html_string
from .article import Article
class ReadabilityExtractor:
    """Turn raw HTML into an ``Article`` by delegating parsing to readabilipy."""

    def extract_article(self, html: str) -> Article:
        """Parse *html* and wrap the extracted title and content in an ``Article``.

        Args:
            html: HTML markup handed to ``simple_json_from_html_string``.

        Returns:
            An ``Article`` built from the ``"title"`` and ``"content"``
            entries of the parse result.
        """
        parsed = simple_json_from_html_string(html, use_readability=True)

        # ``.get`` returns None for an absent key, so either value may be
        # None here; it is forwarded to ``Article`` unchanged.
        # NOTE(review): confirm ``Article`` tolerates None for both fields.
        title = parsed.get("title")
        body_html = parsed.get("content")

        return Article(title=title, html_content=body_html)