fix(backend): Fix readability extraction crash when Node parser fails (#937)

* Fix readability fallback when Node extraction fails

* Narrow readability fallback errors and enrich logs

---------

Co-authored-by: Willem Jiang <willem.jiang@gmail.com>
This commit is contained in:
エイカク
2026-03-01 23:24:02 +09:00
committed by GitHub
parent d728bb26d5
commit 80316c131e
2 changed files with 73 additions and 1 deletions

View File

@@ -1,9 +1,13 @@
import logging
import re import re
import subprocess
from urllib.parse import urljoin from urllib.parse import urljoin
from markdownify import markdownify as md from markdownify import markdownify as md
from readabilipy import simple_json_from_html_string from readabilipy import simple_json_from_html_string
logger = logging.getLogger(__name__)
class Article: class Article:
url: str url: str
@@ -53,7 +57,20 @@ class Article:
class ReadabilityExtractor: class ReadabilityExtractor:
def extract_article(self, html: str) -> Article: def extract_article(self, html: str) -> Article:
article = simple_json_from_html_string(html, use_readability=True) try:
article = simple_json_from_html_string(html, use_readability=True)
except (subprocess.CalledProcessError, FileNotFoundError) as exc:
stderr = getattr(exc, "stderr", None)
if isinstance(stderr, bytes):
stderr = stderr.decode(errors="replace")
stderr_info = f"; stderr={stderr.strip()}" if isinstance(stderr, str) and stderr.strip() else ""
logger.warning(
"Readability.js extraction failed with %s%s; falling back to pure-Python extraction",
type(exc).__name__,
stderr_info,
exc_info=True,
)
article = simple_json_from_html_string(html, use_readability=False)
html_content = article.get("content") html_content = article.get("content")
if not html_content or not str(html_content).strip(): if not html_content or not str(html_content).strip():

View File

@@ -0,0 +1,55 @@
"""Tests for readability extraction fallback behavior."""
import subprocess
import pytest
from src.utils.readability import ReadabilityExtractor
def test_extract_article_falls_back_when_readability_js_fails(monkeypatch):
    """A failed Node readability run must be retried in pure-Python mode."""
    # Records the ``use_readability`` flag of every extractor invocation, in order.
    seen_modes: list[bool] = []

    def _stub_extractor(html: str, use_readability: bool = False):
        seen_modes.append(use_readability)
        if not use_readability:
            # Pure-Python mode succeeds with a recognisable payload.
            return {"title": "Fallback Title", "content": "<p>Fallback Content</p>"}
        # Readability mode fails the way a crashed Node subprocess would.
        raise subprocess.CalledProcessError(
            returncode=1,
            cmd=["node", "ExtractArticle.js"],
            stderr="boom",
        )

    # Patch the name as looked up inside the module under test.
    monkeypatch.setattr(
        "src.utils.readability.simple_json_from_html_string",
        _stub_extractor,
    )

    extractor = ReadabilityExtractor()
    result = extractor.extract_article("<html><body>test</body></html>")

    # First attempt uses readability, second attempt is the fallback.
    assert seen_modes == [True, False]
    assert result.title == "Fallback Title"
    assert result.html_content == "<p>Fallback Content</p>"
def test_extract_article_re_raises_unexpected_exception(monkeypatch):
    """Errors outside the handled set must propagate; no fallback attempt is made."""
    # Records the ``use_readability`` flag of every extractor invocation, in order.
    seen_modes: list[bool] = []

    def _stub_extractor(html: str, use_readability: bool = False):
        seen_modes.append(use_readability)
        if not use_readability:
            # Would only be returned if a fallback call happened.
            return {"title": "Should Not Reach Fallback", "content": "<p>Fallback</p>"}
        raise RuntimeError("unexpected parser failure")

    # Patch the name as looked up inside the module under test.
    monkeypatch.setattr(
        "src.utils.readability.simple_json_from_html_string",
        _stub_extractor,
    )

    extractor = ReadabilityExtractor()
    with pytest.raises(RuntimeError, match="unexpected parser failure"):
        extractor.extract_article("<html><body>test</body></html>")

    # Exactly one call: the readability attempt, with no retry afterwards.
    assert seen_modes == [True]