diff --git a/backend/src/utils/readability.py b/backend/src/utils/readability.py
index 8915098..e905f71 100644
--- a/backend/src/utils/readability.py
+++ b/backend/src/utils/readability.py
@@ -1,9 +1,13 @@
+import logging
 import re
+import subprocess
 from urllib.parse import urljoin
 
 from markdownify import markdownify as md
 from readabilipy import simple_json_from_html_string
 
+logger = logging.getLogger(__name__)
+
 
 class Article:
     url: str
@@ -53,7 +57,23 @@ class Article:
 
 class ReadabilityExtractor:
     def extract_article(self, html: str) -> Article:
-        article = simple_json_from_html_string(html, use_readability=True)
+        try:
+            article = simple_json_from_html_string(html, use_readability=True)
+        except (subprocess.CalledProcessError, FileNotFoundError) as exc:
+            # Only these subprocess-level failures trigger the fallback; any
+            # other exception propagates to the caller unchanged.
+            stderr = getattr(exc, "stderr", None)
+            if isinstance(stderr, bytes):
+                # Normalize to text so it can be appended to the log message.
+                stderr = stderr.decode(errors="replace")
+            stderr_info = f"; stderr={stderr.strip()}" if isinstance(stderr, str) and stderr.strip() else ""
+            logger.warning(
+                "Readability.js extraction failed with %s%s; falling back to pure-Python extraction",
+                type(exc).__name__,
+                stderr_info,
+                exc_info=True,
+            )
+            article = simple_json_from_html_string(html, use_readability=False)
 
         html_content = article.get("content")
         if not html_content or not str(html_content).strip():
diff --git a/backend/tests/test_readability.py b/backend/tests/test_readability.py
new file mode 100644
index 0000000..9545ee2
--- /dev/null
+++ b/backend/tests/test_readability.py
@@ -0,0 +1,56 @@
+"""Tests for readability extraction fallback behavior."""
+
+import subprocess
+
+import pytest
+
+from src.utils.readability import ReadabilityExtractor
+
+
+def test_extract_article_falls_back_when_readability_js_fails(monkeypatch):
+    """When Node-based readability fails, extraction should fall back to Python mode."""
+
+    calls: list[bool] = []
+
+    def _fake_simple_json_from_html_string(html: str, use_readability: bool = False):
+        calls.append(use_readability)
+        if use_readability:
+            raise subprocess.CalledProcessError(
+                returncode=1,
+                cmd=["node", "ExtractArticle.js"],
+                stderr="boom",
+            )
+        return {"title": "Fallback Title", "content": "\n\nFallback Content\n\n"}
+
+    monkeypatch.setattr(
+        "src.utils.readability.simple_json_from_html_string",
+        _fake_simple_json_from_html_string,
+    )
+
+    article = ReadabilityExtractor().extract_article("test")
+
+    assert calls == [True, False]
+    assert article.title == "Fallback Title"
+    assert article.html_content == "\n\nFallback Content\n\n"
+
+
+def test_extract_article_re_raises_unexpected_exception(monkeypatch):
+    """Unexpected errors should be surfaced instead of silently falling back."""
+
+    calls: list[bool] = []
+
+    def _fake_simple_json_from_html_string(html: str, use_readability: bool = False):
+        calls.append(use_readability)
+        if use_readability:
+            raise RuntimeError("unexpected parser failure")
+        return {"title": "Should Not Reach Fallback", "content": "\n\nFallback\n\n"}
+
+    monkeypatch.setattr(
+        "src.utils.readability.simple_json_from_html_string",
+        _fake_simple_json_from_html_string,
+    )
+
+    with pytest.raises(RuntimeError, match="unexpected parser failure"):
+        ReadabilityExtractor().extract_article("test")
+    # Outside the ``with`` body: code after the raising call inside it never runs.
+    assert calls == [True]