diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
index 2f9daf60..d8cf5972 100644
--- a/docs/supported_publishers.md
+++ b/docs/supported_publishers.md
@@ -1284,6 +1284,21 @@
MainichiShimbun
+ TheJapanNews
diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 6895902b..e67d422e 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -391,11 +391,20 @@ def generic_text_extraction_with_css(doc, selector: XPath) -> Optional[str]:
return strip_nodes_to_text(nodes)
-def generic_topic_parsing(keywords: Optional[Union[str, List[str]]], delimiter: str = ",") -> List[str]:
+def generic_topic_parsing(
+ keywords: Optional[Union[str, List[str]]], delimiter: Union[str, List[str]] = ","
+) -> List[str]:
+ if isinstance(delimiter, str):
+ delimiter = [delimiter]
+
if not keywords:
topics = []
elif isinstance(keywords, str):
- topics = [cleaned for keyword in keywords.split(delimiter) if (cleaned := keyword.strip())]
+ topics = [
+ cleaned
+ for keyword in re.split(pattern=f"[{re.escape(''.join(delimiter))}]", string=keywords)
+ if (cleaned := keyword.strip())
+ ]
elif isinstance(keywords, list) and all(isinstance(s, str) for s in keywords):
topics = keywords
else:
diff --git a/src/fundus/publishers/jp/__init__.py b/src/fundus/publishers/jp/__init__.py
index 13a28c01..2b37673c 100644
--- a/src/fundus/publishers/jp/__init__.py
+++ b/src/fundus/publishers/jp/__init__.py
@@ -1,10 +1,11 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser
+from fundus.publishers.jp.mainichi_shimbun import MainichiShimbunParser
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
from fundus.publishers.jp.tokyo_chunichi_shimbun import TokyoChunichiShimbunParser
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
from fundus.scraping.filter import regex_filter
-from fundus.scraping.url import NewsMap, Sitemap
+from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
class JP(metaclass=PublisherGroup):
@@ -51,3 +52,12 @@ class JP(metaclass=PublisherGroup):
parser=TokyoChunichiShimbunParser,
sources=[NewsMap("https://www.chunichi.co.jp/sitemap.xml")],
)
+
+ MainichiShimbun = Publisher(
+ name="Mainichi Shimbun",
+ domain="https://mainichi.jp/",
+ parser=MainichiShimbunParser,
+ sources=[
+ RSSFeed("https://mainichi.jp/rss/etc/mainichi-flash.rss"),
+ ],
+ )
diff --git a/src/fundus/publishers/jp/mainichi_shimbun.py b/src/fundus/publishers/jp/mainichi_shimbun.py
new file mode 100644
index 00000000..7521ddd5
--- /dev/null
+++ b/src/fundus/publishers/jp/mainichi_shimbun.py
@@ -0,0 +1,66 @@
+import datetime
+import re
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+ apply_substitution_pattern_over_list,
+ extract_article_body_with_selector,
+ generic_author_parsing,
+ generic_date_parsing,
+ generic_topic_parsing,
+ image_extraction,
+ normalize_whitespace,
+)
+
+
+class MainichiShimbunParser(ParserProxy):
+ class V1(BaseParser):
+ _paragraph_selector = CSSSelector("#articledetail-body > p")
+ _subheadline_selector = CSSSelector("#articledetail-body > h2")
+
+ _topic_bloat_pattern = re.compile("速報")
+
+ @attribute
+ def body(self) -> Optional[ArticleBody]:
+ return extract_article_body_with_selector(
+ self.precomputed.doc,
+ paragraph_selector=self._paragraph_selector,
+ subheadline_selector=self._subheadline_selector,
+ )
+
+ @attribute
+ def title(self) -> Optional[str]:
+ if (title := self.precomputed.meta.get("title")) is not None:
+ return normalize_whitespace(title)
+ return None
+
+ @attribute
+ def publishing_date(self) -> Optional[datetime.datetime]:
+ return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+ @attribute
+ def authors(self) -> List[str]:
+ return generic_author_parsing(self.precomputed.meta.get("cXenseParse:author"))
+
+ @attribute
+ def topics(self) -> List[str]:
+ return apply_substitution_pattern_over_list(
+ generic_topic_parsing(self.precomputed.meta.get("keywords"), delimiter=[",", "・"]),
+ self._topic_bloat_pattern,
+ )
+
+ @attribute
+ def images(self) -> List[Image]:
+ return image_extraction(
+ doc=self.precomputed.doc,
+ paragraph_selector=self._paragraph_selector,
+ image_selector=XPath("//figure//img[not(ancestor::a[contains(@class,'articledetail-image-scale')])]"),
+ upper_boundary_selector=CSSSelector("#main"),
+ # https://regex101.com/r/awU0Rq/1
+ author_selector=re.compile(r"(、|=(?=.*?撮影$))(?P