-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #685 from flairNLP/add-mainichi-shimbun
Add `MainichiShimbun`
- Loading branch information
Showing 7 changed files with 168 additions and 3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import datetime | ||
import re | ||
from typing import List, Optional | ||
|
||
from lxml.cssselect import CSSSelector | ||
from lxml.etree import XPath | ||
|
||
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute | ||
from fundus.parser.utility import ( | ||
apply_substitution_pattern_over_list, | ||
extract_article_body_with_selector, | ||
generic_author_parsing, | ||
generic_date_parsing, | ||
generic_topic_parsing, | ||
image_extraction, | ||
normalize_whitespace, | ||
) | ||
|
||
|
||
class MainichiShimbunParser(ParserProxy):
    """Parser for articles published by the Japanese newspaper Mainichi Shimbun."""

    class V1(BaseParser):
        """First parser version; every ``@attribute`` method extracts one article field.

        All selectors and patterns are compiled once at class level so they are
        shared between parser invocations instead of being rebuilt per article.
        """

        # Body text: paragraphs and sub-headlines directly below the article container.
        _paragraph_selector = CSSSelector("#articledetail-body > p")
        _subheadline_selector = CSSSelector("#articledetail-body > h2")

        # "速報" ("breaking news") is handed to the substitution helper in `topics`.
        _topic_bloat_pattern = re.compile("速報")

        # <img> elements inside a <figure>, skipping those wrapped in an <a> whose
        # class contains 'articledetail-image-scale'.
        _image_selector = XPath("//figure//img[not(ancestor::a[contains(@class,'articledetail-image-scale')])]")
        _image_upper_boundary_selector = CSSSelector("#main")
        # Captures the credit at the end of a caption: the text after the last "、",
        # or after "=" when the caption ends in "撮影" ("photographed by"); a trailing
        # "撮影" is excluded from the `credits` group.
        # https://regex101.com/r/awU0Rq/1
        _image_author_pattern = re.compile(r"(、|=(?=.*?撮影$))(?P<credits>[^、]*?)(撮影)?\s*$")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the paragraph and sub-headline selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the whitespace-normalized ``title`` meta value, or ``None`` if it is absent."""
            if (title := self.precomputed.meta.get("title")) is not None:
                return normalize_whitespace(title)
            return None

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the parsed ``datePublished`` value looked up in the LD data."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``cXenseParse:author`` meta value."""
            return generic_author_parsing(self.precomputed.meta.get("cXenseParse:author"))

        @attribute
        def topics(self) -> List[str]:
            """Return topics parsed from the ``keywords`` meta value.

            Keywords are split on "," and "・"; the resulting list is then run through
            ``apply_substitution_pattern_over_list`` with the topic bloat pattern.
            """
            return apply_substitution_pattern_over_list(
                generic_topic_parsing(self.precomputed.meta.get("keywords"), delimiter=[",", "・"]),
                self._topic_bloat_pattern,
            )

        @attribute
        def images(self) -> List[Image]:
            """Return the article images extracted with the class-level image selectors."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=self._image_selector,
                upper_boundary_selector=self._image_upper_boundary_selector,
                author_selector=self._image_author_pattern,
                relative_urls=True,
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
{ | ||
"V1": { | ||
"authors": [ | ||
"松岡大地" | ||
], | ||
"body": { | ||
"summary": [], | ||
"sections": [ | ||
{ | ||
"headline": [], | ||
"paragraphs": [ | ||
"パレスチナ自治区ガザ地区のイスラム組織ハマスとイスラエルの停戦交渉を仲介しているカタールの外務省報道官は14日、「これまでで最も合意に近づいている」と語った。対立する双方ともに停戦合意の最終草案に同意しているとみられる。早期停戦を求めるトランプ次期米大統領の就任が20日に迫る中、停戦実現の期待が高まっている。", | ||
"米ニュースサイト「アクシオス」によると、イスラエルと仲介国は最終草案に合意。ハマスも14日の声明で、指導部が交渉内容に満足していると明らかにし、カタールでの今回の交渉で「明確で包括的な合意」がまとまることへの期待を示した。", | ||
"アクシオスが報じた合意案によると、「第1段階」で42日間の停戦を実施。イスラエル軍はガザとエジプトの境界などから徐々に撤退し、ハマスは女性や子どもなど33人の人質を解放する。第1段階の停戦中に、恒久的停戦やイスラエル軍の完全撤退を含む「第2段階」に向けた協議を始めるという。", | ||
"停戦に向けた交渉が大詰めを迎える中、バイデン米大統領は13日、カタールのタミム首長と停戦合意に向けて電話協議を実施。ハマスもタミム氏やトルコの諜報(ちょうほう)機関トップと協議した。", | ||
"こうした動きにネタニヤフ連立政権の一角を占める極右政党は反発。対パレスチナ強硬派のスモトリッチ財務相は、停戦合意を「大惨事」だとし、合意に賛成しない方針を示した。ネタニヤフ首相はベングビール国家治安相と会談し、政権維持へ協力を求めた。", | ||
"イスラエル軍は13日もガザ地区への攻撃を続け、中東メディアによると、45人が死亡した。ガザ保健当局によると、2023年10月の戦闘開始以降のガザ側の死者は、4万6584人になった。", | ||
"一方で、イスラエル軍は13日、ガザ北部の戦闘でイスラエル兵5人が死亡したと発表した。ガザでは約100人の人質が拘束されている。【エルサレム松岡大地、カイロ金子淳】" | ||
] | ||
} | ||
] | ||
}, | ||
"images": [ | ||
{ | ||
"versions": [ | ||
{ | ||
"url": "https://cdn.mainichi.jp/vol1/2023/06/06/20230606k0000m030221000p/9.webp?1", | ||
"query_width": null, | ||
"size": null, | ||
"type": "image/webp" | ||
}, | ||
{ | ||
"url": "https://cdn.mainichi.jp/vol1/2023/06/06/20230606k0000m030221000p/9.jpg?1", | ||
"query_width": null, | ||
"size": { | ||
"width": 800, | ||
"height": 528 | ||
}, | ||
"type": "image/jpeg" | ||
} | ||
], | ||
"is_cover": true, | ||
"description": "イスラエルの国旗=同国で2019年5月", | ||
"caption": "イスラエルの国旗=同国で2019年5月", | ||
"authors": [], | ||
"position": 891 | ||
} | ||
], | ||
"publishing_date": "2025-01-14 21:17:27+09:00", | ||
"title": "イスラエルとハマスの停戦「最も合意に近い」 最終案に双方同意か", | ||
"topics": [ | ||
"国際", | ||
"中東", | ||
"緊迫する中東情勢", | ||
"松岡大地", | ||
"イスラエル", | ||
"カタール", | ||
"パレスチナ" | ||
] | ||
} | ||
} |
Binary file added
BIN
+24.1 KB
tests/resources/parser/test_data/jp/MainichiShimbun_2025_01_14.html.gz
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters