Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NikkanGeadai #689

Merged
merged 4 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>NikkanGeadai</code>
</td>
<td>
<div>Nikkan Geadai</div>
</td>
<td>
<a href="https://www.nikkan-gendai.com/">
<span>www.nikkan-gendai.com</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>SankeiShimbun</code>
Expand Down
49 changes: 49 additions & 0 deletions src/fundus/parser/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,55 @@ def get_meta_content(root: lxml.html.HtmlElement) -> Dict[str, str]:
return metadata


def transform_breaks_to_paragraphs(element: lxml.html.HtmlElement, **attribs: str) -> lxml.html.HtmlElement:
    """Split the content of ``element`` on ``<br>`` tags and wrap each part in a ``<p>`` element.

    The element is modified in place: its text and children are replaced by one
    ``<p>`` child per non-empty part. If no content can be extracted from the
    serialized element, it is returned unchanged.

    Args:
        element: The element on which to perform the transformation.
        **attribs: The attributes of the wrapping paragraphs as keyword arguments.
            Defaults to ``{"class": "br-wrap"}``, which produces ``<p class="br-wrap">``.
            To use Python keywords as attribute names, wrap them in double
            underscores, e.g. ``__class__`` for ``class``.

    Returns:
        The transformed element (the same object that was passed in).
    """
    # Imported locally so this function does not depend on the module's import block.
    from html import escape

    if not attribs:
        attribs = {"class": "br-wrap"}
    else:
        # Strip the dunder wrapping used to pass Python keywords (``__class__`` -> ``class``).
        attribs = {re.sub(r"^__(.*?)__$", r"\1", key): value for key, value in attribs.items()}

    def get_paragraphs() -> List[str]:
        raw_html = lxml.etree.tostring(element, method="html", encoding="unicode")
        # Capture everything between the element's opening tag and the final tag of the serialization.
        match = re.match(r"^<[^>]*?>\s*(?P<content>.*?)\s*<[^>]*?>\s*$", raw_html, re.S)
        if match is None:
            return []
        # Split on any <br> tag, including ones carrying attributes (e.g. <br class="x">).
        parts = re.split(r"<br\b[^>]*>", match.group("content"))
        return [stripped for part in parts if (stripped := part.strip())]

    def generate_attrs() -> str:
        # Escape values so quotes or ampersands cannot break out of the attribute.
        return " ".join(
            f'{attribute}="{escape(str(value), quote=True)}"' for attribute, value in attribs.items()
        )

    def clear_element() -> None:
        # Iterate over a snapshot so removing children cannot disturb the iteration.
        for child in list(element):
            element.remove(child)
        element.tail = None
        element.text = None

    # split content on <br> tags
    if not (paragraphs := get_paragraphs()):
        return element

    # remove children, tail and text from element
    clear_element()

    # add paragraphs to cleared element; the attribute string is identical for every paragraph
    attrs = generate_attrs()
    for paragraph in paragraphs:
        element.append(lxml.html.fromstring(f"<p {attrs}>{paragraph}</p>"))

    return element


def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement], join_on: str = "\n\n") -> Optional[str]:
if not text_nodes:
return None
Expand Down
14 changes: 14 additions & 0 deletions src/fundus/publishers/jp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from .asahi_shimbun import AsahiShimbunParser
from .mainichi_shimbun import MainichiShimbunParser
from .nikkan_geadai import NikkanGeadaiParser
from .nikkei import NikkeiParser
from .sankei_shimbun import SankeiShimbunParser
from .the_japan_news import TheJapanNewsParser
Expand Down Expand Up @@ -86,3 +87,16 @@ class JP(metaclass=PublisherGroup):
NewsMap("https://www.sankei.com/feeds/sitemapindex-category/?outputType=xml"),
],
)

# Publisher entry for www.nikkan-gendai.com, parsed by NikkanGeadaiParser and crawled via one sitemap.
# NOTE(review): "Geadai" (identifier and display name) differs from the domain spelling "gendai" —
# confirm which is intended; a rename would also have to touch the parser class, test resources and docs.
NikkanGeadai = Publisher(
name="Nikkan Geadai",
domain="https://www.nikkan-gendai.com/",
parser=NikkanGeadaiParser,
sources=[
Sitemap(
"https://www.nikkan-gendai.com/sitemap.xml",
# presumably controls the order in which sitemap entries are visited — verify against Sitemap
reverse=True,
# filter built by inverting a regex match on "type=articles"; which side is kept vs. skipped
# depends on the semantics of inverse()/regex_filter() — TODO confirm
sitemap_filter=inverse(regex_filter(r"type=articles")),
)
],
)
77 changes: 77 additions & 0 deletions src/fundus/publishers/jp/nikkan_geadai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import (
ArticleBody,
BaseParser,
Image,
ParserProxy,
attribute,
function,
)
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
transform_breaks_to_paragraphs,
)


class NikkanGeadaiParser(ParserProxy):
    """Parser proxy for articles from www.nikkan-gendai.com."""

    class V1(BaseParser):
        """First parser version; the article body is built from <br>-separated full text."""

        # Matches the <p class='br-wrap'> children that `_transform_br_element` creates
        # inside the full-text node (only those that contain text).
        _paragraph_selector = XPath(
            "//div[@class='article-wrap'] //p[@class='full-text'] /p[@class='br-wrap' and text()]"
        )

        # The node whose <br>-separated content is rewritten into paragraphs.
        _full_text_selector = CSSSelector("div.article-wrap p.full-text")

        _topic_selector = XPath("//main //div[contains(@class, 'm-keyword-list')] /ul /li //text()")

        @function(priority=0)
        def _transform_br_element(self) -> None:
            """Rewrite the full-text node so its <br>-separated parts become ``<p class='br-wrap'>`` children.

            Does nothing when no full-text node is found.

            Raises:
                ValueError: If more than one full-text node is found.
            """
            nodes = self._full_text_selector(self.precomputed.doc)
            if not nodes:
                return
            if len(nodes) != 1:
                raise ValueError(f"Expected exactly one node, got {len(nodes)}")
            transform_breaks_to_paragraphs(nodes[0], __class__="br-wrap")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body from the paragraphs matched by `_paragraph_selector`."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` value found in the precomputed linked data."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` value in the linked data."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the ``datePublished`` value in the linked data."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the keyword list, or an empty list if none is found."""
            if topics := self._topic_selector(self.precomputed.doc):
                return generic_topic_parsing(topics)
            return []

        @attribute
        def images(self) -> List[Image]:
            """Extract the images located within ``div.article-wrap``."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=CSSSelector("div.article-wrap"),
                # https://regex101.com/r/uY6o2z/1
                # NOTE(review): as written, ``(C)`` is a regex group matching a bare "C", not the
                # literal text "(C)" or a copyright sign — confirm against the linked sample.
                author_selector=re.compile(r"(C)(?P<credits>.*?)\s*$"),
            )
82 changes: 82 additions & 0 deletions tests/resources/parser/test_data/jp/NikkanGeadai.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
{
"V1": {
"authors": [
"日刊現代"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"元タレントの中居正広氏(52)の女性トラブルを発端にしたフジテレビ問題が、スポーツ界に飛び火している。",
"今回の一件でフジテレビのガバナンスを問題視するスポンサー企業が続出。",
"75社以上がCM差し止めに踏み切る中、フジテレビと放映権契約を結ぶなど、密接な関係を築いてきたスポーツの各競技団体もテンヤワンヤになっているのだ。",
"サッカーのJリーグは、フジと放映権契約を結ぶ「ルヴァン杯」の開幕が3月20日に控える。フジはCMやリーグの関連番組にも関わっており、1月28日のJリーグ理事会で今後の対応について協議した。",
"日本バレーボール協会(JVA)もフジとの関係は深い。昨年発足したSVリーグの今季開幕戦を地上波で生中継。今季の今後のリーグ戦の中継も、フジのCS局が行う予定。春高バレーは先日終わったものの、JVAは「本件に関して、現在、対応を検討中です。これ以上、お答えできることはございません」と回答したが、世界バレーやネーションズリーグはTBSと放映権契約を結んでおり、「鞍替え」が検討されても不思議ではない。",
"ゴルフ界も対岸の火事ではない。",
"国内ツアー「フジサンケイレディス」(4月)、「フジサンケイクラシック」(9月)はフジが主催。今季3戦目の「アクサレディス」(3月)はフジが後援を務める。",
"日本女子プロゴルフ協会は現在、対応を検討中だが、現場ではフジサンケイレディスの開催を危ぶむ声が少なくないという。",
"フィギュアは、かねて世界フィギュア選手権(3月)などをフジテレビが中継している。",
"日本スケート連盟は「世界フィギュア選手権の(放映権)契約は国際スケート連盟とフジテレビとなっておりますので、本連盟はお答えする立場にはございません」と回答した。",
"フジテレビからの放映権料は、各競技団体の収入源になっている。しかし、番組スポンサーが撤退すれば、放送すらおぼつかなくなっても不思議ではない。"
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706_262_262.jpg",
"query_width": null,
"size": {
"width": 262,
"height": 262
},
"type": "image/jpeg"
},
{
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706_262_262.webp",
"query_width": null,
"size": {
"width": 262,
"height": 262
},
"type": "image/webp"
},
{
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706.jpg",
"query_width": null,
"size": {
"width": 600,
"height": 600
},
"type": "image/jpeg"
},
{
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706_600_resize.webp",
"query_width": null,
"size": {
"width": 600,
"height": 600
},
"type": "image/webp"
}
],
"is_cover": true,
"description": null,
"caption": "バレー男子ネーションズリーグ、日本代表の(左から)西田、小野寺、高橋藍、石川",
"authors": [
"共同通信社"
],
"position": 228
}
],
"publishing_date": "2025-01-31 11:30:00+09:00",
"title": "フジテレビ問題でスポーツ界にも大激震!協賛企業&quot;総スカン&quot;で各競技団体のビジネスモデル完全崩壊へ",
"topics": [
"フジテレビ"
]
}
}
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/resources/parser/test_data/jp/meta.info
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
"url": "https://mainichi.jp/articles/20250114/k00/00m/030/335000c",
"crawl_date": "2025-01-14 14:55:19.277555"
},
"NikkanGeadai_2025_01_31.html.gz": {
"url": "https://www.nikkan-gendai.com/articles/view/sports/367061",
"crawl_date": "2025-01-31 13:40:41.093585"
},
"Nikkei_2025_01_27.html.gz": {
"url": "https://www.nikkei.com/article/DGXZQOUB148MY0U5A110C2000000/",
"crawl_date": "2025-01-27 16:41:04.576095"
Expand Down