From 840da9e2d661843084241e3cc561ad6dba4a3d97 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 2 Jan 2024 19:47:48 +0800 Subject: [PATCH] Remove pos length check in `test_long` Some pages that use the template `zh-see` to redirect simplified Chinese characters don't have a POS title. Not sure why PR 174 of wikitextprocessor affects this test. Another change is adding the redirect data to `pos_data` when `pos_data` is not empty. --- src/wiktextract/extractor/en/page.py | 9 ++++++--- tests/test_long.py | 2 +- tests/test_page.py | 30 ++++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index a57cc191..0e1c9035 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -3011,9 +3011,12 @@ def skip_template_fn(name, ht): stack.pop() if len(redirect_list) > 0: - new_page_data = base_data.copy() - new_page_data["redirects"] = redirect_list - page_datas.append(new_page_data) + if len(pos_data) > 0: + pos_data["redirects"] = redirect_list + else: + new_page_data = base_data.copy() + new_page_data["redirects"] = redirect_list + page_datas.append(new_page_data) def extract_examples(others, sense_base): """Parses through a list of definitions and quotes to find examples. diff --git a/tests/test_long.py b/tests/test_long.py index 4942b3cf..8440d24f 100644 --- a/tests/test_long.py +++ b/tests/test_long.py @@ -76,8 +76,8 @@ def test_long(self): words[word] += 1 lang = data.get("lang", "") self.assertGreater(len(lang), 0) + # redirect Chinese character maybe not have pos pos = data.get("pos", "") - self.assertGreater(len(pos), 0) langs[lang] += 1 poses[pos] += 1 if data.get("translations"): diff --git a/tests/test_page.py b/tests/test_page.py index 029ee297..d733f3ea 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -2,12 +2,10 @@ # # Copyright (c) 2021 Tatu Ylonen. 
See file LICENSE and https://ylonen.org -import json import unittest from unittest.mock import patch from wikitextprocessor import Page, Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.page import parse_page from wiktextract.thesaurus import close_thesaurus_db @@ -494,6 +492,34 @@ def test_zh_see(self, mock_get_page): } ], ) + data = parse_page( + self.wxr, + "车", + """ +==Chinese== +===Glyph origin=== +From cursive script of 車. + +===Definitions=== +{{zh-see|車|s}} +{{zh-see|龺|ss}} + """, + ) + self.assertEqual( + data, + [ + { + "etymology_templates": [], + "etymology_text": "From cursive script of 車.", + "lang": "Chinese", + "lang_code": "zh", + "redirects": ["車", "龺"], + "word": "车", + "pos": "character", + "senses": [{"tags": ["no-gloss"]}], + } + ], + ) @patch( "wikitextprocessor.Wtp.get_page",