-
-
Notifications
You must be signed in to change notification settings - Fork 719
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3629 from lonvia/additional-breaks
Introduce new break types and phrase splitting for Japanese addresses
- Loading branch information
Showing
9 changed files
with
155 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
61 changes: 61 additions & 0 deletions
61
src/nominatim_api/query_preprocessing/split_japanese_phrases.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# SPDX-License-Identifier: GPL-3.0-or-later | ||
# | ||
# This file is part of Nominatim. (https://nominatim.org) | ||
# | ||
# Copyright (C) 2025 by the Nominatim developer community. | ||
# For a full list of authors see the git log. | ||
"""
Query preprocessor that splits Japanese addresses into up to three parts:
prefecture, municipality, and the remainder.
The split is a simple keyword-based heuristic, not a strict address parse.
"""
from typing import List | ||
import re | ||
|
||
from .config import QueryConfig | ||
from .base import QueryProcessingFunc | ||
from ..search.query import Phrase | ||
|
||
# Regular expressions (to be used with re.VERBOSE) describing how a Japanese
# address is split. Every capture group becomes one part of the split phrase.
# The bracketed labels in the pattern comments name the address category a
# group captures (1 = prefecture, 2 = municipality, 3 = remainder); they are
# not the group's index within that particular pattern.
# NOTE(review): the prefecture character class lists 都 twice, exactly as in
# the original. Presumably one of them was a variant code point - confirm.
MATCH_PATTERNS = [
    # prefecture + municipality + remainder
    r'''
        (...??[都都道府県縣])  # [group1] prefecture
        (.+?[市区區町村])      # [group2] municipalities (city/wards/towns/villages)
        (.+)                   # [group3] other words
    ''',
    # prefecture + remainder
    r'''
        (...??[都都道府県縣])  # [group1] prefecture
        (.+)                   # [group3] other words
    ''',
    # municipality + remainder
    r'''
        (.+?[市区區町村])      # [group2] municipalities (city/wards/towns/villages)
        (.+)                   # [group3] other words
    '''
]
|
||
|
||
class _JapanesePreprocessing:
    """ Query preprocessor that splits Japanese address phrases.

        Each phrase is tested against the expressions in MATCH_PATTERNS.
        On the first hit the phrase is replaced by one whose text has the
        captured parts joined with ':'.
    """

    def __init__(self, config: QueryConfig) -> None:
        self.config = config
        # Compile the patterns once instead of on every phrase.
        self._patterns = [re.compile(pattern, re.VERBOSE)
                          for pattern in MATCH_PATTERNS]

    def split_phrase(self, phrase: Phrase) -> Phrase:
        """ Split the text of a single phrase with the first matching pattern.

            Returns a new phrase of the same type whose text has the matched
            parts joined with ':'. When no pattern matches, the phrase is
            returned unchanged.
        """
        for pattern in self._patterns:
            # fullmatch() instead of match(): '.' does not match a newline,
            # so match() would silently drop everything after the first
            # line break from the rewritten phrase. Only rewrite when the
            # pattern accounts for the complete text.
            result = pattern.fullmatch(phrase.text)
            if result is not None:
                return Phrase(phrase.ptype, ':'.join(result.groups()))

        return phrase

    def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
        """ Apply the Japanese address split to every phrase of the query
            and return the resulting list of phrases.
        """
        return [self.split_phrase(p) for p in phrases]
|
||
|
||
def create(config: QueryConfig) -> QueryProcessingFunc:
    """ Create the preprocessing function that splits Japanese address
        phrases.

        The configuration is handed to the preprocessor unchanged.
    """
    return _JapanesePreprocessing(config)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
34 changes: 34 additions & 0 deletions
34
test/python/api/query_processing/test_split_japanese_phrases.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# SPDX-License-Identifier: GPL-3.0-or-later | ||
# | ||
# This file is part of Nominatim. (https://nominatim.org) | ||
# | ||
# Copyright (C) 2025 by the Nominatim developer community. | ||
# For a full list of authors see the git log. | ||
"""
Tests for Japanese phrase splitting.
"""
from pathlib import Path | ||
|
||
import pytest | ||
|
||
from icu import Transliterator | ||
|
||
import nominatim_api.search.query as qmod | ||
from nominatim_api.query_preprocessing.config import QueryConfig | ||
from nominatim_api.query_preprocessing import split_japanese_phrases | ||
|
||
def run_preprocessor_on(query):
    """ Run a freshly created Japanese phrase splitter over the given
        list of phrases and return its output.
    """
    proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))

    return proc(query)
|
||
|
||
@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
                                      ('大阪府大阪', '大阪府:大阪'),
                                      ('大阪市大阪', '大阪市:大阪'),
                                      # prefecture written with 都, ward with 区
                                      ('東京都新宿区西新宿', '東京都:新宿区:西新宿'),
                                      # no keyword present: phrase passes through
                                      ('Tokyo Tower', 'Tokyo Tower')])
def test_split_phrases(inp, outp):
    """ The splitter joins the recognised address parts with ':' and
        leaves phrases without any keyword untouched.
    """
    query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]

    out = run_preprocessor_on(query)

    assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]