Merge pull request #3629 from lonvia/additional-breaks
Introduce new break types and phrase splitting for Japanese addresses
lonvia authored Jan 9, 2025
2 parents 14ecfc7 + efc09a5 commit f8337be
Showing 9 changed files with 155 additions and 22 deletions.
8 changes: 5 additions & 3 deletions settings/icu_tokenizer.yaml
@@ -1,4 +1,5 @@
query-preprocessing:
- step: split_japanese_phrases
- step: normalize
normalization:
- ":: lower ()"
@@ -9,16 +10,17 @@ normalization:
- "'nº' > 'no'"
- "ª > a"
- "º > o"
- "[[:Punctuation:][:Symbol:]\u02bc] > ' '"
- "[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-'"
- "ß > 'ss'" # German szet is unambiguously equal to double ss
- "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:]] >"
- "[^[:alnum:] [:Canonical_Combining_Class=Virama:] [:Space:] [-:]] >"
- "[:Lm:] >"
- ":: [[:Number:]] Latin ()"
- ":: [[:Number:]] Ascii ();"
- ":: [[:Number:]] NFD ();"
- "[[:Nonspacing Mark:] [:Cf:]] >;"
- "[:Space:]+ > ' '"
- "[-:]?[:Space:]+[-:]? > ' '"
transliteration:
- "[-:] > ' '"
- ":: Latin ()"
- !include icu-rules/extended-unicode-to-asccii.yaml
- ":: Ascii ()"
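
For readers unfamiliar with ICU transliteration rules, here is a minimal editorial sketch (not part of the commit) that applies just the two amended rules with PyICU, which Nominatim already depends on; the sample inputs are invented for illustration.

from icu import Transliterator

# Only the two amended rules: fold runs of punctuation/symbols (and the
# modifier apostrophe) into '-', keeping '-' and ':' themselves, then
# normalize spaces adjacent to the break signs.
rules = """
[[:Punctuation:][:Symbol:][\u02bc] - [-:]]+ > '-';
[-:]?[:Space:]+[-:]? > ' ';
"""
norm = Transliterator.createFromRules("demo", rules)
print(norm.transliterate("foo!!bar-baz"))    # foo-bar-baz
print(norm.transliterate("O\u02bcConnor"))   # O-Connor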
61 changes: 61 additions & 0 deletions src/nominatim_api/query_preprocessing/split_japanese_phrases.py
@@ -0,0 +1,61 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
This module divides Japanese addresses into three categories:
prefecture, municipality, and other.
The split is not strict; it is a simple heuristic based on these keywords.
"""
from typing import List
import re

from .config import QueryConfig
from .base import QueryProcessingFunc
from ..search.query import Phrase

MATCH_PATTERNS = [
r'''
(...??[都都道府県縣]) # [group1] prefecture
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
(.+) # [group3] other words
''',
r'''
(...??[都都道府県縣]) # [group1] prefecture
(.+) # [group3] other words
''',
r'''
(.+?[市区區町村]) # [group2] municipalities (city/wards/towns/villages)
(.+) # [group3] other words
'''
]


class _JapanesePreprocessing:

def __init__(self, config: QueryConfig) -> None:
self.config = config

def split_phrase(self, phrase: Phrase) -> Phrase:
"""
Split the given phrase text at the first matching pattern and rejoin
the parts with ':' (a soft phrase break).
"""
for pattern in MATCH_PATTERNS:
result = re.match(pattern, phrase.text, re.VERBOSE)
if result is not None:
return Phrase(phrase.ptype, ':'.join(result.groups()))

return phrase

def __call__(self, phrases: List[Phrase]) -> List[Phrase]:
"""Split Japanese address phrases into prefecture, municipality and other parts.
"""
return [self.split_phrase(p) for p in phrases]


def create(config: QueryConfig) -> QueryProcessingFunc:
""" Create a function for Japanese address preprocessing.
"""
return _JapanesePreprocessing(config)
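
As a quick illustration (an editorial sketch, not part of the commit), the first pattern splits a typical address into its three groups; the sample address is invented:

import re
from nominatim_api.query_preprocessing.split_japanese_phrases import MATCH_PATTERNS

# '東京都' matches the prefecture group, '渋谷区' the municipality group,
# and the rest lands in the catch-all group; groups are rejoined with ':'.
m = re.match(MATCH_PATTERNS[0], '東京都渋谷区道玄坂1丁目', re.VERBOSE)
assert m is not None   # pattern 1 matches this prefecture/city shape
print(':'.join(m.groups()))   # -> 東京都:渋谷区:道玄坂1丁目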
1 change: 1 addition & 0 deletions src/nominatim_api/search/db_search_builder.py
@@ -433,6 +433,7 @@ def get_near_items(self, assignment: TokenAssignment) -> Optional[dbf.WeightedCa
BreakType.START: 0.0,
BreakType.END: 0.0,
BreakType.PHRASE: 0.0,
BreakType.SOFT_PHRASE: 0.0,
BreakType.WORD: 0.1,
BreakType.PART: 0.2,
BreakType.TOKEN: 0.4
4 changes: 2 additions & 2 deletions src/nominatim_api/search/geocoder.py
@@ -133,7 +133,7 @@ def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
"""
assert self.query_analyzer is not None
qwords = [word for phrase in query.source
for word in re.split('[, ]+', phrase.text) if word]
for word in re.split('[-,: ]+', phrase.text) if word]
if not qwords:
return

@@ -146,7 +146,7 @@ def rerank_by_query(self, query: QueryStruct, results: SearchResults) -> None:
distance = 0.0
norm = self.query_analyzer.normalize_text(' '.join((result.display_name,
result.country_code or '')))
words = set((w for w in norm.split(' ') if w))
words = set((w for w in re.split('[-,: ]+', norm) if w))
if not words:
continue
for qword in qwords:
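
For illustration (editorial sketch, invented sample string): the widened separator class means soft-phrase and part breaks no longer penalize a result during reranking, because both sides now split on the same characters:

import re

# '-' and ':' now count as word separators on both the query side and the
# normalized display-name side, mirroring the SOFT_PHRASE/PART break signs.
print([w for w in re.split('[-,: ]+', '東京都:渋谷区 1-2-3') if w])
# -> ['東京都', '渋谷区', '1', '2', '3']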
57 changes: 41 additions & 16 deletions src/nominatim_api/search/icu_tokenizer.py
@@ -7,10 +7,12 @@
"""
Implementation of query analysis for the ICU tokenizer.
"""
from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
from typing import Tuple, Dict, List, Optional, Iterator, Any, cast
from collections import defaultdict
import dataclasses
import difflib
import re
from itertools import zip_longest

from icu import Transliterator

@@ -34,17 +36,30 @@
'C': qmod.TokenType.COUNTRY
}

PENALTY_IN_TOKEN_BREAK = {
qmod.BreakType.START: 0.5,
qmod.BreakType.END: 0.5,
qmod.BreakType.PHRASE: 0.5,
qmod.BreakType.SOFT_PHRASE: 0.5,
qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.0,
qmod.BreakType.TOKEN: 0.0
}


class QueryPart(NamedTuple):
@dataclasses.dataclass
class QueryPart:
""" Normalized and transliterated form of a single term in the query.
When the term came out of a split during the transliteration,
the normalized string is the full word before transliteration.
The word number keeps track of the word before transliteration
and can be used to identify partial transliterated terms.
Penalty is the break penalty for the break following the token.
"""
token: str
normalized: str
word_number: int
penalty: float


QueryParts = List[QueryPart]
@@ -58,10 +73,12 @@ def yield_words(terms: List[QueryPart], start: int) -> Iterator[Tuple[str, qmod.
total = len(terms)
for first in range(start, total):
word = terms[first].token
yield word, qmod.TokenRange(first, first + 1)
penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType.WORD]
yield word, qmod.TokenRange(first, first + 1, penalty=penalty)
for last in range(first + 1, min(first + 20, total)):
word = ' '.join((word, terms[last].token))
yield word, qmod.TokenRange(first, last + 1)
penalty += terms[last - 1].penalty
yield word, qmod.TokenRange(first, last + 1, penalty=penalty)


@dataclasses.dataclass
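To make the accumulation concrete, here is an editorial sketch with a stand-in part type and invented penalties (the real QueryPart carries more fields): each multi-term span sums the break penalties inside it, starting from the flat WORD penalty for the first term.

from dataclasses import dataclass

@dataclass
class Part:
    token: str
    penalty: float   # break penalty following this term

WORD_PENALTY = 0.1   # stands in for PENALTY_IN_TOKEN_BREAK[BreakType.WORD]
parts = [Part('grand', 0.0), Part('central', 0.1), Part('station', 0.5)]

word, penalty = parts[0].token, WORD_PENALTY
spans = [(word, penalty)]
for last in range(1, len(parts)):
    word = ' '.join((word, parts[last].token))
    penalty += parts[last - 1].penalty   # break *inside* the span
    spans.append((word, penalty))
print(spans)
# [('grand', 0.1), ('grand central', 0.1), ('grand central station', 0.2)]
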
@@ -94,25 +111,25 @@ def rematch(self, norm: str) -> None:
self.penalty += (distance/len(self.lookup_word))

@staticmethod
def from_db_row(row: SaRow) -> 'ICUToken':
def from_db_row(row: SaRow, base_penalty: float = 0.0) -> 'ICUToken':
""" Create an ICUToken from a row of the word table.
"""
count = 1 if row.info is None else row.info.get('count', 1)
addr_count = 1 if row.info is None else row.info.get('addr_count', 1)

penalty = 0.0
penalty = base_penalty
if row.type == 'w':
penalty = 0.3
penalty += 0.3
elif row.type == 'W':
if len(row.word_token) == 1 and row.word_token == row.word:
penalty = 0.2 if row.word.isdigit() else 0.3
penalty += 0.2 if row.word.isdigit() else 0.3
elif row.type == 'H':
penalty = sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
penalty += sum(0.1 for c in row.word_token if c != ' ' and not c.isdigit())
if all(not c.isdigit() for c in row.word_token):
penalty += 0.2 * (len(row.word_token) - 1)
elif row.type == 'C':
if len(row.word_token) == 1:
penalty = 0.3
penalty += 0.3

if row.info is None:
lookup_word = row.word
@@ -202,7 +219,7 @@ async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:

for row in await self.lookup_in_db(list(words.keys())):
for trange in words[row.word_token]:
token = ICUToken.from_db_row(row)
token = ICUToken.from_db_row(row, trange.penalty or 0.0)
if row.type == 'S':
if row.info['op'] in ('in', 'near'):
if trange.start == 0:
@@ -242,16 +259,24 @@ def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
wordnr = 0
for phrase in query.source:
query.nodes[-1].ptype = phrase.ptype
for word in phrase.text.split(' '):
phrase_split = re.split('([ :-])', phrase.text)
# The zip construct will give us the pairs of word/break from
# the regular expression split. As the split array ends on the
# final word, we simply use the fillvalue to even out the list and
# add the phrase break at the end.
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
if not word:
continue
trans = self.transliterator.transliterate(word)
if trans:
for term in trans.split(' '):
if term:
parts.append(QueryPart(term, word, wordnr))
parts.append(QueryPart(term, word, wordnr,
PENALTY_IN_TOKEN_BREAK[qmod.BreakType.TOKEN]))
query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
query.nodes[-1].btype = qmod.BreakType.WORD
query.nodes[-1].btype = qmod.BreakType(breakchar)
parts[-1].penalty = PENALTY_IN_TOKEN_BREAK[qmod.BreakType(breakchar)]
wordnr += 1
query.nodes[-1].btype = qmod.BreakType.PHRASE

for word, wrange in yield_words(parts, phrase_start):
words[word].append(wrange)
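
The pairing trick above is compact enough to deserve a standalone illustration (editorial sketch; the sample string is invented):

import re
from itertools import zip_longest

# re.split with a capturing group keeps the separators in the result:
# ['1', '-', '2', '-', '3', ' ', 'foo', ':', 'bar']
phrase_split = re.split('([ :-])', '1-2-3 foo:bar')

# Two references to one iterator pair each word with the break character
# that follows it; fillvalue=',' supplies the phrase break after the
# final word, which the split leaves without a trailing separator.
for word, breakchar in zip_longest(*[iter(phrase_split)]*2, fillvalue=','):
    print(repr(word), repr(breakchar))
# '1' '-' / '2' '-' / '3' ' ' / 'foo' ':' / 'bar' ','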
@@ -272,7 +297,7 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
""" Add tokens to query that are not saved in the database.
"""
for part, node, i in zip(parts, query.nodes, range(1000)):
if len(part.token) <= 4 and part[0].isdigit()\
if len(part.token) <= 4 and part.token.isdigit()\
and not node.has_tokens(i+1, qmod.TokenType.HOUSENUMBER):
query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
ICUToken(penalty=0.5, token=0,
9 changes: 8 additions & 1 deletion src/nominatim_api/search/query.py
Expand Up @@ -21,7 +21,13 @@ class BreakType(enum.Enum):
END = '>'
""" End of the query. """
PHRASE = ','
""" Break between two phrases. """
""" Hard break between two phrases. Address parts cannot cross hard
phrase boundaries."""
SOFT_PHRASE = ':'
""" Likely break between two phrases. Address parts should not cross soft
phrase boundaries. Soft breaks can be inserted by a preprocessor
that is analysing the input string.
"""
WORD = ' '
""" Break between words. """
PART = '-'
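
Because the enum values are the literal break characters, the splitter can map a separator straight to its break type by value lookup; a trimmed-down editorial sketch:

import enum

class BreakType(enum.Enum):   # trimmed-down copy for illustration
    PHRASE = ','
    SOFT_PHRASE = ':'
    WORD = ' '
    PART = '-'

assert BreakType(':') is BreakType.SOFT_PHRASE   # as in split_query above
assert BreakType('-') is BreakType.PART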
@@ -116,6 +122,7 @@ class TokenRange:
"""
start: int
end: int
penalty: Optional[float] = None

def __lt__(self, other: 'TokenRange') -> bool:
return self.end <= other.start
1 change: 1 addition & 0 deletions src/nominatim_api/search/token_assignment.py
@@ -27,6 +27,7 @@ class TypedRange:
qmod.BreakType.START: 0.0,
qmod.BreakType.END: 0.0,
qmod.BreakType.PHRASE: 0.0,
qmod.BreakType.SOFT_PHRASE: 0.0,
qmod.BreakType.WORD: 0.1,
qmod.BreakType.PART: 0.2,
qmod.BreakType.TOKEN: 0.4
2 changes: 2 additions & 0 deletions src/nominatim_db/tokenizer/icu_token_analysis.py
@@ -25,6 +25,8 @@ class ICUTokenAnalysis:

def __init__(self, norm_rules: str, trans_rules: str,
analysis_rules: Mapping[Optional[str], 'TokenAnalyzerRule']):
# additional break signs are not relevant during name analysis
norm_rules += ";[[:Space:][-:]]+ > ' ';"
self.normalizer = Transliterator.createFromRules("icu_normalization",
norm_rules)
trans_rules += ";[:Space:]+ > ' '"
34 changes: 34 additions & 0 deletions test/python/api/query_processing/test_split_japanese_phrases.py
@@ -0,0 +1,34 @@
# SPDX-License-Identifier: GPL-3.0-or-later
#
# This file is part of Nominatim. (https://nominatim.org)
#
# Copyright (C) 2025 by the Nominatim developer community.
# For a full list of authors see the git log.
"""
Tests for Japanese phrase splitting.
"""
from pathlib import Path

import pytest

from icu import Transliterator

import nominatim_api.search.query as qmod
from nominatim_api.query_preprocessing.config import QueryConfig
from nominatim_api.query_preprocessing import split_japanese_phrases

def run_preprocessor_on(query):
proc = split_japanese_phrases.create(QueryConfig().set_normalizer(None))

return proc(query)


@pytest.mark.parametrize('inp,outp', [('大阪府大阪市大阪', '大阪府:大阪市:大阪'),
('大阪府大阪', '大阪府:大阪'),
('大阪市大阪', '大阪市:大阪')])
def test_split_phrases(inp, outp):
query = [qmod.Phrase(qmod.PhraseType.NONE, inp)]

out = run_preprocessor_on(query)

assert out == [qmod.Phrase(qmod.PhraseType.NONE, outp)]
