Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

czech: Add czech language support #128

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions pysbd/lang/czech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
import re
from pysbd.abbreviation_replacer import AbbreviationReplacer
from pysbd.between_punctuation import BetweenPunctuation
from pysbd.lang.common import Common, Standard
from pysbd.processor import Processor
from pysbd.utils import Text
from pysbd.punctuation_replacer import replace_punctuation
from pysbd.lists_item_replacer import ListItemReplacer


class Czech(Common, Standard):
"""
Based on slovak.py language as the languages are very close.
"""

iso_code = 'cs'

class ListItemReplacer(ListItemReplacer):

def add_line_break(self):
    """Run the list formatters enabled for Czech and return the resulting text.

    Alphabetical-list formatting is deliberately left out: it was causing a
    lot of problems with abbreviations that contain multiple periods and
    spaces, such as 'Company name s. r. o.', and disabling it was judged a
    reasonable tradeoff.
    """
    enabled_formatters = (
        self.format_roman_numeral_lists,
        self.format_numbered_list_with_periods,
        self.format_numbered_list_with_parens,
    )
    for run_formatter in enabled_formatters:
        run_formatter()
    return self.text

class AbbreviationReplacer(AbbreviationReplacer):
SENTENCE_STARTERS = []

def replace_period_of_abbr(self, txt, abbr):
    """Mask every period of ``abbr`` (plus its trailing one) wherever it occurs in ``txt``.

    This is a deliberately simple variant: all periods inside the abbreviation
    are replaced with '∯', not only the last one. Czech uses many multi-part
    abbreviations such as 'Company Name s. r. o.', so each inner period has to
    be handled as well.

    :param txt: text to search.
    :param abbr: abbreviation without its final period, e.g. 's. r. o'.
    :return: ``txt`` with each literal occurrence of ``abbr + '.'`` masked.
    """
    literal_form = "{}.".format(abbr)
    masked_form = "{}∯".format(abbr.replace(".", "∯"))
    return txt.replace(literal_form, masked_form)

class Abbreviation(Standard.Abbreviation):
    # Abbreviation data for Czech, overriding the lists inherited from
    # Standard.Abbreviation.

    # Lowercase abbreviations written without their final period. Multi-part
    # abbreviations are listed both with and without inner spaces
    # (e.g. 's. r. o' and 's.r.o').
    # NOTE(review): 'ing' and 'a. r. k' each appear twice, and 'atď.' carries a
    # trailing period unlike every other entry — confirm whether intentional.
    ABBREVIATIONS = ['č', 'no', 'nr', 's. r. o', 'ing', 'p', 'a. d', 'o. k', 'pol. pr', 'a. s. a. p', 'p. n. l', 'red', 'o.k', 'a.d', 'm.o', 'pol.pr', 'a.s.a.p', 'p.n.l', 'pp', 'sl', 'corp', 'plgr', 'tz', 'rtg', 'o.c.p', 'o. c. p', 'c.k', 'c. k', 'n.a', 'n. a', 'a.m', 'a. m', 'vz', 'i.b', 'i. b', 'ú.p.v.o', 'ú. p. v. o', 'bros', 'rsdr', 'doc', 'tu', 'ods', 'n.w.a', 'n. w. a', 'nár', 'pedg', 'paeddr', 'rndr', 'naprk', 'napřk', 'a.g.p', 'a. g. p', 'prof', 'pr', 'př', 'a.v', 'a. v', 'por', 'mvdr', 'nešp', 'u.s', 'u. s', 'kt', 'vyd', 'e.t', 'e. t', 'al', 'll.m', 'll. m', 'o.f.i', 'o. f. i', 'mr', 'apod', 'súkr', 'střed', 's.e.g', 's. e. g', 'sr', 'tvz', 'ind', 'var', 'etc', 'atd', 'n.o', 'n. o', 's.a', 's. a', 'např', 'a.i.i', 'a. i. i', 'a.k.a', 'a. k. a', 'konkr', 'čsl', 'odd', 'ltd', 't.z', 't. z', 'o.z', 'o. z', 'obv', 'obr', 'pok', 'tel', 'št', 'skr', 'phdr', 'xx', 'š.p', 'š. p', 'ph.d', 'ph. d', 'm.n.m', 'm. n. m', 'zz', 'roz', 'atď.', 'ev', 'v.sp', 'v. sp', 'drsc', 'mudr', 't.č', 't. č', 'el', 'os', 'co', 'r.o', 'r. o', 'str', 'p.a', 'p. a', 'zdravot', 'prek', 'gen', 'viď', 'dr', 'cca', 'p.s', 'p. s', 'zák', 'slov', 'arm', 'inc', 'max', 'd.c', 'k.o', 'a. r. k', 'd. c', 'k. o', 'a. r. k', 'soc', 'bc', 'zs', 'akad', 'sz', 'pozn', 'tr', 'nám', 'kol', 'csc', 'ul', 'sp', 'o.i', 'jr', 'zb', 'sv', 'tj', 'čs', 'tzn', 'příp', 'iv', 'hl', 'st', 'pod', 'vi', 'tis', 'stor', 'rozh', 'mld', 'atď', 'mgr', 'a.s', 'a. s', 'phd', 'z.z', 'z. z', 'judr', 'ing', 'hod', 'vs', 'písm', 's.r.o', 'min', 'ml', 'iii', 't.j', 't. j', 'spol', 'mil', 'ii', 'napr', 'resp', 'tzv']
    # Subset of the abbreviations above; presumably titles that stand before a
    # name — how the list is consumed is defined outside this file (TODO confirm).
    PREPOSITIVE_ABBREVIATIONS = ['st', 'p', 'dr', 'mudr', 'judr', 'ing', 'mgr', 'bc', 'drsc', 'doc', 'prof']
    # Subset of the abbreviations above; presumably ones that stand before a
    # number — verify against the base abbreviation replacer.
    NUMBER_ABBREVIATIONS = ['č', 'no', 'nr']

class BetweenPunctuation(BetweenPunctuation):
# Rubular: https://rubular.com/r/rImWbaYFtHHtf4
BETWEEN_CZECH_DOUBLE_QUOTES_REGEX = r'„(?>[^“\\]+|\\{2}|\\.)*“'
BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2 = r'\„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)\“'

def sub_punctuation_between_czech_double_quotes(self, txt):
    """Apply ``replace_punctuation`` to every Czech-quoted span („…“) in ``txt``.

    :param txt: text to process.
    :return: ``txt`` with each match of BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2
        replaced by the result of ``replace_punctuation``.
    """
    quoted_span_pattern = self.BETWEEN_CZECH_DOUBLE_QUOTES_REGEX_2
    return re.sub(quoted_span_pattern, replace_punctuation, txt)

def sub_punctuation_between_quotes_and_parens(self, txt):
    """Pass ``txt`` through every between-punctuation substitution in a fixed order.

    The Czech double-quote substitution runs last, after the other
    substitution steps.

    :param txt: text to process.
    :return: the text after all substitution steps have been applied.
    """
    ordered_steps = (
        self.sub_punctuation_between_single_quotes,
        self.sub_punctuation_between_single_quote_slanted,
        self.sub_punctuation_between_double_quotes,
        self.sub_punctuation_between_square_brackets,
        self.sub_punctuation_between_parens,
        self.sub_punctuation_between_quotes_arrow,
        self.sub_punctuation_between_em_dashes,
        self.sub_punctuation_between_quotes_slanted,
        self.sub_punctuation_between_czech_double_quotes,
    )
    for apply_step in ordered_steps:
        txt = apply_step(txt)
    return txt

class Processor(Processor):

def __init__(self, text, lang, char_span=False):
    """Forward ``text``, ``lang`` and ``char_span`` unchanged to the parent initializer."""
    super().__init__(text, lang, char_span)

def process(self):
    """Run the Czech processing pipeline over ``self.text`` and return the segments.

    Falsy text (empty string or None) is returned as-is. Otherwise newlines
    are turned into carriage returns, list items are formatted, the
    abbreviation/number/punctuation replacement steps run, three further rules
    are applied and the text is split into segments.

    :return: ``self.text`` unchanged when it is falsy, otherwise the result of
        ``split_into_segments()``.
    """
    if not self.text:
        return self.text

    self.text = self.text.replace('\n', '\r')

    # The language-specific ListItemReplacer is used here rather than a
    # generic one.
    self.text = self.lang.ListItemReplacer(self.text).add_line_break()

    self.replace_abbreviations()
    self.replace_numbers()
    self.replace_continuous_punctuation()
    self.replace_periods_before_numeric_references()

    trailing_rules = (
        self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
        self.lang.GeoLocationRule,
        self.lang.FileFormatRule,
    )
    self.text = Text(self.text).apply(*trailing_rules)
    return self.split_into_segments()

def replace_numbers(self):
    """Apply the language's number rules, then mask periods in dates, ordinals and Roman numerals.

    :return: the updated ``self.text``.
    """
    number_rules = self.lang.Numbers.All
    self.text = Text(self.text).apply(*number_rules)

    # Czech-specific period masking, in this order: dates, ordinals, Roman numerals.
    czech_period_maskers = (
        self.replace_period_in_czech_dates,
        self.replace_period_in_ordinal_numerals,
        self.replace_period_in_roman_numerals,
    )
    for mask_periods in czech_period_maskers:
        mask_periods()
    return self.text

def replace_period_in_ordinal_numerals(self):
    """Mask the period after a digit when a lowercase word follows it.

    A period directly after a digit and followed (after optional whitespace)
    by a lowercase letter is replaced with '∯' in ``self.text``, e.g.
    'na 10. pokus' becomes 'na 10∯ pokus'. The text is updated in place and
    nothing is returned.
    """
    # Rubular (original ASCII-only pattern): https://rubular.com/r/0HkmvzMGTqgWs6
    # A plain [a-z] lookahead misses Czech lowercase letters with diacritics,
    # so ordinals such as '2. části' or '5. řada' were left unmasked. The
    # accented lowercase letters of the Czech alphabet are listed explicitly.
    self.text = re.sub(
        r'(?<=\d)\.(?=\s*[a-záčďéěíňóřšťúůýž])', '∯', self.text)

def replace_period_in_roman_numerals(self):
    """Mask the period after a standalone Roman numeral built from I, V and X.

    A numeral at the start of the text or preceded by whitespace, followed by
    a period and then whitespace, has that period replaced with '∯', e.g.
    'XII. Pluku' becomes 'XII∯ Pluku'. Matching is case-insensitive. The text
    is updated in place and nothing is returned.
    """
    # Rubular: https://rubular.com/r/XlzTIi7aBRThSl
    # re.IGNORECASE must be passed as ``flags``: the fourth positional argument
    # of re.sub is ``count``, so passing the flag there capped the number of
    # substitutions at 2 and never enabled case-insensitive matching.
    self.text = re.sub(
        r'((\s+[VXI]+)|(^[VXI]+))(\.)(?=\s+)', r'\1∯', self.text,
        flags=re.IGNORECASE)

def replace_period_in_czech_dates(self):
    """Mask the period between a day number and a Czech month name.

    For every month form listed below, a period that directly follows a digit
    and precedes (after optional whitespace) that month name is replaced with
    '∯' in ``self.text``, e.g. '7. dubna' becomes '7∯ dubna'. The text is
    updated in place and nothing is returned.
    """
    nominative_forms = (
        'leden', 'únor', 'březen', 'duben', 'květen', 'červen', 'červenec',
        'srpen', 'září', 'říjen', 'listopad', 'prosinec',
    )
    genitive_forms = (
        'ledna', 'února', 'března', 'dubna', 'května', 'června', 'července',
        'srpna', 'října', 'listopadu', 'prosince',
    )
    # Rubular: https://rubular.com/r/dGLZqsbjcdJvCd
    pattern_template = r'(?<=\d)\.(?=\s*{month})'
    for month_name in nominative_forms + genitive_forms:
        month_pattern = pattern_template.format(month=month_name)
        self.text = re.sub(month_pattern, '∯', self.text)
4 changes: 3 additions & 1 deletion pysbd/languages.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from pysbd.lang.deutsch import Deutsch
from pysbd.lang.kazakh import Kazakh
from pysbd.lang.slovak import Slovak
from pysbd.lang.czech import Czech

LANGUAGE_CODES = {
'en': English,
Expand All @@ -46,7 +47,8 @@
'ja': Japanese,
'de': Deutsch,
'kk': Kazakh,
'sk': Slovak
'sk': Slovak,
'cs': Czech
}


Expand Down
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,8 @@ def kk_default_fixture():
def sk_default_fixture():
sk_segmenter = pysbd.Segmenter(language="sk", clean=False, char_span=False)
return sk_segmenter

@pytest.fixture()
def cz_default_fixture():
    """Provide a segmenter for language code 'cs' with ``clean`` and ``char_span`` disabled."""
    return pysbd.Segmenter(language="cs", clean=False, char_span=False)
23 changes: 23 additions & 0 deletions tests/lang/test_czech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
import pytest

# (input text, expected segments) pairs for the Czech segmenter. Every case
# below expects the whole input to come back as a single segment.
GOLDEN_CZ_RULES_TEST_CASES = [
# Multi-part abbreviations ('s. r. o.', 'např.', 'a.s.') inside one sentence.
("Jde o majite firmy ABTrade s. r. o., kteří stojí i za dalšími společnostmi, např. XYZCorp a.s.",
["Jde o majite firmy ABTrade s. r. o., kteří stojí i za dalšími společnostmi, např. XYZCorp a.s."]),
# Period inside Czech double quotes („…“).
("„Průzkumy beru na lehkou váhu. V podstatě mě to nezajímá,“ reagoval Zeman na průzkum agentury Focus.",
["„Průzkumy beru na lehkou váhu. V podstatě mě to nezajímá,“ reagoval Zeman na průzkum agentury Focus."]),
# Ordinal numeral followed by a lowercase word.
("Toto se mi podařilo až na 10. pokus, ale stálo to za to.",
["Toto se mi podařilo až na 10. pokus, ale stálo to za to."]),
# Roman numeral followed by a capitalised word.
("Jde o príslušníky XII. Pluku speciálního nasazení.",
["Jde o príslušníky XII. Pluku speciálního nasazení."]),
# Dates written as day number, period, month name.
("Společnost byla založena 7. dubna 2020, na smlouvě však figuruje datum 20. březen 2020.",
["Společnost byla založena 7. dubna 2020, na smlouvě však figuruje datum 20. březen 2020."]),
]


@pytest.mark.parametrize('text,expected_sents', GOLDEN_CZ_RULES_TEST_CASES)
def test_pl_sbd(cz_default_fixture, text, expected_sents):
    """Check that each golden Czech input segments into exactly the expected sentences.

    Segments are compared after stripping surrounding whitespace.
    """
    # NOTE(review): the 'pl' in the function name looks carried over from
    # another language's tests; kept so the existing test ID does not change.
    stripped_segments = [
        segment.strip() for segment in cz_default_fixture.segment(text)
    ]
    assert stripped_segments == expected_sents