From 8b84d6d2a71438259c1124d16fa33094218cf6f5 Mon Sep 17 00:00:00 2001 From: Roman Inflianskas Date: Fri, 10 May 2024 16:59:21 +0300 Subject: [PATCH] RFC: Use stanza model for Finnish --- docker/PythonDockerfileDev | 8 ++++++-- tools/tokenizer.py | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docker/PythonDockerfileDev b/docker/PythonDockerfileDev index f2b576b4..51f0f288 100644 --- a/docker/PythonDockerfileDev +++ b/docker/PythonDockerfileDev @@ -8,6 +8,8 @@ RUN apt-get update -y \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* +RUN pip install torch --index-url https://download.pytorch.org/whl/cpu + RUN pip install -U --no-cache-dir \ setuptools \ wheel \ @@ -22,6 +24,8 @@ RUN pip install -U --no-cache-dir \ bottle \ #spacy spacy \ +#stanza integration for spacy + stanza \ #chinese reading pinyin \ #subtitle file parser @@ -33,7 +37,6 @@ RUN python3 -m spacy download de_core_news_sm \ && python3 -m spacy download nb_core_news_sm \ && python3 -m spacy download es_core_news_sm \ && python3 -m spacy download nl_core_news_sm \ - && python3 -m spacy download fi_core_news_sm \ && python3 -m spacy download fr_core_news_sm \ && python3 -m spacy download it_core_news_sm \ && python3 -m spacy download sv_core_news_sm \ @@ -48,5 +51,6 @@ RUN python3 -m spacy download de_core_news_sm \ && python3 -m spacy download pt_core_news_sm \ && python3 -m spacy download ro_core_news_sm \ && python3 -m spacy download sl_core_news_sm \ - && python3 -m spacy download xx_ent_wiki_sm + && python3 -m spacy download xx_ent_wiki_sm \ + && python3 -c 'import stanza; stanza.download("fi", processors="tokenize,mwt,lemma")' diff --git a/tools/tokenizer.py b/tools/tokenizer.py index 4d681499..ff4d8ff1 100644 --- a/tools/tokenizer.py +++ b/tools/tokenizer.py @@ -22,6 +22,7 @@ import shutil import subprocess from newspaper import Article +import spacy_stanza # create emtpy sapce models multi_nlp = None @@ -122,7 +123,7 @@ def getTokenizerDoc(language, words): if language == 'finnish': global finnish_nlp if finnish_nlp == None: - finnish_nlp = spacy.load("fi_core_news_sm", disable = ['ner', 'parser']) + finnish_nlp = spacy_stanza.load_pipeline("fi", processors="tokenize,lemma") finnish_nlp.add_pipe("custom_sentence_splitter", first=True) doc = finnish_nlp(words)