From 8b84d6d2a71438259c1124d16fa33094218cf6f5 Mon Sep 17 00:00:00 2001
From: Roman Inflianskas <rominf@pm.me>
Date: Fri, 10 May 2024 16:59:21 +0300
Subject: [PATCH] RFC: Use stanza model for Finnish

---
 docker/PythonDockerfileDev | 8 ++++++--
 tools/tokenizer.py         | 3 ++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/docker/PythonDockerfileDev b/docker/PythonDockerfileDev
index f2b576b4..51f0f288 100644
--- a/docker/PythonDockerfileDev
+++ b/docker/PythonDockerfileDev
@@ -8,6 +8,8 @@ RUN apt-get update -y \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+
 RUN pip install -U --no-cache-dir \
         setuptools \
         wheel \
@@ -22,6 +24,8 @@ RUN pip install -U --no-cache-dir \
         bottle \
 #spacy
         spacy \
+#stanza integration for spacy (provides the spacy_stanza module; depends on stanza)
+        spacy-stanza \
 #chinese reading
         pinyin \
 #subtitle file parser
@@ -33,7 +37,6 @@ RUN python3 -m spacy download de_core_news_sm \
     && python3 -m spacy download nb_core_news_sm \
     && python3 -m spacy download es_core_news_sm \
     && python3 -m spacy download nl_core_news_sm \
-    && python3 -m spacy download fi_core_news_sm \
     && python3 -m spacy download fr_core_news_sm \
     && python3 -m spacy download it_core_news_sm \
     && python3 -m spacy download sv_core_news_sm \
@@ -48,5 +51,6 @@ RUN python3 -m spacy download de_core_news_sm \
     && python3 -m spacy download pt_core_news_sm \
     && python3 -m spacy download ro_core_news_sm \
     && python3 -m spacy download sl_core_news_sm \
-    && python3 -m spacy download xx_ent_wiki_sm
+    && python3 -m spacy download xx_ent_wiki_sm \
+    && python3 -c 'import stanza; stanza.download("fi", processors="tokenize,mwt,lemma")'
 
diff --git a/tools/tokenizer.py b/tools/tokenizer.py
index 4d681499..ff4d8ff1 100644
--- a/tools/tokenizer.py
+++ b/tools/tokenizer.py
@@ -22,6 +22,7 @@
 import shutil
 import subprocess
 from newspaper import Article
+import spacy_stanza
 
 # create emtpy sapce models
 multi_nlp = None
@@ -122,7 +123,7 @@ def getTokenizerDoc(language, words):
     if language == 'finnish':
         global finnish_nlp
         if finnish_nlp == None:
-            finnish_nlp = spacy.load("fi_core_news_sm", disable = ['ner', 'parser'])
+            finnish_nlp = spacy_stanza.load_pipeline("fi", processors="tokenize,lemma")
             finnish_nlp.add_pipe("custom_sentence_splitter", first=True)
         doc = finnish_nlp(words)