Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Persian Dictionary #181

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ The currently supported dictionaries are:
* Basque - 'eu'
* Latvian - 'lv'
* Dutch - 'nl'
* Persian - 'fa'

Dictionary Creation and Updating
-------------------------------------------------------------------------------
Expand Down
58 changes: 57 additions & 1 deletion scripts/build_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Latvian Input: https://huggingface.co/datasets/RaivisDejus/latvian-text
Dutch Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.nl.gz
Italian Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.it.gz
Persian Input: https://drive.google.com/open?id=1mBeSSrEnajB2qxYs67tQbEDWmpRMZ0U0
Requirements:
The script requires more than the standard library to run in its
entirety. You will also need to install the NLTK package to build a
Expand Down Expand Up @@ -1051,6 +1052,59 @@ def clean_dutch(word_frequency, filepath_exclude, filepath_include, filepath_dic
return word_frequency


def clean_persian(word_frequency, filepath_exclude, filepath_include):
    """Clean a Persian word frequency list

    The cleaning steps run in this order:
        1. drop words containing any character outside the Persian letter set
        2. drop words containing ".."
        3. drop words whose count is at or below MINIMUM_FREQUENCY
        4. drop words listed in the exclude file
        5. add words from the include file that are not present yet, giving
           each a count of MINIMUM_FREQUENCY

    Args:
        word_frequency (Counter): The word counts to clean; modified in place
        filepath_exclude (str): Passed to `load_include_exclude` to get the words to remove
        filepath_include (str): Passed to `load_include_exclude` to get the words to add
    Returns:
        Counter: The same `word_frequency` object that was passed in
    """
    letters = set("آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهی")

    def _discard(words):
        # every word handed in was collected from the current keys, so a
        # plain pop cannot miss
        for word in words:
            word_frequency.pop(word)

    # remove words with invalid characters; the offenders are gathered into a
    # list first so the mapping is never resized while it is being iterated
    _discard([word for word in word_frequency if not set(word) <= letters])

    # remove ellipses
    _discard([word for word in word_frequency if ".." in word])

    # TODO: other possible fixes?

    # remove small numbers
    _discard([word for word, count in word_frequency.items() if count <= MINIMUM_FREQUENCY])

    # remove flagged misspellings (entries that are not present are ignored)
    for flagged in load_include_exclude(filepath_exclude):
        word_frequency.pop(flagged, None)

    # Add known missing words back in (ugh)
    for missing in load_include_exclude(filepath_include):
        if missing not in word_frequency:
            word_frequency[missing] = MINIMUM_FREQUENCY
            continue
        print("{} is already found in the dictionary! Skipping!".format(missing))

    return word_frequency


def _parse_args():
"""parse arguments for command-line usage"""
import argparse
Expand All @@ -1063,7 +1117,7 @@ def _parse_args():
"--language",
required=True,
help="The language being built",
choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv", "eu", "nl", "it"],
choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv", "eu", "nl", "it", "fa"],
)
parser.add_argument(
"-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
Expand Down Expand Up @@ -1154,6 +1208,8 @@ def _parse_args():
elif args.language == "nl":
dict_path = os.path.abspath("{}/levidromelist-dicts/dutch.txt".format(data_path))
word_frequency = clean_dutch(word_frequency, exclude_filepath, include_filepath, dict_path)
elif args.language == "fa":
word_frequency = clean_persian(word_frequency, exclude_filepath, include_filepath)

# export word frequency for review!
word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))
Expand Down
Empty file added scripts/data/fa_exclude.txt
Empty file.
Binary file added scripts/data/fa_full.json.gz
Binary file not shown.
Loading