diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4602f20 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/bin +/obj +/Properties \ No newline at end of file diff --git a/.vs/AsoSoftLibrary/DesignTimeBuild/.dtbcache.v2 b/.vs/AsoSoftLibrary/DesignTimeBuild/.dtbcache.v2 new file mode 100644 index 0000000..efba9ac Binary files /dev/null and b/.vs/AsoSoftLibrary/DesignTimeBuild/.dtbcache.v2 differ diff --git a/.vs/AsoSoftLibrary/FileContentIndex/548dea1d-6710-447d-84dc-a07492f54078.vsidx b/.vs/AsoSoftLibrary/FileContentIndex/548dea1d-6710-447d-84dc-a07492f54078.vsidx new file mode 100644 index 0000000..230f978 Binary files /dev/null and b/.vs/AsoSoftLibrary/FileContentIndex/548dea1d-6710-447d-84dc-a07492f54078.vsidx differ diff --git a/.vs/AsoSoftLibrary/FileContentIndex/5a079693-0b16-49f2-814e-cec1b525d720.vsidx b/.vs/AsoSoftLibrary/FileContentIndex/5a079693-0b16-49f2-814e-cec1b525d720.vsidx new file mode 100644 index 0000000..56bcd27 Binary files /dev/null and b/.vs/AsoSoftLibrary/FileContentIndex/5a079693-0b16-49f2-814e-cec1b525d720.vsidx differ diff --git a/.vs/AsoSoftLibrary/FileContentIndex/5e887efe-ac0a-43cc-a94e-355d326020f7.vsidx b/.vs/AsoSoftLibrary/FileContentIndex/5e887efe-ac0a-43cc-a94e-355d326020f7.vsidx new file mode 100644 index 0000000..6c4173c Binary files /dev/null and b/.vs/AsoSoftLibrary/FileContentIndex/5e887efe-ac0a-43cc-a94e-355d326020f7.vsidx differ diff --git a/.vs/AsoSoftLibrary/FileContentIndex/90b8f540-788f-47e8-81d9-020c5aea7fca.vsidx b/.vs/AsoSoftLibrary/FileContentIndex/90b8f540-788f-47e8-81d9-020c5aea7fca.vsidx new file mode 100644 index 0000000..9f6890e Binary files /dev/null and b/.vs/AsoSoftLibrary/FileContentIndex/90b8f540-788f-47e8-81d9-020c5aea7fca.vsidx differ diff --git a/.vs/AsoSoftLibrary/FileContentIndex/read.lock b/.vs/AsoSoftLibrary/FileContentIndex/read.lock new file mode 100644 index 0000000..e69de29 diff --git a/.vs/AsoSoftLibrary/v17/.futdcache.v1 b/.vs/AsoSoftLibrary/v17/.futdcache.v1 new file 
mode 100644 index 0000000..88f39e8 Binary files /dev/null and b/.vs/AsoSoftLibrary/v17/.futdcache.v1 differ diff --git a/.vs/AsoSoftLibrary/v17/.suo b/.vs/AsoSoftLibrary/v17/.suo new file mode 100644 index 0000000..509e808 Binary files /dev/null and b/.vs/AsoSoftLibrary/v17/.suo differ diff --git a/.vs/ProjectEvaluation/asosoftlibrary.metadata.v2 b/.vs/ProjectEvaluation/asosoftlibrary.metadata.v2 new file mode 100644 index 0000000..c75870f Binary files /dev/null and b/.vs/ProjectEvaluation/asosoftlibrary.metadata.v2 differ diff --git a/.vs/ProjectEvaluation/asosoftlibrary.projects.v2 b/.vs/ProjectEvaluation/asosoftlibrary.projects.v2 new file mode 100644 index 0000000..a2d7f56 Binary files /dev/null and b/.vs/ProjectEvaluation/asosoftlibrary.projects.v2 differ diff --git a/AsoSoft-logo.png b/AsoSoft-logo.png new file mode 100644 index 0000000..ed845dd Binary files /dev/null and b/AsoSoft-logo.png differ diff --git a/AsoSoftLibrary.csproj b/AsoSoftLibrary.csproj index 410028e..a6100b1 100644 --- a/AsoSoftLibrary.csproj +++ b/AsoSoftLibrary.csproj @@ -1,4 +1,4 @@ - + netcoreapp3.1 @@ -6,20 +6,59 @@ AsoSoft Class Library Aso Mahmudi AsoSoft Class Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish). + MIT + https://github.com/AsoSoft/AsoSoft-Library + AsoSoft-logo.png + kurdish normalization natural-language-processing + AsoSoft Library for the Kurdish language processing (ckb: Central branch of Kurdish). 
+Normalizer and Numeral Converter classes + https://github.com/AsoSoft/AsoSoft-Library + 2.0.0 + ReadMe.md - + + + + + + + + + + + + + + + + + + + True True - AsoSoftResources.resx + replaceFiles.resx - + PublicResXFileCodeGenerator - AsoSoftResources.Designer.cs + replaceFiles.Designer.cs + + + + True + \ + + + True + \ + + + diff --git a/AsoSoftLibrary.csproj.user b/AsoSoftLibrary.csproj.user new file mode 100644 index 0000000..17b4106 --- /dev/null +++ b/AsoSoftLibrary.csproj.user @@ -0,0 +1,9 @@ + + + + + + Designer + + + \ No newline at end of file diff --git a/AsoSoftLibrary.sln b/AsoSoftLibrary.sln new file mode 100644 index 0000000..f966da8 --- /dev/null +++ b/AsoSoftLibrary.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.2.32616.157 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AsoSoftLibrary", "AsoSoftLibrary.csproj", "{69039AA0-A7AD-4F12-B1B9-13263A9DC47F}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {95A3F2E7-0611-4D99-8A85-055D3FE5E265} + EndGlobalSection +EndGlobal diff --git a/AsoSoftNormalization.cs b/AsoSoftNormalization.cs deleted file mode 100644 index 771eb85..0000000 --- a/AsoSoftNormalization.cs +++ /dev/null @@ -1,311 
+0,0 @@ -using System.Collections.Generic; -using System.Text; -using System.Text.RegularExpressions; - -namespace AsoSoftLibrary -{ - public class AsoSoftNormalization - { - - private static string replaceByList(string text, List replaceList) - { - for (int i = 0; i < replaceList.Count; i += 2) - text = Regex.Replace(text, replaceList[i], replaceList[i + 1]); - return text; - } - - // ================= Non-Standard Fonts Conversion ================= - - // converts Kurdish text written in AliK fonts into Unicode standard - public static string AliK2Unicode(string text) => replaceByList(text, new List() { - "لاَ|لآ|لاً", "ڵا", - "لً|لَ|لأ", "ڵ", - "ة", "ە", - "ه" + "([^ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ]|$)", "هـ$1", - "ض", "چ", - "ث", "پ", - "ظ", "ڤ", - "ط", "گ", - "ك", "ک", - "ىَ|يَ|یَ|آ", "ێ", - "رِ", "ڕ", - "ؤ|وَ", "ۆ", - "ي|ى", "ی", - "ء", "\u200Cو", - "ِ", "", - "ذ", "ژ" - }); - - // converts Kurdish text written in AliWeb fonts into Unicode standard - public static string AliWeb2Unicode(string text) => replaceByList(text, new List() { - "لاَ|لآ|لاً", "ڵا", - "لَ|پ", "ڵ", - "ة", "ە", - "ه", "ھ", - "ه", "ھ", - "رِ|أ", "ڕ", - "ؤ|وَ", "ۆ", - "يَ|یَ", "ێ", - "ص", "ێ", - "ي", "ی", - "ط", "ڭ", //swap ط and گ - "گ", "ط", // - "ڭ", "گ", // - "ض", "چ", - "ث", "پ", - "ظ", "ڤ", - "ْ|ُ", "", - "ى", "*", - "ك", "ک", - "ذ", "ژ" - }); - - // converts Kurdish text written in KDylan fonts into Unicode standard - public static string Dylan2Unicode(string text) => replaceByList(text, new List() { - "لإ|لأ|لآ", "ڵا", - "ؤ|وَ", "ۆ", - "ة", "ە", - "ض", "ڤ", - "ص", "ڵ", - "ث", "ێ", - "ؤ", "ۆ", - "ه", "ھ", - "ك", "ک", - "ي|ى", "ی", - "ذ", "ڕ" - }); - - // converts Kurdish text written in Zarnegar fonts into Unicode standard - public static string Zarnegar2Unicode(string text) => replaceByList(text, new List() { - "لاٌ", "ڵا", - "ى|ي", "ی", - "یٌ", "ێ", - "ه‏", "ە", - "لٌ", "ڵ", - "رٍ", "ڕ", - "وٌ", "ۆ" - }); - - // ================= Normalization ================= - 
public static Dictionary LoadNormalizerReplaces(List files) - { - var output = new Dictionary(); - foreach (var file in files) - { - foreach (var item in file.Split('\n')) - { - var chOld = System.Convert.ToChar(System.Convert.ToUInt32(item.Split('\t')[0], 16)); - var chNew = ""; - foreach (var ch in item.Split('\t')[1].Split(' ')) - if (ch != "") - chNew += System.Convert.ToChar(System.Convert.ToUInt32(ch, 16)); - if (!output.ContainsKey(chOld)) - output.Add(chOld, chNew); - } - } - return output; - } - - static string Ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ" - + "\u064B-\u065F"; // Haraka - static string joiners = "بپتثجچحخسشصضطظعغفڤقکگلڵمنیهھێ"; - - // main Unicode Normalization for Central Kurdish - public static string NormalizeKurdish(string text) - { - return NormalizeKurdish(text, true, true, new Dictionary()); - } - public static string NormalizeKurdish(string text, bool IsOnlyKurdish, bool changeInitialR, Dictionary ReplaceList) - { - // Character-based replacement (ReplaceList and Private Use Area) - var CharList = new List(); - for (int i = 0; i < text.Length; i++) - if (!CharList.Contains(text[i])) - CharList.Add(text[i]); - foreach (var ch in CharList) - { - if (ReplaceList.ContainsKey(ch)) //ReplaceList - text = text.Replace(ch.ToString(), ReplaceList[ch]); - else if (ch > 57343 && ch < 63744) //Private Use Area - text = text.Replace(ch, '□'); // u25A1 White Square - } - - var Corrections = new List() { - //========= Zero-Width Non-Joiner - "[\uFEFF\u200C]+", "\u200C", //Standardize and remove dublicated ZWNJ - // remove unnecessary ZWNJ - "\u200C(?=(\\s|\\p{P}|$))", "", // ZWNJ + white spaces - "(? 
ماهـ - "([^" + joiners + "])(\u200D)([^" + joiners + "])", "$1$3", //remove unnecessary ZW-J - - //========= Tatweels (U+0640) - "\u0640{2,}", "\u0640", // merge - "\u0640" + "([" + Ku +"])", "$1", // delete unnecessary Tatweel - "([" + joiners + "])" + "\u0640", "$1", // delete unnecessary Tatweel - "(^|[^هئ])" + "\u0640", "$1-", // only we need Tatweel for final Heh and Hamza, others are dashes - }; - - // if the text is Monolingual (only Central Kurdish) - if (IsOnlyKurdish) - { - Corrections.AddRange(new List() { - //========= standard H, E, Y, K - "\u200C" + "و ", " و ", // شوێن‌و جێ => شوێن و جێ - "ه" + "\u200C", "ە", // Heh+ZWNJ => kurdish AE - "ه" + "([^" + Ku +"ـ]|$)", "ە$1", //final Heh looks like Ae - "ھ" + "([^" + Ku +"]|$)", "هـ$1", // final Heh Doachashmee - "ھ" , "ه", // non-final Heh Doachashmee - "ى|ي", "ی", // Alef maksura | Arabic Ye => Farsi ye - "ك", "ک", // Arabic Kaf => Farsi Ke - //"\u200C" + "دا" + "(?![" + Ku + @"]($|[ \t]))", "دا", // شوێن‌دا => شوێندا - //"(? بێ شوێن - - //========= errors from font conversion - "لاَ|لاً|لأ", "ڵا", - "(ی|ێ)" + "[\u064E\u064B]+", "ێ", //FATHA & FATHATAN - "(و|ۆ)" + "[\u064E\u064B]+", "ۆ", - "(ل|ڵ)" + "[\u064E\u064B]+", "ڵ", - "(ر|ڕ)" + "\u0650+", "ڕ", //KASRA - }); - //========= Initial r - if (changeInitialR) - Corrections.AddRange(new List() { "(? 
replaceByList(text, new List() { - "(?() { - "\\(\\(", "«", - "\\)\\)", "»", - "»", "\uF8FA", // temp replacement «x»eke - "\\)", "\uF8FB", //temp replacement - "([!.:;?،؛؟]+)(\\p{Pi})", "$1 $2", - "(\\p{P}+)(?![\\s\\p{P}])", "$1 ", // Seprate all punctuations - "\uF8FA", "»", // undo temp replacement - "\uF8FB", ")", // undo temp replacement - "([^ \\t\\p{P}])(\\p{P}+)", "$1 $2", // Seprate all punctuations - "(\\d) ([.|\u066B]) (?=\\d)", "$1$2", //DECIMAL SEPARATOR - "(\\d) ([,\u066C]) (?=\\d\\d\\d)", "$1$2", //THOUSANDS SEPARATOR - "(\\d) ([/\u060D]) (?=\\d)", "$1$2" //DATE SEPARATOR - }); - if (!seprateAllPunctuations) - { - text = replaceByList(text, new List() { - " ((\\p{Pe}|\\p{Pf})+)", "$1", // A ) B => A) B - "((\\p{Ps}|\\p{Pi})+) ", "$1", // A ( B => A (B - " ([!.:;?،؛؟]+)", "$1", // A ! => A! - }); - } - text = text.Replace('\uF8FD', '"'); //undo temp replacement - return text; - } - // Trim white spaces of a line - public static string TrimLine(string line) - { - line = Regex.Replace(line.Trim(), "[\u200B\u200C\uFEFF]+$", ""); - line = Regex.Replace(line.Trim(), "^[\u200B\u200C\uFEFF]+", ""); - return line.Trim(); - } - // Html Entity replacement for web crawled texts (e.g. 
"é" with "é") - public static string ReplaceHtmlEntity(string text) - { - return Regex.Replace(text, "&[a-zA-Z]+;", m => System.Net.WebUtility.HtmlDecode(m.Value)); - } - // Replace URLs and Emails with a certain word (improves language models) - public static string ReplaceUrlEmail(string text) - { - text = Regex.Replace(text, "([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+\\.[a-zA-Z]{2,5})", "EmailAddress"); - text = Regex.Replace(text, "((http[s]?|ftp)?://([\\w-]+\\.)+[\\w-]+)(/[\\w-~./?%+&=]*)?", "URL"); - return text; - } - // Character replacement for ANSI CodePage - public static string Char2CharReplacment(string text, Dictionary Codepage) - { - foreach (var item in Codepage) - text = text.Replace(item.Key, item.Value); - return text; - } - // Correction Table (word replacement ) - public static string Word2WordReplacement(string line, Dictionary wordReplacements) - { - return Regex.Replace(line, "(? wordReplacements.ContainsKey(m.Value) ? wordReplacements[m.Value] : m.Value); - } - - //================= have to be improved: ================= - - // fast but not accurate; we need a language detector. 
- public static string DeleteNonKurdish(string line, int KurdishRateThreshold) - { - float KuPersent = Regex.Matches(line, "[پچژگڵۆڕێڤەھ]").Count / (float)line.Length; - if (KuPersent < KurdishRateThreshold / 100.0) - line = ""; - return line; - } - - //embrace sentences with start/end tags - public static string MarkSentence(string line, string sentenceTag) - { - var tagStart = "<" + sentenceTag + ">"; - var tagEnd = ""; - - // ending punctuations !?؟ - line = Regex.Replace(line.TrimEnd(), "([!?؟]+)(?!$)", "$1 " + tagEnd + tagStart); - // full stop - line = Regex.Replace(line, "([\\w\u200C]{2,} ?\\.)(?!([0-9a-zA-Z.]|$))", "$1 " + tagEnd + tagStart); - - return tagStart + line + tagEnd; - } - } -} - -// ================= Regex Hints ================= -// docs.microsoft.com/en-us/dotnet/standard/base-types/character-classes-in-regular-expressions -// Lookbehind Positive: (?<=a)b -// Lookbehind Negative: (? 12345678 - text = Regex.Replace(text, "(? floatName(m.Groups[1].Value.ToString(), m.Groups[2].Value.ToString())); - - //convert remaining integr numbers - text = Regex.Replace(text, "([0-9]+)", - m => integerName(m.Groups[1].Value.ToString())); - - return text; - } - - private static string floatName(string integerPart, string decimalPart) - { - var point = " پۆینت " + Regex.Replace(decimalPart, "(?<=^|0)0", " سفر "); - point = Regex.Replace(point, "[0-9]", ""); - return integerName(integerPart) + point + integerName(decimalPart); - } - - private static string integerName(string inputInteger) - { - var output = ""; - if (inputInteger != "0") - { - string[] ones = { "", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ" }; - string[] teens = { "دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە" }; - string[] tens = { "", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد" }; - string[] hundreds = { "", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد" }; - string[] 
thousands = { "", " هەزار", " ملیۆن", " ملیار", " بلیۆن", " بلیار", " تریلیۆن", " تریلیار", " کوادرلیۆن" }; - var temp = inputInteger; - for (int i = 0; i < inputInteger.Length; i = i + 3) - { - string currentThree = Regex.Match(temp, "([0-9]{1,3})$").Result("$1"); - temp = temp.Substring(0, temp.Length - currentThree.Length); - currentThree = currentThree.PadLeft(3, '0'); - var C = Int32.Parse(currentThree[0].ToString()); - var X = Int32.Parse(currentThree[1].ToString()); - var I = Int32.Parse(currentThree[2].ToString()); - var conjunction1 = ((C != 0) && (X != 0 || I != 0)) ? " و " : ""; - var conjunction2 = (X != 0 && I != 0) ? " و " : ""; - if (X == 1) - currentThree = hundreds[C] + conjunction1 + teens[I]; - else - currentThree = hundreds[C] + conjunction1 + tens[X] + conjunction2 + ones[I]; - var M = (currentThree == "") ? "" : thousands[(int)(Math.Floor(i / 3.0))]; - currentThree += M; - var conjunction3 = (output == "") ? "" : " و "; - if (currentThree != "") - output = currentThree + conjunction3 + output; - } - output = output.Replace("یەک هەزار", "هەزار"); - } - else // if input number = 0 - output = "سفر"; - return output; - } - } -} \ No newline at end of file diff --git a/G2P.cs b/G2P.cs new file mode 100644 index 0000000..0bcc148 --- /dev/null +++ b/G2P.cs @@ -0,0 +1,449 @@ +// Automated Grapheme-to-Phoneme Conversion for Central Kurdish based on Optimality Theory +// Copyright (C) 2019 Aso Mahmudi, Hadi Veisi +// Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com) +// Demo: https://asosoft.github.io/g2p/ +// Source Code: https://github.com/AsoSoft/AsoSoft-Library +// Test-set: https://github.com/AsoSoft/Kurdish-G2P-dataset +// Paper: https://www.sciencedirect.com/science/article/abs/pii/S0885230821000292 +// Cite: +// @article{mahmudi2021automated, +// title={Automated grapheme-to-phoneme conversion for Central Kurdish based on optimality theory}, +// author={Mahmudi, Aso and Veisi, Hadi}, +// journal={Computer Speech \& Language}, +// volume={70}, +// 
pages={101222}, +// year={2021}, +// publisher={Elsevier} +// } + +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; + +namespace AsoSoftLibrary +{ + public static partial class AsoSoft + { + private static Dictionary History = new Dictionary(); + + /// Converts Central Kurdish text in standard Arabic script into syllabified phonemic Latin script (i.e. graphemes to phonems) + public static string G2P(string text, + bool convertNumbersToWord = false, + bool backMergeConjunction = true, + bool singleOutputPerWord = true) + { + var sb = new StringBuilder(); + text = UnifyNumerals(text, "en"); + if (convertNumbersToWord) + text = Number2Word(text); + + text = g2pNormalize(text.Trim()); + // + var ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهیێ" + "ۋۉۊڎڴݵݸ"; + var wordss = Regex.Matches(text, "([" + ku + "]+|[^" + ku + "]+)"); + for (int i = 0; i < wordss.Count; i++) + { + var word = wordss[i].Value; + if (Regex.IsMatch(word, "[" + ku + "]") && word != "و") + sb.Append(WordG2P(Regex.Replace(word, "[^" + ku + "]+", ""), singleOutputPerWord)); + else + sb.Append(word); + } + var output = sb.ToString(); + + // conjunction و + output = Regex.Replace(output, "(^|[?!.] 
?)" + "و", "$1ˈwe"); + if (!backMergeConjunction) + output = Regex.Replace(output, "و", "û"); + else + { + // if there are candidates preceeding conjunction (e.g ˈbîst¶ˈbîˈsit و) + + output = Regex.Replace(output, "(\\w+)¶(\\w+)¶(\\w+) و" + , "$1 و¶$2 و¶$3 و"); + output = Regex.Replace(output, "(\\w+)¶(\\w+) و" + , "$1 و¶$2 و"); + + // ('bi'ra + w => bi'raw) + output = Regex.Replace(output, "([aeêouûiî]) و", "$1w"); + // ('be'fir + û => 'bef'rû) + output = Regex.Replace(output, "(?<=\\w)ˈ([^aeêouûiî])i([^aeêouûiî]) و", "$1ˈ$2û"); + // ('ser + û => 'se'rû) + // ('sard + û => 'sar'dû) + // ('min + û => 'mi'nû) + // ('bi'gir + û => 'bi'gi'rû) + // ('gir'tin + û => 'gir'ti'nû) + output = Regex.Replace(output, "([^aeêouûiî]) و", "ˈ$1û"); + // if conjunction makes candidates the same (e.g ˈbîsˈtû¶ˈbîsˈtû) + output = Regex.Replace(output, "(?\\w+)¶\\k(?=\\s|$)", "$1"); + } + return output.TrimEnd(); + } + + + // chooses the best candidates for the word + private static string Evaluator(string gr, List Candidates) + { + var Output = new List(); + var evaluatedCandidates = EVAL(Candidates); + if (evaluatedCandidates.Count() > 0) + { + var LowestPenalt = evaluatedCandidates.First().Value; + foreach (var item in evaluatedCandidates) + if (item.Value < LowestPenalt + 5) + Output.Add(item.Key); + } + return (Output.Count() == 0) ? 
gr : string.Join('¶', Output); + } + + // Normalizion + private static string g2pNormalize(string text) + { + var s = new string[] + { + " +", " " , + "دٚ", "ڎ", + "گٚ", "ڴ", + @"(^|\s)چ بکە", "$1چبکە", + "َ", "ە", // فتحه + "ِ", "ی", // کسره + "ُ", "و", // ضمه + "ء", "ئ", // Hamza + "أ", "ئە", + "إ", "ئی", + "آ", "ئا", + "ظ|ذ|ض", "ز", + "ص|ث", "س", + "ط", "ت", + "ك", "ک", + "ي|ى", "ی", + "ه‌", "ە", + "ھ", "ه", + "ـ", "", // tatweel + "؟", "?", + "،", ",", + "؛", ";", + "\r", "", + }; + for (int i = 0; i < s.Length; i += 2) + text = Regex.Replace(text, s[i], s[i + 1]); + return text; + } + + private static string WordG2P(string gr, bool SingleOutputPerWord) + { + // Check history for speed up + if (!History.ContainsKey(gr)) + History.Add(gr, Evaluator(gr, Generator(gr))); + return SingleOutputPerWord ? History[gr].Split('¶')[0] : History[gr]; + } + + // GEN: generates all possible candidates: + // e.g. بوون => bûn, buwn, bwun + private static List Generator(string gr) + { + // Converting certain characters + foreach (var item in replaceFiles.G2PExceptions.Split('\n')) + gr = Regex.Replace(gr, item.Split('\t')[0], item.Split('\t')[1]); + + foreach (var item in replaceFiles.G2PCertain.Split('\n')) + gr = Regex.Replace(gr, item.Split('\t')[0], item.Split('\t')[1]); + + // Uncertainty in "و" and "ی" + var CandList1 = new List { "" }; + while (gr.Length > 0) + { + var temp = new List(); + if (Regex.IsMatch(gr, "^ووووو")) + { + temp.AddRange(new List + { "uwuwu", "uwuww", "uwwuw", "uwûw", + "wuwwu", "wuwuw", "wuwû", "wûww", "wwuwu", "wwuww", "wwûw", "wûwu", + "ûwwu", "ûwuw", "ûwû"}); + gr = gr.Substring(5); + } + else if (Regex.IsMatch(gr, "^وووو")) + { + temp.AddRange(new List + { "uwwu", "uwuw", "uwû", + "wwuw", "wwû", "wuww", "wuwu", "wûw", + "ûwu", "ûww", }); + gr = gr.Substring(4); + } + else if (Regex.IsMatch(gr, "^ووو")) + { + temp.AddRange(new List + { "wuw", "wwu", "wû", + "uww", "uwu", + "ûw" }); + gr = gr.Substring(3); + } + else if (Regex.IsMatch(gr, "^وو")) 
+ { + temp.AddRange(new List { "wu", "uw", "ww", "û" }); + gr = gr.Substring(2); + } + else if (Regex.IsMatch(gr, "^و")) + { + temp.AddRange(new List { "u", "w" }); + gr = gr.Substring(1); + } + else if (Regex.IsMatch(gr, "^یی")) + { + temp.AddRange(new List { "îy", "yî" }); + gr = gr.Substring(2); + } + else if (Regex.IsMatch(gr, "^ی")) + { + temp.AddRange(new List { "y", "î" }); + gr = gr.Substring(1); + } + else + { + temp.Add(gr[0].ToString()); + gr = gr.Substring(1); + } + + var Count = CandList1.Count; + var TempList = new List(); + foreach (var item in CandList1) + TempList.Add(item); + CandList1.Clear(); + for (int i = 0; i < Count; i++) + { + for (int j = 0; j < temp.Count; j++) + { + var WW = Regex.IsMatch(temp[j], "^ww"); + var IsPreviousVowel = Regex.IsMatch(TempList[i], "[aeêouûiîüȯė]$"); + var IsNowVowel = Regex.IsMatch(temp[j], "^[aeêouûiîüȯė]"); + var ConsonantBeforeWW = !IsPreviousVowel && WW; + var hiatus = IsPreviousVowel && IsNowVowel; + if (!hiatus && !ConsonantBeforeWW) + CandList1.Add(TempList[i] + temp[j]); + } + } + } + // Adding "i" between Consonant Clusters + var Candidates = iInsertion(CandList1); + + // ======= Syllabification for each candidate + var OutputCandidates = Syllabification(Candidates); + + // for speed up: remove candidates that has 1) syllable without vowel or 2) more than 3 consonants in coda + var cCount = OutputCandidates.Count; + if(cCount > 1) + { + for (int i = cCount - 1; i > -1; i--) + if (Regex.IsMatch(OutputCandidates[i], "ˈ[^aeêouûiîüȯė]+(ˈ|$)") + || Regex.IsMatch(OutputCandidates[i], "[aeêouûiîüȯė][^aeêouûiîüȯėˈ]{4,}")) + OutputCandidates.RemoveAt(i); + } + + return OutputCandidates; + } + + // insertion of hidden /i/ vowel + // e.g. 
brd => bird, brid, birid + private static List iInsertion(List Cands) + { + var Candidates = new List(); + for (int i = 0; i < Cands.Count; i++) + { + var ThisCand = new List(); + if (!string.IsNullOrEmpty(Cands[i])) + { + ThisCand.Add(Cands[i][0].ToString()); + for (int j = 1; j < Cands[i].Length; j++) + { + var Count = ThisCand.Count; + var TempList = new List(); + foreach (var item in ThisCand) + TempList.Add(item); + ThisCand.Clear(); + for (int k = 0; k < Count; k++) + { + ThisCand.Add(TempList[k] + Cands[i][j]); + if (Regex.IsMatch(Cands[i].Substring(j - 1, 2), @"[^aeêouûiîüȯė][^aeêouûiîüȯė]")) + ThisCand.Add(TempList[k] + "i" + Cands[i][j]); + } + } + } + else + ThisCand.Add(Cands[i]); + foreach (var item in ThisCand) + Candidates.Add(item); + + } + return Candidates; + } + + // Syllabification of candidates + // e.g. dexom => ˈdeˈxom + private static List Syllabification(List Candidates) + { + var cCount = Candidates.Count; + for (int i = 0; i < cCount; i++) + { + // Onset C(C)V + Candidates[i] = Regex.Replace(Candidates[i], + "([^aeêouûiîȯėwy][wy]|[^aeêouûiîȯė])([aeêouûiîȯė])", "ˈ$1$2"); + // if no ˈ at beginig (grˈtin => ˈgrˈtin) + Candidates[i] = Regex.Replace(Candidates[i], + "^([^ˈ])", "ˈ$1"); + // add candidate ( 'be'sye => + 'bes'ye) + if (Regex.IsMatch(Candidates[i], "[aeêouûiîȯė][^aeêouûiîȯė]?ˈ[^aeêouûiîȯėwy][wy]")) + Candidates.Add(Regex.Replace(Candidates[i], "([aeêouûiîȯė][^aeêouûiîȯė]?)ˈ([^aeêouûiîȯėwy])([wy])", "$1$2ˈ$3")); + } + return Candidates; + } + + // EVAL: specifies a penalty number for each syllabified candidate + private static Dictionary EVAL(List Candidates) + { + var output = new Dictionary(); + if (Candidates.Count > 0) + { + var Penalty = new Dictionary(); + for (int i = 0; i < Candidates.Count; i++) + { + var P = 0; + // ================= types of penalties ============ + // Complex Onset + P += Regex.Matches(Candidates[i], "ˈ([^aeêouûiîȯėˈ]{2,}[wy]|[^aeêouûiîȯėˈ]+[^wy])[aeêouûiîȯė]").Count * 20; + + // Complex Coda + if 
(Candidates[i] != "ˈpoynt") + P += Regex.Matches(Candidates[i], "[aeêouûiîȯė][^aeêouûiîȯėˈ]{3}").Count * 10; + + P += Regex.Matches(Candidates[i], "[^aeêouûiîȯėˈ][wy][aeêouûiîȯė][wy][^aeêouûiîȯėˈ]").Count * 20; + + // SSP: ascending Sonority in coda + var codas = Regex.Matches(Candidates[i], "(?<=[aeêouûiîȯė])[^aeêouûiîȯėˈ]{2,}"); + foreach (var coda in codas) + { + var chars = coda.ToString(); + for (int j = 0; j < chars.Length - 1; j++) + if (SonorityIndex(chars[j]) <= SonorityIndex(chars[j + 1])) + P += 10; + } + // DEP: i insertion + P += Regex.Matches(Candidates[i], "i").Count * 2; + //=========================== + + P += Regex.Matches(Candidates[i], "kˈr").Count * 3; + + // ('kurd'si'tan => 'kur'dis'tan) + P += Regex.Matches(Candidates[i], "[^aeêouûiîȯėˈ]ˈsiˈtaˈ?n").Count * 3; + + //"(kewt|newt|ḧewt|rext|sext|dest|pest|řast|mest|pişt|wîst|hest|bîst|heşt|şest)" + // suffix /it/ and /im/ ('sert => 'se'rit) ('xewt !! 'xe'wit / 'xewt) + if (!Regex.IsMatch(Candidates[i], + "(rift|neft|kurt|girt|xirt|germ|term|port)")) + P += Regex.Matches(Candidates[i], "[aeêouûiîȯė]([^aeêouûiîyȯėˈ]m|[^aeêouûiîysşxwˈ]t)$").Count * 3; + + // (ˈdyu/ => ˈdîw) and (ˈkwiř => ˈkuř) + P += Regex.Matches(Candidates[i], "yu").Count * 5; + P += Regex.Matches(Candidates[i], "uy").Count * 5; + P += Regex.Matches(Candidates[i], "yi").Count * 5; + P += Regex.Matches(Candidates[i], "iˈ?y").Count * 5; // bes'ti'yan + P += Regex.Matches(Candidates[i], "wu").Count * 5; + P += Regex.Matches(Candidates[i], "uˈ?w").Count * 2; // 'bi'bu'wî + P += Regex.Matches(Candidates[i], "wi").Count * 2; + P += Regex.Matches(Candidates[i], "iw").Count * 2; + P += Regex.Matches(Candidates[i], "wû").Count * 5; + + // ˈdiˈrêˈjayˈyî => ˈdiˈrêˈjaˈyîy (not heyyî and teyyî) + // ˈdiˈrêjˈyî => ˈdiˈrêˈjîy + // (NOT ˈḧeyˈyî teyˈyî") + P += Regex.Matches(Candidates[i], "[^aeêouûiîȯė]ˈyî").Count * 3; + + // [CV]'CyV => [CV]C'yV (ˈdiˈrêˈjyî => ˈdiˈrêˈjîy) ('bes'tye'tî => 'best'ye'tî) + P += Regex.Matches(Candidates[i], "(? 
CC'yV (bir'dyan => bird'yan) ˈswênˈdyan + P += Regex.Matches(Candidates[i], "[^aeêouûiî]ˈ[^aeêouûiî][y][aeêouûî]").Count * 2; + + // twîˈwur => tu'yûr + P += Regex.Matches(Candidates[i], "[^aeêouûiî]wîˈw").Count * 3; + //=========================== + // Cix (řê'kix'raw => řêk'xi'raw + P += Regex.Matches(Candidates[i], "[^aeêouûiî]ixˈ").Count * 2; + + // ^'hełC' => ^'heł'C + P += Regex.Matches(Candidates[i], "^ˈhe(ł[^aeêouûiîˈ]ˈ|ˈłi)").Count * 3; + + // (he'jarn => 'he'ja'rin) + P += Regex.Matches(Candidates[i], "rn").Count * 5; + + // ('xawn => 'xa'win) ('pyawn => pya'win) + P += Regex.Matches(Candidates[i], "[aêoûî][w][^aeêouûiîˈ]").Count * 5; + //=========================== + + // ('lab'ri'di'nî => 'la'bir'di'nî) + P += Regex.Matches(Candidates[i], "[aeêouûiî][^aeêouûiîˈ]ˈriˈ").Count * 5; + // + // 'ser'nic, 'dek'rid, gir'fit => 'se'rinc, 'de'kird, 'gi'rift (NOT gir'tin) + var pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])"); + if (pat.Success) + { + var C = Regex.Replace(pat.Value, "[iˈ]", ""); + if (SonorityIndex(C[1]) > SonorityIndex(C[2])) + P += 3; // + } + // ('sern'cê => 'se'rin'cê) + pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])"); + if (pat.Success) + { + var C = Regex.Replace(pat.Value, "[iˈ]", ""); + if (SonorityIndex(C[0]) > SonorityIndex(C[1])) + P += 3; + } + // ('ser'ni'cê => 'se'rin'cê) + pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])iˈ([^aeêouûiîˈ])"); + if (pat.Success) + { + var C = Regex.Replace(pat.Value, "[iˈ]", ""); + if (SonorityIndex(C[0]) > SonorityIndex(C[1]) && SonorityIndex(C[1]) > SonorityIndex(C[2])) + P += 3; + } + // ('gi'rit'nê => 'gir'ti'nê) ('ku'şit'ne => 'kuş'ti'ne) + pat = Regex.Match(Candidates[i], "[aeêouûiî]ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])"); + if (pat.Success) + { + var C = Regex.Replace(pat.Value, "[aeêouûiîˈ]", ""); + if (SonorityIndex(C[2]) >= SonorityIndex(C[1])) + P += 3; + } + Penalty.Add(Candidates[i], P); + } + output = 
Penalty.OrderBy(x => x.Value).ToDictionary(x => x.Key, x => x.Value); + } + return output; + } + + // Sonority Sequencing Principle in EVAL needs phoneme ranking + private static int SonorityIndex(char ch) + { + var c = ch.ToString(); + if (Regex.IsMatch(c, "[wy]")) // Approximant + return 6; + if (Regex.IsMatch(c, "[lłrř]")) // lateral + return 5; + if (Regex.IsMatch(c, "[mn]")) // nasal + return 4; + if (Regex.IsMatch(c, "[fvszşjxẍƹḧh]")) // fricative + return 3; + if (Regex.IsMatch(c, "[cç]")) // affricate + return 2; + else // stop + return 1; + } + + // for tests + public static Dictionary AllCandidates(string grapheme) + { + return EVAL(Generator(g2pNormalize(grapheme))); + } + } +} \ No newline at end of file diff --git a/Normalize.cs b/Normalize.cs new file mode 100644 index 0000000..8072a49 --- /dev/null +++ b/Normalize.cs @@ -0,0 +1,373 @@ +// Automated Kurdish Text Normalization خاوێن کردنی ئۆتۆماتیکی دەقی کوردی +// Copyright (C) 2019 Aso Mahmudi, Hadi Veisi, Mohammad MohammadAmini, Hawre Hosseini +// Developer and Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com) + +// Source Code: https://github.com/AsoSoft/AsoSoft-Library +// Paper: https://www.researchgate.net/publication/333729065 +// Cite: +// @inproceedings{mahmudi2019automated, +// title={Automated Kurdish Text Normalization}, +// author={Mahmudi, Aso and Veisi, Hadi and MohammadAmini, Mohammad and Hosseini, Hawre}, +// booktitle={The Second International Conference on Kurdish and Persian Languages and Literature}, +// year={2019} +// } + +using System.Collections.Generic; +using System.Text; +using System.Text.RegularExpressions; + +namespace AsoSoftLibrary +{ + public static partial class AsoSoft + { + + static Dictionary DeepReplacements = LoadNormalizerReplaces(replaceFiles.NormalizerDeep); + static Dictionary additionalReplacements = LoadNormalizerReplaces(replaceFiles.NormalizerAdditional); + + // ================= Converting Non-Standard Fonts ================= + + /// Converts Kurdish text 
written in AliK fonts into Unicode standard + public static string AliK2Unicode(string text) => replaceByList(text, normalizationReplaces["AliK2Unicode"]); + + /// Converts Kurdish text written in AliWeb fonts into Unicode standard + public static string AliWeb2Unicode(string text) => replaceByList(text, normalizationReplaces["AliWeb2Unicode"]); + + /// Converts Kurdish text written in KDylan fonts into Unicode standard + public static string Dylan2Unicode(string text) => replaceByList(text, normalizationReplaces["Dylan2Unicode"]); + + /// Converts Kurdish text written in Zarnegar fonts into Unicode standard + public static string Zarnegar2Unicode(string text) => replaceByList(text, normalizationReplaces["Zarnegar2Unicode"]); + + static string Ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ" + + "\u064B-\u065F"; // Haraka + static string joiners = "ئبپتثجچحخسشصضطظعغفڤقکكگلڵمنیيهھێ"; + private static readonly Dictionary> normalizationReplaces = new Dictionary> + { + {"NormalizeKurdish1", new List() { + //========= Tatweels (U+0640) + "\u0640{2,}", "\u0640", // merge + $"(?<=[{joiners}])\u0640(?=[{Ku}])", "", // delete unnecessary tatweel e.g. هـا to ها + // replace tatweel nonadjacent to Kurdish letters with dash + $"(?<=[{joiners}])\u0640", "\uF640", // temporal preserve + $"\u0640(?=[{Ku}])", "\uF640", // temporal preserve + "\u0640", "-", + "\uF640", "\u0640", + + //========= Zero-Width Non-Joiner + "[\uFEFF\u200C]+", "\u200C", //Standardize and remove dublicated ZWNJ + // remove unnecessary ZWNJ + "\u200C(?=(\\s|\\p{P}|$))", "", // ZWNJ + white spaces + $"(? 
ماهـ + $"(?() { + //========= standard H, E, Y, K + "ه" + "\u200C", "ە", // Heh+ZWNJ => kurdish AE + "ه" + "(?!([" + Ku +"ـ]|$))", "ە", //final Heh looks like Ae + "ھ" + "(?!([" + Ku +"]|$))", "هـ", // final Heh Doachashmee + "ھ" , "ه", // non-final Heh Doachashmee + "ى|ي", "ی", // Alef maksura | Arabic Ye => Farsi ye + "ك", "ک", // Arabic Kaf => Farsi Ke + "\u200C" + "و ", " و ", // شوێن‌و جێ => شوێن و جێ + //"\u200C" + "دا" + "(?![" + Ku + @"]($|[ \t]))", "دا", // شوێن‌دا => شوێندا + //"(? بێ شوێن + + //========= errors from font conversion + "لاَ|لاً|لأ", "ڵا", + "(ی|ێ)" + "[\u064E\u064B]+", "ێ", //FATHA & FATHATAN + "(و|ۆ)" + "[\u064E\u064B]+", "ۆ", + "(ل|ڵ)" + "[\u064E\u064B]+", "ڵ", + "(ر|ڕ)" + "\u0650+", "ڕ", //KASRA + }}, + {"NormalizeKurdish3", new List() { + "(?() { + "لاَ|لآ|لاً", "ڵا", + "لً|لَ|لأ", "ڵ", + "ة", "ە", + "ه" + "(?!([ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ]|$))", "هـ", + "ض", "چ", + "ث", "پ", + "ظ", "ڤ", + "ط", "گ", + "ك", "ک", + "ىَ|يَ|یَ|آ", "ێ", + "رِ", "ڕ", + "ؤ|وَ", "ۆ", + "ي|ى", "ی", + "ء", "\u200Cو", + "ِ", "", + "ذ", "ژ" + }}, + {"AliWeb2Unicode", new List() { + "لاَ|لآ|لاً", "ڵا", + "لَ|پ", "ڵ", + "ة", "ە", + "ه", "ھ", + "ه", "ھ", + "رِ|أ", "ڕ", + "ؤ|وَ", "ۆ", + "يَ|یَ", "ێ", + "ص", "ێ", + "ي", "ی", + "ط", "ڭ", //swap ط and گ + "گ", "ط", // + "ڭ", "گ", // + "ض", "چ", + "ث", "پ", + "ظ", "ڤ", + "ْ|ُ", "", + "ى", "*", + "ك", "ک", + "ذ", "ژ" + }}, + {"Dylan2Unicode", new List() { + "لإ|لأ|لآ", "ڵا", + "ؤ|وَ", "ۆ", + "ة", "ە", + "ض", "ڤ", + "ص", "ڵ", + "ث", "ێ", + "ؤ", "ۆ", + "ه", "ھ", + "ك", "ک", + "ي|ى", "ی", + "ذ", "ڕ" + }}, + {"Zarnegar2Unicode", new List() { + "لاٌ", "ڵا", + "ى|ي", "ی", + "یٌ", "ێ", + "ه‏", "ە", + "لٌ", "ڵ", + "رٍ", "ڕ", + "وٌ", "ۆ" + }}, + {"SeperateDigits", new List() { + "(?() { + "\\(\\(", "«", + "\\)\\)", "»", + "»", "\uF8FA", // temp replacement «x»eke + "\\)", "\uF8FB", //temp replacement + "([!.:;?،؛؟]+)(\\p{Pi})", "$1 $2", + "(\\p{P}+)(?![\\s\\p{P}])", "$1 ", // Seprate all punctuations + 
"\uF8FA", "»", // undo temp replacement + "\uF8FB", ")", // undo temp replacement + "(?() { + " ((\\p{Pe}|\\p{Pf})+)", "$1", // A ) B => A) B + "((\\p{Ps}|\\p{Pi})+) ", "$1", // A ( B => A (B + " ([!.:;?،؛؟]+)", "$1", // A ! => A! + }} + }; + + private static string replaceByList(string text, List replaceList) + { + for (int i = 0; i < replaceList.Count; i += 2) + text = Regex.Replace(text, replaceList[i], replaceList[i + 1]); + return text; + } + // ================= Normalization ================= + private static Dictionary LoadNormalizerReplaces(string file) + { + var output = new Dictionary(); + foreach (var item in file.Trim().Split('\n')) + { + var chOld = System.Convert.ToChar(System.Convert.ToUInt32(item.Split('\t')[0], 16)); + var chNew = ""; + foreach (var ch in item.Split('\t')[1].Split(' ')) + if (ch != "") + chNew += System.Convert.ToChar(System.Convert.ToUInt32(ch, 16)); + if (!output.ContainsKey(chOld)) + output.Add(chOld, chNew); + } + return output; + } + + /// Main Unicode Normalization for Central Kurdish + public static string Normalize(string text) + { + return Normalize(text, true, true, true, true, new Dictionary()); + } + + /// Unicode Normalization for Central Kurdish + public static string Normalize(string text, + bool isOnlyKurdish, + bool changeInitialR, + bool deepUnicodeCorrectios, + bool additionalUnicodeCorrections, + Dictionary usersReplaceList) + { + var replaces = new Dictionary(); + // Character-based replacement (ReplaceList and Private Use Area) + var CharList = new List(); + for (int i = 0; i < text.Length; i++) + if (!CharList.Contains(text[i])) + CharList.Add(text[i]); + + if(deepUnicodeCorrectios) + foreach (var item in DeepReplacements) + if(CharList.Contains(item.Key)) + replaces.Add(item.Key, item.Value); + if (additionalUnicodeCorrections) + foreach (var item in additionalReplacements) + if(CharList.Contains(item.Key) && !replaces.ContainsKey(item.Key)) + replaces.Add(item.Key, item.Value); + foreach (var item in 
usersReplaceList) + if (CharList.Contains(item.Key) && !replaces.ContainsKey(item.Key)) + replaces.Add(item.Key, item.Value); + + foreach (var ch in CharList) + { + if (replaces.ContainsKey(ch)) //ReplaceList + text = text.Replace(ch.ToString(), replaces[ch]); + else if (ch > 57343 && ch < 63744) //Private Use Area + text = text.Replace(ch, '□'); // u25A1 White Square + } + + text = replaceByList(text, normalizationReplaces["NormalizeKurdish1"]); + + // if the text is Monolingual (only Central Kurdish) + if (isOnlyKurdish) + { + text = replaceByList(text, normalizationReplaces["NormalizeKurdish2"]); + //========= Initial r + if (changeInitialR) + text = replaceByList(text, normalizationReplaces["NormalizeKurdish3"]); + } + return text; + } + + // ===== Unifying Numerals ===== + private static readonly string[] digits = new string[]{ + "۰", "٠", "0", + "۱", "١", "1", + "۲", "٢", "2", + "۳", "٣", "3", + "۴", "٤", "4", + "۵", "٥", "5", + "۶", "٦", "6", + "۷", "٧", "7", + "۸", "٨", "8", + "۹", "٩", "9", }; + public static string UnifyNumerals(string text, string NumeralType) + { + for (int i = 0; i < digits.Length; i += 3) + { + if (NumeralType == "en") + text = Regex.Replace(text, digits[i] + "|" + digits[i + 1], digits[i + 2]); + else if (NumeralType == "ar") + text = Regex.Replace(text, digits[i] + "|" + digits[i + 2], digits[i + 1]); + } + return text; + } + + /// Seperate digits from words (e.g. 
replacing "12a" with "12 a") + public static string SeperateDigits(string text) => replaceByList(text, normalizationReplaces["SeperateDigits"]); + + /// Normalize Punctuations + public static string NormalizePunctuations(string text, bool seprateAllPunctuations) + { + text = text.Replace('"', '\uF8FD'); //temp replacement + text = replaceByList(text, normalizationReplaces["NormalizePunctuations1"]); + if (!seprateAllPunctuations) + { + text = replaceByList(text, normalizationReplaces["NormalizePunctuations2"]); + } + text = text.Replace('\uF8FD', '"'); //undo temp replacement + return text; + } + + + /// Trim white spaces of a line + public static string TrimLine(string line) + { + line = Regex.Replace(line.Trim(), "[\u200B\u200C\uFEFF]+$", ""); + line = Regex.Replace(line.Trim(), "^[\u200B\u200C\uFEFF]+", ""); + return line.Trim(); + } + + /// HTML Entity replacement for web crawled texts (e.g. "é" with "é") + public static string ReplaceHtmlEntity(string text) + { + return Regex.Replace(text, "&[a-zA-Z]+;", m => System.Net.WebUtility.HtmlDecode(m.Value)); + } + + /// Replace URLs and Emails with a certain word (improves language models) + public static string ReplaceUrlEmail(string text) + { + text = Regex.Replace(text, "([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+\\.[a-zA-Z]{2,5})", "EmailAddress"); + text = Regex.Replace(text, "((http[s]?|ftp)?://([\\w-]+\\.)+[\\w-]+)(/[\\w-~./?%+&=]*)?", "URL"); + return text; + } + + /// Character replacement for ANSI CodePage + public static string Char2CharReplacment(string text, Dictionary Codepage) + { + foreach (var item in Codepage) + text = text.Replace(item.Key, item.Value); + return text; + } + + /// Correction Table (word replacement ) + public static string Word2WordReplacement(string line, Dictionary wordReplacements) + { + return Regex.Replace(line, "(? wordReplacements.ContainsKey(m.Value) ? 
wordReplacements[m.Value] : m.Value); + } + + //================= have to be improved: ================= + + /// Delete non-CK lines (fast but not accurate; we need a language detector.) + public static string DeleteNonKurdish(string line, int KurdishRateThreshold) + { + float KuPersent = Regex.Matches(line, "[پچژگڵۆڕێڤەھ]").Count / (float)line.Length; + if (KuPersent < KurdishRateThreshold / 100.0) + line = ""; + return line; + } + + /// Embraces sentences with start/end tags built from sentenceTag: the whole line is wrapped, and an end tag followed by a start tag is inserted after every run of !?؟ that is not at the end of the line and after a full stop that follows a word of two or more characters. + public static string MarkSentence(string line, string sentenceTag) + { + var tagStart = "<" + sentenceTag + ">"; + var tagEnd = "</" + sentenceTag + ">"; // closing tag; it used to be an empty string, so sentences were opened but never closed + + // ending punctuations !?؟ + line = Regex.Replace(line.TrimEnd(), "([!?؟]+)(?!$)", "$1 " + tagEnd + tagStart); + // full stop + line = Regex.Replace(line, "([\\w\u200C]{2,} ?\\.)(?!([0-9a-zA-Z.]|$))", "$1 " + tagEnd + tagStart); + + return tagStart + line + tagEnd; + } + } +} + +// ================= Regex Hints ================= +// docs.microsoft.com/en-us/dotnet/standard/base-types/character-classes-in-regular-expressions +// Lookbehind Positive: (?<=a)b +// Lookbehind Negative: (? 12345678 + text = Regex.Replace(text, "(? 
floatName(m.Groups[1].Value.ToString(), m.Groups[2].Value.ToString())); + + //convert remaining integer numbers + text = Regex.Replace(text, "([0-9]+)", + m => integerName(m.Groups[1].Value.ToString())); + + return text; + } + + private static string floatName(string integerPart, string decimalPart) + { + var point = " پۆینت " + Regex.Replace(decimalPart, "(?<=^|0)0", " سفر "); + point = Regex.Replace(point, "[0-9]", ""); + return integerName(integerPart) + point + integerName(decimalPart); + } + + // Spells out a string of decimal digits in Central Kurdish words, working on groups of three digits from the right. + // Returns "سفر" for "0". A thousands group that is exactly 1 is read as "هەزار" instead of "یەک هەزار". + private static string integerName(string inputInteger) + { + var output = ""; + if (inputInteger != "0") + { + string[] ones = { "", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ" }; + string[] teens = { "دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە" }; + string[] tens = { "", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد" }; + string[] hundreds = { "", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد" }; + string[] thousands = { "", " هەزار", " ملیۆن", " ملیار", " بلیۆن", " بلیار", " تریلیۆن", " تریلیار", " کوادرلیۆن" }; + var temp = inputInteger; + for (int i = 0; i < inputInteger.Length; i = i + 3) + { + string currentThree = Regex.Match(temp, "([0-9]{1,3})$").Result("$1"); + temp = temp.Substring(0, temp.Length - currentThree.Length); + currentThree = currentThree.PadLeft(3, '0'); + var C = Int32.Parse(currentThree[0].ToString()); + var X = Int32.Parse(currentThree[1].ToString()); + var I = Int32.Parse(currentThree[2].ToString()); + var conjunction1 = ((C != 0) && (X != 0 || I != 0)) ? " و " : ""; + var conjunction2 = (X != 0 && I != 0) ? " و " : ""; + if (X == 1) + currentThree = hundreds[C] + conjunction1 + teens[I]; + else + currentThree = hundreds[C] + conjunction1 + tens[X] + conjunction2 + ones[I]; + var M = (currentThree == "") ? "" : thousands[(int)(Math.Floor(i / 3.0))]; + // "one thousand" is read as plain "هەزار", but only when the whole thousands group is exactly 1; + // a text-wide Replace on the result also stripped "یەک" from 21000, 101000, ... + if (i == 3 && C == 0 && X == 0 && I == 1) + currentThree = "هەزار"; + else + currentThree += M; + var conjunction3 = (output == "") ? 
"" : " و "; + if (currentThree != "") + output = currentThree + conjunction3 + output; + } + } + else // if input number = 0 + output = "سفر"; + return output; + } + } +} \ No newline at end of file diff --git a/README.md b/README.md index 7a29d1c..88c3eb6 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,67 @@ # AsoSoft Library AsoSoft Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish). AsoSoft Library is written in C#. -- **Normalizer:** normalizes Kurdish text and punctuations, unifies numerals, replaces Html Entities, extracts and replaces URLs and emails, and more. +- **Grapheme-to-Phoneme (G2P) converter and Transliteration**: converts Kurdish text into syllabified phoneme string. Also transliterates Kurdish texts from Arabic script into Latin script and vice versa. +- **Normalizer:** normalizes the Kurdish text and punctuation marks, unifies numerals, replaces Html Entities, extracts and replaces URLs and emails, and more. - **Numeral Converter:** converts any type of numbers into Kurdish words. -- **Grapheme-to-Phoneme Convertor** *(coming soon)*: converts Kurdish text into syllabified phoneme string, also transliterates Kurdish texts from Arabic script into Latin script. -## Reference -If you find this code useful in your research, please consider citing [this paper](https://www.researchgate.net/publication/333729065): +## Grapheme-to-Phoneme (G2P) converter and Transliteration +This function is based on the study "[Automated Grapheme-to-Phoneme Conversion for Central Kurdish based on Optimality Theory](https://www.sciencedirect.com/science/article/abs/pii/S0885230821000292)". 
- @inproceedings{KurdNormalization2019, - Author = {Aso Mahmudi, Hadi Veisi, Mohammad Mohammadamini, Hawre Hosseini}, - Title = {Automated Kurdish Text Normalization خاوێن کردنی ئۆتۆماتیکی دەقی کوردی}, - Booktitle = {دومین همایش مشترک بین المللی مطالعات زبان و ادبیات کردی و فارسی}, - City = {Sanandaj, Iran} - Year = {2019} - } +### Kurdish G2P converter +Converts Central Kurdish text in standard Arabic script into **syllabified phonemic** Latin script (i.e. graphemes to phonems) +General format: +```cs +AsoSoft.G2P(string text, + bool convertNumbersToWord = false, + bool backMergeConjunction = true, + bool singleOutputPerWord = true); +``` +An example: +```cs +AsoSoft.G2P("شەو و ڕۆژ بووین بە گرفت. درێژیی دیوارەکەی گرتن"); +>ˈşeˈwû ˈřoj ˈbûyn ˈbe ˈgiˈrift. ˈdiˈrêˈjîy ˈdîˈwaˈreˈkey ˈgirˈtin< +``` +### Transliteration +Latin script (Hawar) into Arabic script: +```cs +AsoSoft.La2Ar("Gelî keç û xortên kurdan, hûn hemû bi xêr biçin"); +>گەلی کەچ و خۆرتێن کوردان، هوون هەموو ب خێر بچن< +``` + +Arabic script into Hawar Latin script (ح‌غ‌ڕڵ→ḧẍřł): +```cs +AsoSoft.Ar2La("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"); +>gîrodey xałî řeşte; gwêt le neẍmey tuyûre?< +``` + +Arabic script into simplified (ḧẍřł→hxrl) Hawar Latin script: +```cs +AsoSoft.Ar2LaSimple("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"); +>gîrodey xalî reşte; gwêt le nexmey tuyûre?< +``` ## Kurdish Text Normalizer Several functions needed for Central Kurdish text normalization: ### Normalize Kurdish Two character replacement lists are provided as the resources of the library: -- Required: +- Deep Unicode Corrections: - replacing deprecated Arabic Presentation Forms (FB50–FDFF and FE70–FEFF) with corresponding standard characters. - replacing different types of dashes and spaces - removing Unicode control character -- Optional +- Additional Unicode Corrections - replacing special Arabic math signs with corresponding Latin characters - replacing similar, but different letters with standard characters (e.g. 
ڪ,ے,ٶ with ک,ی,ؤ) The normalization task in this function: -- for all Arabic scripts: +- for all Arabic scripts (including Kurdish, Arabic, and Persian): - Character-based replacement: - - above Replace Lists + - Above mentioned replacement lists - Private Use Area (U+E000 to U+F8FF) with White Square character - - Standardizing and removing duplicated or unnecessary Zero-Width characters - - removing unnecessary Tatweels (U+0640) + - Standardizing and removing duplicated or unnecessary Zero-Width characters + - removing unnecessary Tatweels (U+0640) - only for Central Kurdish: - standardizing Kurdish characters: ە, هـ, ی, and ک - correcting miss-converted characters from non-Unicode fonts @@ -44,65 +69,64 @@ The normalization task in this function: the simple overloading: ```cs -AsoSoftNormalization.NormalizeKurdish("دەقے شیَعري خـــۆش. ره‌نگه‌كاني خاك"); +AsoSoft.Normalize("دەقے شیَعري خـــۆش. ره‌نگه‌كاني خاك"); >دەقے شێعری خۆش. ڕەنگەکانی خاک< ``` or the complete overloading: ```cs -var files = new List { - AsoSoftResources.NormalizerReplacesRequierd, - AsoSoftResources.NormalizerReplacesOptional -}; -var ReplaceList = AsoSoftNormalization.LoadNormalizerReplaces(files); -AsoSoftNormalization.NormalizeKurdish("دەقے شیَعري خـــۆش. ره‌نگه‌كاني خاك", true, true, ReplaceList); ->دەقی شێعری خۆش. ڕەنگەکانی خاک< +AsoSoft.Normalize(string text, + bool isOnlyKurdish, + bool changeInitialR, + bool deepUnicodeCorrectios, + bool additionalUnicodeCorrections, + Dictionary usersReplaceList); ``` ### AliK to Unicode `AliK2Unicode` converts Kurdish text written in AliK fonts (developed by Abas Majid in 1997) into Unicode standard. 
Ali-K fonts: *Alwand, Azzam, Hasan, Jiddah, kanaqen, Khalid, Sahifa, Sahifa Bold, Samik, Sayid, Sharif, Shrif Bold, Sulaimania, Traditional* ```cs -AsoSoftNormalization.AliK2Unicode("ئاشناكردنى خويَندكار بة طوَرِانكاريية كوَمةلاَيةتييةكان"); +AsoSoft.AliK2Unicode("ئاشناكردنى خويَندكار بة طوَرِانكاريية كوَمةلاَيةتييةكان"); >ئاشناکردنی خوێندکار بە گۆڕانکارییە کۆمەڵایەتییەکان< ``` ### AliWeb to Unicode `AliWeb2Unicode` converts Kurdish text written in AliK fonts into Unicode standard. Ali-Web fonts: *Malper, Malper Bold, Samik, Traditional, Traditional Bold* ```cs -AsoSoftNormalization.AliWeb2Unicode("هةر جةرةيانصکي مصذووُيي کة أوو دةدا"); +AsoSoft.AliWeb2Unicode("هةر جةرةيانصکي مصذووُيي کة أوو دةدا"); >ھەر جەرەیانێکی مێژوویی کە ڕوو دەدا< ``` ### Dylan to Unicode `Dylan2Unicode` converts Kurdish text written in Dylan fonts (developed by Dylan Saleh at [KurdSoft]( https://web.archive.org/web/20020528231610/http://www.kurdsoft.com/) in 2001) into Unicode standard. ```cs -AsoSoftNormalization.Dylan2Unicode("لثكؤلثنةران بؤيان دةركةوتووة كة دةتوانث بؤ لةش بةكةصك بث"); +AsoSoft.Dylan2Unicode("لثكؤلثنةران بؤيان دةركةوتووة كة دةتوانث بؤ لةش بةكةصك بث"); >لێکۆلێنەران بۆیان دەرکەوتووە کە دەتوانێ بۆ لەش بەکەڵک بێ< ``` ### Zarnegar to Unicode -`Zarnegar2Unicode` converts Kurdish text written in Zarnegar word processor (developed by [SinaSoft](http://www.sinasoft.com/fa/zarnegar.html) with RDF convertor by [NoorSoft](https://www.noorsoft.org/fa/software/view/6561)) and into Unicode standard. +`Zarnegar2Unicode` converts Kurdish text written in Zarnegar word processor (developed by [SinaSoft](http://www.sinasoft.com/fa/zarnegar.html) with RDF converter by [NoorSoft](https://www.noorsoft.org/fa/software/view/6561)) and into Unicode standard. 
```cs -AsoSoftNormalization.Zarnegar2Unicode("بلٌيٌين و بگه‏رٍيٌين بوٌ هه‏لاٌلٌه‏ى سىٌيه‏مى فه‏لسه‏فه‏"); +AsoSoft.Zarnegar2Unicode("بلٌيٌين و بگه‏رٍيٌين بوٌ هه‏لاٌلٌه‏ى سىٌيه‏مى فه‏لسه‏فه‏"); >بڵێین و بگەڕێین بۆ هەڵاڵەی سێیەمی فەلسەفە< ``` ### NormalizePunctuations `NormalizePunctuations` corrects spaces before and after of the punctuations. When `seprateAllPunctuations` is true, ```cs -AsoSoftNormalization.NormalizePunctuations("دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟", false); +AsoSoft.NormalizePunctuations("دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟", false); >دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟< ``` ### Trim Line Trim starting and ending white spaces (including zero width spaces) of line, `TrimLine` ```cs -AsoSoftNormalization.TrimLine(" دەق\u200c "); +AsoSoft.TrimLine(" دەق\u200c "); >دەق< ``` ### Replace Html Entities `ReplaceHtmlEntity` replaces HTML Entities with single Unicode characters (e.g. "é" with "é"). It is useful in web crawled corpora. ```cs -AsoSoftNormalization.ReplaceHtmlEntity("ئێوە "دەق" لە زمانی <کوردی> دەنووسن"); +AsoSoft.ReplaceHtmlEntity("ئێوە "دەق" لە زمانی <کوردی> دەنووسن"); >ئێوە "دەق" بە زمانی <کوردی> دەنووسن< ``` ### Replace URLs and emails @@ -111,29 +135,29 @@ AsoSoftNormalization.ReplaceHtmlEntity("ئێوە "دەق" لە زما ### Unify Numerals `UnifyNumerals` unifies numeral characters into desired numeral type from `en` (0123456789) or `ar` (٠١٢٣٤٥٦٧٨٩) ```cs -AsoSoftNormalization.UnifyNumerals("ژمارەکانی ٤٥٦ و ۴۵۶ و 456", "en"); +AsoSoft.UnifyNumerals("ژمارەکانی ٤٥٦ و ۴۵۶ و 456", "en"); >ژمارەکانی 456 و 456 و 456< ``` ### Seperate Digits from words `SeperateDigits` add a space between joined numerals and words (e.g. replacing "12کەس" with "12 کەس"). It improves language models. 
```cs -AsoSoftNormalization.SeperateDigits("ساڵی1950دا1000دۆلاریان بە 5کەس دا"); ->ساڵی 1950 دا 1000 دۆلاریان بە 5 کەس دا< +AsoSoft.SeperateDigits("لە ساڵی1950دا1000دۆلاریان بە 5کەس دا"); +>لە ساڵی 1950 دا 1000 دۆلاریان بە 5 کەس دا< ``` -### Word for Word Replacment +### Word to Word Replacement `Word2WordReplacement` applies a "string to string" replacement dictionary on the text. It replaces the full-matched words not a part of them. ```cs var dict = new Dictionary() { { "مال", "ماڵ" } }; -AsoSoftNormalization.Word2WordReplacement("مال، نووری مالیکی", dict); +AsoSoft.Word2WordReplacement("مال، نووری مالیکی", dict); >ماڵ، نووری مالیکی< ``` -### Character for Character Replacment +### Character to Character Replacement `Char2CharReplacment` applies a "char to char" replacement dictionary on the text. It uses as the final step needed for some non-Unicode systems. -## Kurdish Numeral Convertor +## Kurdish Numeral Converter It converts numerals into Central Kurdish words. It is useful in text-to-speech tools. - integers (1100 => ) - floats (10.11) @@ -142,21 +166,14 @@ It converts numerals into Central Kurdish words. It is useful in text-to-speech - querency marks ($100, £100, and €100) ```cs -AsoSoftNumerals.Number2Word("لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت"); +AsoSoft.Number2Word("لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت"); >لە ساڵی هەزار و نۆسەد و نەوەد و نۆدا بڕی چل لە سەد لە پارەکەیان واتە سەد و دوو پۆینت یەک دۆلاریان وەرگرت< ``` ## How to use? -In Microsoft Visual Studio, you have two choices: -- If you want to debug or change or customize the AsoSoft classes: - - inside Solution Explorer, right-click on your solution, click Add>Existing Project. - - Then right-click on your project, click Add>Project Reference... -- If you just use the AsoSoft classes: - - inside Solution Explorer, right-click on your project, click "Add Project Reference" - - click Browse, find "AsoSoftLibrary.dll", click Add. 
- -Then, insert `using AsoSoftLibrary;` into "Usings" of your class. +Install [AsoSoft Library package](https://www.nuget.org/packages/AsoSoftLibrary) via NuGet Gallery. +Then, insert `using AsoSoftLibrary;` into "Usings" of your codes. ## Development -AsoSoft Library is written in C# (.NET Core) and it is platform independent. -Using an IDE like Visual Studio 2017+ is recommended on Windows. Alternatively, VSCode should be the tool of choice on other platforms. +AsoSoft Library is developed and maintained by Aso Mahmudi. +AsoSoft Library is written in C# (.NET Core). \ No newline at end of file diff --git a/Sort.cs b/Sort.cs new file mode 100644 index 0000000..15729e5 --- /dev/null +++ b/Sort.cs @@ -0,0 +1,31 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace AsoSoftLibrary +{ + public static partial class AsoSoft + { + public static List KurdishSort(List inputList) + { + var ku = new List(); + ku.AddRange("ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ"); + return CustomSort(inputList, ku); + } + public static List CustomSort(List inputList, List inputOrder) + { + var baseChar = 62000;// 9472; + var order = new List(); + for (int i = 0; i < inputOrder.Count; i++) + order.Add((char)(baseChar + i)); + for (int i = 0; i < inputList.Count; i++) + for (int j = 0; j < order.Count; j++) + inputList[i] = inputList[i].Replace(inputOrder[j], order[j]); + inputList.Sort(); + for (int i = 0; i < inputList.Count; i++) + for (int j = 0; j < order.Count; j++) + inputList[i] = inputList[i].Replace(order[j], inputOrder[j]); + return inputList; + } + } +} diff --git a/Transliteration.cs b/Transliteration.cs new file mode 100644 index 0000000..ee446b7 --- /dev/null +++ b/Transliteration.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; + +namespace AsoSoftLibrary +{ + public static partial class AsoSoft + { + + 
private static readonly string latinLetters = "a-zêîûçşéúıŕřĺɫƚḧẍḍṿʔ"; + + private static readonly Dictionary> TransliterationReplaces = new Dictionary> + { + {"LaDi2Ar", new List() { + "gh", "ẍ", + "hh", "ḧ", + "ll", "ɫ", + "rr", "ř" + }}, + {"La2Ar", new List() { + "\u201C", "«", + "\u201D", "»", + $"([0-9])([\'’-])([aeiouêîûéú])", "$1$3", // (e.g. 1990'an 5'ê) + "ʔ", "", // glottal stop + $"(^|[^{latinLetters}0-9\"’])([aeiouêîûéú])", "$1ئ$2", //insert initial hamza + "([aeouêîûéú])([aeiouêîûéú])", "$1ئ$2", //insert hamza between adjacent vowels + $"(ئ)([uû])([^{latinLetters}0-9])", "و$3", //omit the inserted hamza for "û" (=and) + "a", "ا", + "b", "ب", + "ç", "چ", + "c", "ج", + "d", "د", + "ḍ", "ڎ", // a Horami consonant + "ê|é", "ێ", + "e", "ە", + "f", "ف", + "g", "گ", + "h", "ه", + "ḧ", "ح", + "i|ı", "", + "î|y|í", "ی", + "j", "ژ", + "k", "ک", + "l", "ل", + "ɫ|ł|ƚ|Ɨ|ĺ", "ڵ", + "m", "م", + "n", "ن", + "ŋ", "نگ", + "o", "ۆ", + "ö", "وێ", + "p", "پ", + "q", "ق", + "r", "ر", + "ř|ŕ", "ڕ", + "s", "س", + "ş|š|ș|s̩", "ش", + "ṣ", "ص", + "t", "ت", + "ṭ", "ط", + "û|ú", "وو", + "u|w", "و", + "ü", "ۊ", + "v", "ڤ", + "x", "خ", + "ẍ", "غ", + "z", "ز", + "ه" + "($|[^ابپتجچحخدرڕزژسشصعغفڤقکگلڵمنوۆهەیێ])", "هـ" + "$1", // word-final h + "\"|’", "ئ", // need checking, not sure "ع" or "ئ" + "\\u003F", "؟", //question mark + ",", "،", //comma + ";", "؛" //semicolon + }} + }; + + /// Transliterates Kurdish text from the Latin script into the Arabic script (e.g. çak→چاک). The input is lower-cased first, then the "La2Ar" replacement list is applied pair by pair, in order. + public static string La2Ar(string text) + { + // the list is registered under "La2Ar"; the previous key "La2ArMain" is not in the dictionary, so every call threw KeyNotFoundException + text = replaceByList(text.ToLower(), TransliterationReplaces["La2Ar"]); + return text; + } + + /// Transliterating the Latin script with digraphs into Arabic script of Kurdish (e.g. 
chall→چاڵ) + public static string LaDigraph2Ar(string text) + { + text = text.ToLower(); + text = replaceByList(text, TransliterationReplaces["LaDi2Ar"]); + text = replaceByList(text, TransliterationReplaces["La2Ar"]); + return text; + } + + /// Transliterates Kurdish text from the Arabic script into the Hawar Latin script by running G2P and passing its output through Phonemes2Hawar (e.g. خاڵی→xałî) + public static string Ar2La(string text) + { + return Phonemes2Hawar(G2P(text)); + } + /// Transliterates Kurdish text from the Arabic script into a simplified Hawar Latin script: like Ar2La, then ḧ, ř, ł, ẍ are replaced with h, r, l, x (e.g. خاڵی→xalî) + public static string Ar2LaSimple(string text) + { + text = Phonemes2Hawar(G2P(text)); + text = text.Replace("ḧ", "h"); + text = text.Replace("ř", "r"); + text = text.Replace("ł", "l"); + text = text.Replace("ẍ", "x"); + return text; + } + + /// Converts the output of the G2P into IPA (e.g. ˈdeˈçê→da·t͡ʃɛ); the ˈ mark is dropped at the start of the text or after a non-word character and turned into a middle dot elsewhere, then each tab-separated pair of the Phoneme2IPA resource is applied in order + public static string Phonemes2IPA(string text) + { + text = Regex.Replace(text, "(?<=(^|\\W))ˈ", ""); + text = Regex.Replace(text, "ˈ", "·"); //middle dot + foreach (var item in replaceFiles.Phoneme2IPA.Split('\n')) + text = text.Replace(item.Split('\t')[0], item.Split('\t')[1]); + return text; + } + + /// Converts the output of the G2P into Hawar (e.g. ˈʔeˈłêm→ełêm): removes every ˈ mark, drops ʔ at the start of the text or after a non-word character, and writes any remaining ʔ or ƹ as ’ + public static string Phonemes2Hawar(string text) + { + text = text.Replace("ˈ", ""); + text = Regex.Replace(text, "(?<=(^|\\W))ʔ", ""); + text = Regex.Replace(text, "[ʔƹ]", "’"); + return text; + } + + /// Converts the output of the G2P into Jira's ASCII format (e.g. 
ˈdeˈçim→D▪A▪CH▪M) + public static string Phonemes2ASCII(string text) + { + text = Regex.Replace(text, @"[iˈ]", ""); + foreach (var item in replaceFiles.Phoneme2Ascii.Split('\n')) + text = Regex.Replace(text, item.Split('\t')[0], item.Split('\t')[1] + "▪"); + return text; + } + } +} diff --git a/AsoSoftResources.Designer.cs b/replaceFiles.Designer.cs similarity index 87% rename from AsoSoftResources.Designer.cs rename to replaceFiles.Designer.cs index 5143e4d..c028480 100644 --- a/AsoSoftResources.Designer.cs +++ b/replaceFiles.Designer.cs @@ -19,17 +19,17 @@ namespace AsoSoftLibrary { // class via a tool like ResGen or Visual Studio. // To add or remove a member, edit your .ResX file then rerun ResGen // with the /str option, or rebuild your VS project. - [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "16.0.0.0")] + [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "17.0.0.0")] [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] - public class AsoSoftResources { + public class replaceFiles { private static global::System.Resources.ResourceManager resourceMan; private static global::System.Globalization.CultureInfo resourceCulture; [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] - internal AsoSoftResources() { + internal replaceFiles() { } /// @@ -39,7 +39,7 @@ internal AsoSoftResources() { public static global::System.Resources.ResourceManager ResourceManager { get { if (object.ReferenceEquals(resourceMan, null)) { - global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("AsoSoftLibrary.AsoSoftResources", typeof(AsoSoftResources).Assembly); + global::System.Resources.ResourceManager temp = new 
global::System.Resources.ResourceManager("AsoSoftLibrary.replaceFiles", typeof(replaceFiles).Assembly); resourceMan = temp; } return resourceMan; @@ -60,62 +60,13 @@ internal AsoSoftResources() { } } - /// - /// Looks up a localized string similar to ʔ ʔ - ///b b - ///p p - ///t t - ///c d͡ʒ - ///ç t͡ʃ - ///ḧ ħ - ///x x - ///d d - ///r ɾ - ///ř r - ///z z - ///j ʒ - ///s s - ///ş ʃ - ///ƹ ʕ - ///ẍ ɣ - ///f f - ///v v - ///q q - ///k k - ///g g - ///l l - ///ł ɫ - ///m m - ///n n - ///w w - ///u ʊ - ///û u - ///o o̞ - ///h h - ///y j - ///a ä - ///e a - ///ê ɛ - ///i ɪ - ///î i - ///ĝ ŋ - ///đ đ - ///ü y - ///ô ô - ///õ õ. - /// - public static string G2P2IPA { - get { - return ResourceManager.GetString("G2P2IPA", resourceCulture); - } - } - /// /// Looks up a localized string similar to ڴ ĝ //Garusi Consonant ///ڎ đ //Hewrami Consonant - ///ۊ ü //Southern Vowel - ///ۉ ô //Hewrami Vowel - ///ۋ õ //Hewrami Vowel + ///ۉ ŵ //Hewrami Consonant + ///ݵ ė //Hewrami Vowel + ///ݸ ȯ //Hewrami Vowel + ///ۊ ẅ //Southern Vowel ///ئ ʔ ///ب b ///پ p @@ -153,11 +104,11 @@ public static string G2P2IPA { ///و(?=[aeêo]) w //before vowel ///(?<=[aeêo])ی y //after vowel ///ی(?=[aeêo]) y //before vowel - ///^([bcçdfghḧjklłmnpqrřsştvwxẍyzʔƹ])$ $1i //چ => çi. + ///^([bçdjl])$ $1i //چ => çi bcçdfghḧjklłmnpqrřsştvwxẍyzʔƹ. 
/// - public static string G2PCertainConversions { + public static string G2PCertain { get { - return ResourceManager.GetString("G2PCertainConversions", resourceCulture); + return ResourceManager.GetString("G2PCertain", resourceCulture); } } @@ -175,50 +126,6 @@ public static string G2PExceptions { } } - /// - /// Looks up a localized string similar to ʔ EH - ///a AA - ///b B - ///p P - ///t T - ///c JE - ///ç CH - ///ḧ HE - ///x X - ///d D - ///r R - ///ř RR - ///z Z - ///j ZH - ///s S - ///ş SH - ///ƹ AH - ///ẍ XE - ///f F - ///v V - ///q Q - ///k K - ///g G - ///l L - ///ł LL - ///m M - ///n N - ///o O - ///e A - ///h H - ///ê E - ///î I - ///y Y - ///w W - ///u U - ///û UU. - /// - public static string G2PReplaces { - get { - return ResourceManager.GetString("G2PReplaces", resourceCulture); - } - } - /// /// Looks up a localized string similar to 066A 0025 //Arabic PERCENT SIGN ///066B 002E //Arabic DECIMAL SEPARATOR @@ -250,9 +157,9 @@ public static string G2PReplaces { ///0699 0698 // ڙ ///076B 06 [rest of string was truncated]";. /// - public static string NormalizerReplacesOptional { + public static string NormalizerAdditional { get { - return ResourceManager.GetString("NormalizerReplacesOptional", resourceCulture); + return ResourceManager.GetString("NormalizerAdditional", resourceCulture); } } @@ -278,9 +185,103 @@ public static string NormalizerReplacesOptional { ///2003 0020 //EM SPACE ///2004 0020 //THREE [rest of string was truncated]";. 
/// - public static string NormalizerReplacesRequierd { + public static string NormalizerDeep { + get { + return ResourceManager.GetString("NormalizerDeep", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to ʔ EH + ///a AA + ///b B + ///p P + ///t T + ///c JE + ///ç CH + ///ḧ HE + ///x X + ///d D + ///r R + ///ř RR + ///z Z + ///j ZH + ///s S + ///ş SH + ///ƹ AH + ///ẍ XE + ///f F + ///v V + ///q Q + ///k K + ///g G + ///l L + ///ł LL + ///m M + ///n N + ///o O + ///e A + ///h H + ///ê E + ///î I + ///y Y + ///w W + ///u U + ///û UU. + /// + public static string Phoneme2Ascii { + get { + return ResourceManager.GetString("Phoneme2Ascii", resourceCulture); + } + } + + /// + /// Looks up a localized string similar to ʔ ʔ + ///b b + ///p p + ///t t + ///c d͡ʒ + ///ç t͡ʃ + ///ḧ ħ + ///x x + ///d d + ///r ɾ + ///ř r + ///z z + ///j ʒ + ///s s + ///ş ʃ + ///ƹ ʕ + ///ẍ ɣ + ///f f + ///v v + ///q q + ///k k + ///g g + ///l l + ///ł ɫ + ///m m + ///n n + ///w w + ///u ʊ + ///û u + ///o o̞ + ///h h + ///y j + ///a ä + ///e a + ///ê ɛ + ///i ɪ + ///î i + ///ĝ ŋ + ///đ đ + ///ü y + ///ô ô + ///õ õ. 
+ /// + public static string Phoneme2IPA { get { - return ResourceManager.GetString("NormalizerReplacesRequierd", resourceCulture); + return ResourceManager.GetString("Phoneme2IPA", resourceCulture); } } } diff --git a/AsoSoftResources.resx b/replaceFiles.resx similarity index 81% rename from AsoSoftResources.resx rename to replaceFiles.resx index f244f4e..a9ab131 100644 --- a/AsoSoftResources.resx +++ b/replaceFiles.resx @@ -118,10 +118,22 @@ System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 - - Resources\NormalizerReplacesOptional.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 + + resources\G2PCertain.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 - - Resources\NormalizerReplacesRequierd.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 + + resources\G2PExceptions.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 + + + resources\NormalizeUnicodeAdditional.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 + + + resources\NormalizeUnicodeDeep.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 + + + resources\Phoneme2Ascii.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 + + + resources\Phoneme2IPA.txt;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 \ No newline at end of file diff --git a/resources/G2PCertain.txt b/resources/G2PCertain.txt new file mode 100644 index 0000000..7bcee10 --- /dev/null +++ b/resources/G2PCertain.txt @@ -0,0 +1,44 @@ +ڴ ĝ //Garusi Consonant +ڎ đ //Hewrami Consonant +ۉ ŵ //Hewrami Consonant +ݵ ė //Hewrami Vowel +ݸ ȯ //Hewrami Vowel +ۊ ẅ //Southern Vowel +ئ ʔ +ب 
b +پ p +ت t +ج c +چ ç +ح ḧ +خ x +د d +ر r +ڕ ř +ز z +ژ j +س s +ش ş +ع ƹ +غ ẍ +ف f +ڤ v +ق q +ک k +گ g +ل l +ڵ ł +م m +ن n +ه h +ا a +ۆ o +ە e +ێ ê +^ی y +^و w +(?<=[aeêo])و w //after vowel +و(?=[aeêo]) w //before vowel +(?<=[aeêo])ی y //after vowel +ی(?=[aeêo]) y //before vowel +^([bçdjl])$ $1i //چ => çi bcçdfghḧjklłmnpqrřsştvwxẍyzʔƹ \ No newline at end of file diff --git a/resources/G2PExceptions.txt b/resources/G2PExceptions.txt new file mode 100644 index 0000000..0da1c2f --- /dev/null +++ b/resources/G2PExceptions.txt @@ -0,0 +1,6 @@ +حەییی ḧeyyî +تەییی teyyî +ئاگر ʔagir +قانع qaniƹ +سالم salim +عاشق ƹaşiq \ No newline at end of file diff --git a/Resources/NormalizerReplacesOptional.txt b/resources/NormalizeUnicodeAdditional.txt similarity index 100% rename from Resources/NormalizerReplacesOptional.txt rename to resources/NormalizeUnicodeAdditional.txt diff --git a/Resources/NormalizerReplacesRequierd.txt b/resources/NormalizeUnicodeDeep.txt similarity index 100% rename from Resources/NormalizerReplacesRequierd.txt rename to resources/NormalizeUnicodeDeep.txt diff --git a/resources/Phoneme2Ascii.txt b/resources/Phoneme2Ascii.txt new file mode 100644 index 0000000..a15b1be --- /dev/null +++ b/resources/Phoneme2Ascii.txt @@ -0,0 +1,36 @@ +ʔ EH +a AA +b B +p P +t T +c JE +ç CH +ḧ HE +x X +d D +r R +ř RR +z Z +j ZH +s S +ş SH +ƹ AH +ẍ XE +f F +v V +q Q +k K +g G +l L +ł LL +m M +n N +o O +e A +h H +ê E +î I +y Y +w W +u U +û UU \ No newline at end of file diff --git a/resources/Phoneme2IPA.txt b/resources/Phoneme2IPA.txt new file mode 100644 index 0000000..01d8e47 --- /dev/null +++ b/resources/Phoneme2IPA.txt @@ -0,0 +1,42 @@ +ʔ ʔ +b b +p p +t t +c d͡ʒ +ç t͡ʃ +ḧ ħ +x x +d d +r ɾ +ř r +z z +j ʒ +s s +ş ʃ +ƹ ʕ +ẍ ɣ +f f +v v +q q +k k +g g +l l +ł ɫ +m m +n n +w w +u ʊ +û u +o o̞ +h h +y j +a ä +e a +ê ɛ +i ɪ +î i +ĝ ŋ +đ đ +ü y +ô ô +õ õ \ No newline at end of file