diff --git a/pyproject.toml b/pyproject.toml index cf226903..08d2dfa7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "humanize", "loguru>=0.7.0", "multiprocess", - "numpy>=1.25.0,<2.0.0", + "numpy>=2.0.0", "tqdm", ] @@ -41,7 +41,7 @@ io = [ "pyarrow", "python-magic", "warcio", - "datasets>=2.18.0", + "datasets>=3.1.0", "orjson", "zstandard" ] @@ -49,7 +49,7 @@ s3 = [ "s3fs>=2023.12.2", ] processing = [ - "fasttext-wheel", + "fasttext-numpy2-wheel", "nltk", "inscriptis", # "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup", @@ -58,6 +58,7 @@ processing = [ "tokenizers", "ftfy", "fasteners", + "regex", "xxhash", "kenlm", "pyahocorasick" @@ -66,13 +67,20 @@ decont = [ "lighteval>=0.3.0" ] multilingual = [ - "spacy", + "spacy[ja]>=3.8", "stanza", - "pyvi", - "pythainlp", - "jieba", - "indic-nlp-library", - "kiwipiepy", + "pyvi", # vietnamese + "pythainlp", # thai + "jieba", # chinese + "indic-nlp-library", # indic languages + "kiwipiepy", # korean + # urduhack has keras and tensorflow as dependencies and requires a specific version to work... 
+ "urduhack", + "tensorflow>=2.16", + "khmer-nltk", # khmer + "laonlp", # lao + "botok", # tibetan languages, + "pyidaungsu-numpy2", # burmese ] quality = [ "ruff>=0.1.5" @@ -83,11 +91,15 @@ testing = [ "datatrove[processing]", "datatrove[multilingual]", "datatrove[s3]", - "datatrove[decont]", + # Lighteval doesn't support numpy>=2.0.0 +# "datatrove[decont]", +# Flask doesn't have correct dependencies on werkzeux, causing issues, thus we pin flask 3.1 (which currently works) to avoid it + "flask>=3.1.0", "pytest", "pytest-timeout", "pytest-xdist", "moto[s3,server]", + "spacy[ja]" ] all = [ "datatrove[quality]", @@ -128,7 +140,8 @@ lint.select = [ "E", "F", "I", - "W" + "W", + "NPY201", # numpy 2.0.0 ] line-length = 119 diff --git a/src/datatrove/assets/tokenizer_assignment.csv b/src/datatrove/assets/tokenizer_assignment.csv new file mode 100644 index 00000000..ea461e40 --- /dev/null +++ b/src/datatrove/assets/tokenizer_assignment.csv @@ -0,0 +1,2178 @@ +code_3,code_1,script,name,family,type,tok_code,proxy,auto_assigned,default_code_1,default_script +aai,,Latn,Arifama-Miniafia,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +aak,,Latn,Ankave,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aau,,Latn,Abau,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aaz,,Latn,Amarasi,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +aba,,Latn,Abé,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +abi,,Latn,Abidji,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +abk,ab,Cyrl,Abkhazian,Abkhaz-Adyghe,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +abn,,Latn,Abua,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +abq,,Cyrl,Abaza,Abkhaz-Adyghe,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +abs,,Latn,Ambonese Malay,Creole,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +abt,,Latn,Ambulas,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +abx,,Latn,Inabaknon,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +aby,,Latn,Aneme Wake,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+abz,,Latn,Abui,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aca,,Latn,Achagua,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +acd,,Latn,Gikyode,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ace,,Arab,Achinese,Austronesian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +ace,,Latn,Achinese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +acf,,Latn,Saint Lucian Creole French,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ach,,Latn,Acoli,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +acm,ar,Arab,Mesopotamian Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +acn,,Latn,Achang,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +acr,,Latn,Achi,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +acu,,Latn,Achuar-Shiwiar,Jivaroan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ada,,Latn,Adangme,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ade,,Latn,Adele,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +adh,,Latn,Adhola,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +adi,,Latn,Adi,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +adj,,Latn,Adioukrou,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +adl,,Latn,Galo,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ady,,Cyrl,Adyghe,Abkhaz-Adyghe,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +adz,,Latn,Adzera,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +aeb,ar,Arab,Tunisian Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +aer,,Latn,Eastern Arrernte,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aeu,,Latn,Akeu,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aey,,Latn,Amele,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +afr,af,Latn,Afrikaans,Indo-European,SpaCyTokenizer,af,FALSE,FALSE,TRUE,TRUE +agd,,Latn,Agarabi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +agg,,Latn,Angor,Senagi,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +agm,,Latn,Angaataha,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+agn,,Latn,Agutaynen,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +agr,,Latn,Aguaruna,Jivaroan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +agt,,Latn,Central Cagayan Agta,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +agu,,Latn,Aguacateco,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +agw,,Latn,Kahua,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +agx,,Cyrl,Aghul,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aha,,Latn,Ahanta,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ahk,,Latn,Akha,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aia,,Latn,Arosi,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +aii,,Syrc,Assyrian Neo-Aramaic,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +aim,,Latn,Aimol,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ain,,Latn,Ainu (Japan),Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ajg,,Latn,Aja (Benin),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +aji,,Latn,Ajië,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ajp,ar,Arab,,,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +ajz,,Latn,Amri Karbi,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aka,ak,Latn,Akan,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +akb,,Latn,Batak Angkola,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ake,,Latn,Akawaio,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +akh,,Latn,Angal Heneng,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +akp,,Latn,Siwu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ald,,Latn,Alladian,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +alj,,Latn,Alangan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +aln,sq,Latn,Gheg Albanian,Indo-European,SpaCyTokenizer,sq,FALSE,FALSE,FALSE,TRUE +alp,,Latn,Alune,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +alq,,Latn,Algonquin,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +als,sq,Latn,Tosk Albanian,Indo-European,SpaCyTokenizer,sq,FALSE,FALSE,TRUE,TRUE +alt,,Cyrl,Southern 
Altai,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aly,,Latn,Alyawarr,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +alz,,Latn,Alur,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ame,,Latn,Yanesha',Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amf,,Latn,Hamer-Banna,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amh,am,Ethi,Amharic,Afro-Asiatic,SpaCyTokenizer,am,FALSE,FALSE,TRUE,TRUE +ami,,Latn,Amis,Austronesian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amk,,Latn,Ambai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +amm,,Latn,Ama (Papua New Guinea),Arai (Left May),SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amn,,Latn,Amanab,Border,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amp,,Latn,Alamblak,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amr,,Latn,Amarakaeri,Harákmbut,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amu,,Latn,Guerrero Amuzgo,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +amx,,Latn,Anmatyerre,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ang,,Latn,Old English (ca. 
450-1100),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +anm,,Latn,Anal,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ann,,Latn,Obolo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +anp,,Deva,Angika,Indo-European,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +anv,,Latn,Denya,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +any,,Latn,Anyin,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +aoi,,Latn,Anindilyakwa,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aoj,,Latn,Mufian,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aom,,Latn,Ömie,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aoz,,Latn,Uab Meto,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +apb,,Latn,Sa'a,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +apc,ar,Arab,Levantine Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +ape,,Latn,Bukiyip,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +apn,,Latn,Apinayé,Jean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +apr,,Latn,Arop-Lokep,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +apt,,Latn,Apatani,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +apu,,Latn,Apurinã,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +apw,,Latn,Western Apache,Eyak-Athabaskan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +apy,,Latn,Apalaí,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +apz,,Latn,Safeyoka,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aqz,,Latn,Akuntsu,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ara,ar,Arab,Arabic,,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +ara,ar,Latn,Arabic,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +arb,ar,Arab,Standard Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,TRUE,TRUE +arb,ar,Latn,Standard Arabic,Afro-Asiatic,StanzaTokenizer,mt,TRUE,TRUE,TRUE,FALSE +are,,Latn,Western Arrarnta,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +arg,an,Latn,Aragonese,Indo-European,SpaCyTokenizer,es,TRUE,TRUE,TRUE,TRUE +arl,,Latn,Arabela,Zaparoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+arn,,Latn,Mapudungun,Mapudungu,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +arp,,Latn,Arapaho,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +arq,ar,Arab,Algerian Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +arr,,Latn,Karo (Brazil),Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ars,ar,Arab,Najdi Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +ary,ar,Arab,Moroccan Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +arz,ar,Arab,Egyptian Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +asg,,Latn,Cishingini,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +asm,as,Beng,Assamese,Indo-European,IndicNLPTokenizer,as,FALSE,FALSE,TRUE,TRUE +asm,as,Latn,Assamese,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +aso,,Latn,Dano,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ast,,Latn,Asturian,Indo-European,SpaCyTokenizer,es,TRUE,TRUE,FALSE,TRUE +ata,,Latn,Pele-Ata,Yele-West New Britain,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +atb,,Latn,Zaiwa,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +atd,,Latn,Ata Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +atg,,Latn,Ivbie North-Okpela-Arhe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ati,,Latn,Attié,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +atj,,Latn,Atikamekw,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +atq,,Latn,Aralle-Tabulahan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +att,,Latn,Pamplona Atta,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +auc,,Latn,Waorani,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aui,,Latn,Anuki,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +auy,,Latn,Awiyaana,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ava,av,Cyrl,Avaric,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +avk,,Latn,Kotava,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +avn,,Latn,Avatime,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +avt,,Latn,Au,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+avu,,Latn,Avokaya,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +awa,,Deva,Awadhi,Indo-European,IndicNLPTokenizer,ne,TRUE,TRUE,FALSE,TRUE +awb,,Latn,Awa (Papua New Guinea),Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +awi,,Latn,Aekyom,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +awx,,Latn,Awara,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +aym,ay,Latn,Aymara,,SpaCyTokenizer,az,TRUE,TRUE,FALSE,TRUE +ayo,,Latn,Ayoreo,Zamucoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ayp,ar,Arab,North Mesopotamian Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +ayr,ay,Latn,Central Aymara,Aymaran,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +azb,az,Arab,South Azerbaijani,Turkic,SpaCyTokenizer,fa,TRUE,FALSE,FALSE,TRUE +aze,az,Arab,Azerbaijani,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +aze,az,Cyrl,Azerbaijani,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +aze,az,Latn,Azerbaijani,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +azg,,Latn,San Pedro Amuzgos Amuzgo,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +azj,az,Cyrl,North Azerbaijani,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +azj,az,Latn,North Azerbaijani,Turkic,SpaCyTokenizer,az,FALSE,FALSE,TRUE,TRUE +azz,,Latn,Highland Puebla Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bak,ba,Cyrl,Bashkir,Turkic,SpaCyTokenizer,tt,TRUE,TRUE,TRUE,TRUE +bal,,Arab,Baluchi,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +bam,bm,Latn,Bambara,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +ban,,Latn,Balinese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bao,,Latn,Waimaha,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bar,,Latn,Bavarian,Indo-European,SpaCyTokenizer,de,TRUE,TRUE,FALSE,TRUE +bas,,Latn,Basa (Cameroon),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bav,,Latn,Vengo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bba,,Latn,Baatonum,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bbb,,Latn,Barai,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bbc,,Latn,Batak 
Toba,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bbj,,Latn,Ghomálá',Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bbk,,Latn,Babanki,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bbo,,Latn,Northern Bobo Madaré,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bbr,,Latn,Girawa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bcc,,Arab,Southern Balochi,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +bch,,Latn,Bariai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bci,,Latn,Baoulé,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bcl,,Latn,Central Bikol,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +bco,,Latn,Kaluli,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bcw,,Latn,Bana,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bdd,,Latn,Bunama,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bdh,,Latn,Baka (South Sudan),Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bdq,,Latn,Bahnar,Austro-Asiatic,SpaCyTokenizer,vi,TRUE,TRUE,FALSE,TRUE +bea,,Latn,Beaver,Eyak-Athabaskan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bef,,Latn,Benabena,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bel,be,Cyrl,Belarusian,Indo-European,StanzaTokenizer,be,FALSE,FALSE,TRUE,TRUE +bem,,Latn,Bemba (Zambia),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ben,bn,Beng,Bengali,Indo-European,IndicNLPTokenizer,bn,FALSE,FALSE,TRUE,TRUE +ben,bn,Latn,Bengali,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +beq,,Latn,Beembe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bew,,Latn,Betawi,Creole,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bex,,Latn,Jur Modo,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bfd,,Latn,Bafut,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bfo,,Latn,Malba Birifor,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bgr,,Latn,Bawm Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bgs,,Latn,Tagabawa,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE 
+bgt,,Latn,Bughotu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bgz,,Latn,Banggai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bhg,,Latn,Binandere,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bhl,,Latn,Bimin,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bho,,Deva,Bhojpuri,Indo-European,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +bhp,,Latn,Bima,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bhw,,Latn,Biak,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bhz,,Latn,Bada (Indonesia),Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bib,,Latn,Bissa,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +big,,Latn,Biangai,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bik,,Latn,Bikol,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bim,,Latn,Bimoba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bin,,Latn,Bini,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bis,bi,Latn,Bislama,Creole,StanzaTokenizer,pcm,TRUE,TRUE,TRUE,TRUE +biu,,Latn,Biete,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +biv,,Latn,Southern Birifor,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bjn,ms,Arab,Banjar,Austronesian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +bjn,ms,Latn,Banjar,Austronesian,SpaCyTokenizer,ms,FALSE,FALSE,FALSE,TRUE +bjp,,Latn,Fanamaket,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bjr,,Latn,Binumarien,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bjv,,Latn,Bedjond,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bkd,,Latn,Binukid,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +bkl,,Latn,Berik,Tor-Kwerba,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bkq,,Latn,Bakairí,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bku,,Latn,Buhid,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +bkv,,Latn,Bekwarra,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bla,,Latn,Siksika,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +blh,,Latn,Kuwaa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +blk,,Mymr,Pa'o 
Karen,Sino-Tibetan,BurmeseTokenizer,,TRUE,TRUE,FALSE,TRUE +blt,,Latn,Tai Dam,Kra-Dai,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +blw,,Latn,Balangao,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +blz,,Latn,Balantak,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bmh,,Latn,Kein,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bmk,,Latn,Ghayavi,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bmq,,Latn,Bomu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bmr,,Latn,Muinane,Witotoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bmu,,Latn,Somba-Siawari,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bmv,,Latn,Bum,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bnc,,Latn,Bontok,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bnj,,Latn,Eastern Tawbuid,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +bno,,Latn,Bantoanon,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +bnp,,Latn,Bola,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +boa,,Latn,Bora,Witotoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bod,bo,Tibt,Tibetan,Sino-Tibetan,TibetanTokenizer,,FALSE,FALSE,TRUE,TRUE +boj,,Latn,Anjam,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bom,,Latn,Berom,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bon,,Latn,Bine,Eastern Trans-Fly,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bor,,Latn,Borôro,Bororoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bos,bs,Latn,Bosnian,Indo-European,SpaCyTokenizer,sr,FALSE,FALSE,TRUE,TRUE +bov,,Latn,Tuwuli,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +box,,Latn,Buamu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bpr,,Latn,Koronadal Blaan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bps,,Latn,Sarangani Blaan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bpy,,Beng,Bishnupriya,Indo-European,IndicNLPTokenizer,bn,TRUE,TRUE,FALSE,TRUE +bqc,,Latn,Boko (Benin),Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bqj,,Latn,Bandial,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE 
+bqp,,Latn,Busa,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bre,br,Latn,Breton,Indo-European,StanzaTokenizer,cy,TRUE,TRUE,TRUE,TRUE +brh,,Arab,Brahui,Dravidian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +bru,,Latn,Eastern Bru,Austro-Asiatic,SpaCyTokenizer,vi,TRUE,TRUE,FALSE,TRUE +brx,,Deva,Bodo (India),Sino-Tibetan,IndicNLPTokenizer,hi,FALSE,FALSE,FALSE,FALSE +brx,,Latn,Bodo (India),Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bsc,,Latn,Bassari,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +bsn,,Latn,Barasana-Eduria,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bsp,,Latn,Baga Sitemu,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +bsq,,Latn,Bassa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bss,,Latn,Akoose,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +btd,,Latn,Batak Dairi,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bth,,Latn,Biatah Bidayuh,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bts,,Latn,Batak Simalungun,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +btt,,Latn,Bete-Bendi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +btx,,Latn,Batak Karo,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bua,,Cyrl,Buriat,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bud,,Latn,Ntcham,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bug,,Latn,Buginese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +buk,,Latn,Bugawac,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bul,bg,Cyrl,Bulgarian,Indo-European,SpaCyTokenizer,bg,FALSE,FALSE,TRUE,TRUE +bum,,Latn,Bulu (Cameroon),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bus,,Latn,Bokobaru,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bvc,,Latn,Baelelea,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bvd,,Latn,Baeggu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bvr,,Latn,Burarra,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bvz,,Latn,Bauzi,East Geelvink Bay,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+bwd,,Latn,Bwaidoka,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bwi,,Latn,Baniwa,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bwq,,Latn,Southern Bobo Madaré,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bwu,,Latn,Buli (Ghana),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +bxh,,Latn,Buhutu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bxr,,Cyrl,Russia Buriat,Mongolic,StanzaTokenizer,bxr,FALSE,FALSE,FALSE,TRUE +byr,,Latn,Baruya,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +byv,,Latn,Medumba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +byx,,Latn,Qaqet,East New Britain,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bzd,,Latn,Bribri,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +bzh,,Latn,Mapos Buang,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +bzi,,Thai,Bisu,Sino-Tibetan,ThaiTokenizer,,TRUE,TRUE,FALSE,TRUE +bzj,,Latn,Belize Kriol English,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +caa,,Latn,Chortí,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cab,,Latn,Garifuna,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cac,,Latn,Chuj,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +caf,,Latn,Southern Carrier,Eyak-Athabaskan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cag,,Latn,Nivaclé,Matacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cak,,Latn,Kaqchikel,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cao,,Latn,Chácobo,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cap,,Latn,Chipaya,Chipaya-Uru,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +caq,,Latn,Car Nicobarese,Austro-Asiatic,SpaCyTokenizer,vi,TRUE,TRUE,FALSE,TRUE +car,,Latn,Galibi Carib,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cas,,Latn,Tsimané,Mosetenan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cat,ca,Latn,Catalan,Indo-European,SpaCyTokenizer,ca,FALSE,FALSE,TRUE,TRUE +cav,,Latn,Cavineña,Tacanan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cax,,Latn,Chiquitano,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cbc,,Latn,Carapana,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+cbi,,Latn,Chachi,Barbacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cbk,,Latn,Chavacano,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cbr,,Latn,Cashibo-Cacataibo,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cbs,,Latn,Cashinahua,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cbt,,Latn,Chayahuita,Cahuapanan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cbu,,Latn,Candoshi-Shapra,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cbv,,Latn,Cacua,Puinavean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cce,,Latn,Chopi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +cco,,Latn,Comaltepec Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ccp,,Latn,Chakma,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +cdf,,Latn,Chiru,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ceb,,Latn,Cebuano,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +ceg,,Latn,Chamacoco,Zamucoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cek,,Latn,Eastern Khumi Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ces,cs,Latn,Czech,Indo-European,SpaCyTokenizer,cs,FALSE,FALSE,TRUE,TRUE +cfm,,Latn,Falam Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cgc,,Latn,Kagayanen,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +cgg,,Latn,Chiga,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +cha,ch,Latn,Chamorro,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +chd,,Latn,Highland Oaxaca Chontal,Tequistlatecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +che,ce,Cyrl,Chechen,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +chf,,Latn,Tabasco Chontal,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +chj,,Latn,Ojitlán Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +chk,,Latn,Chuukese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +chm,,Cyrl,Mari (Russia),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cho,,Latn,Choctaw,Muskogean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +chq,,Latn,Quiotepec Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+chr,,Cher,Cherokee,Iroquoian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +chr,,Latn,Cherokee,Iroquoian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +chu,cu,Cyrl,Church Slavic,Indo-European,StanzaTokenizer,cu,FALSE,FALSE,TRUE,TRUE +chv,cv,Cyrl,Chuvash,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +chw,,Latn,Chuwabu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +chz,,Latn,Ozumacín Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cjk,,Latn,Chokwe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +cjo,,Latn,Ashéninka Pajonal,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cjp,,Latn,Cabécar,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cjs,,Cyrl,Shor,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cjv,,Latn,Chuave,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ckb,ku,Arab,Central Kurdish,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,TRUE,TRUE +ckm,,Latn,Chakavian,Indo-European,SpaCyTokenizer,sr,TRUE,TRUE,FALSE,TRUE +cko,,Latn,Anufo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ckt,,Cyrl,Chukot,Chukotko-Kamchatkan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cle,,Latn,Lealao Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +clu,,Latn,Caluyanun,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +cly,,Latn,Eastern Highland Chatino,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cme,,Latn,Cerma,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +cmn,zh,Hani,Mandarin Chinese,Sino-Tibetan,SpaCyTokenizer,zh,FALSE,FALSE,TRUE,TRUE +cmo,,Khmr,Central Mnong,Austro-Asiatic,KhmerTokenizer,,FALSE,FALSE,FALSE,FALSE +cmo,,Latn,Central Mnong,Austro-Asiatic,SpaCyTokenizer,vi,TRUE,TRUE,FALSE,TRUE +cmr,,Latn,Mro-Khimi Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cnh,,Latn,Hakha Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cni,,Latn,Asháninka,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cnk,,Latn,Khumi Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cnl,,Latn,Lalana 
Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cnr,sh,Latn,Montenegrin,Indo-European,SpaCyTokenizer,sr,FALSE,FALSE,TRUE,TRUE +cnt,,Latn,Tepetotutla Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cnw,,Latn,Ngawn Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +coe,,Latn,Koreguaje,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cof,,Latn,Colorado,Barbacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cok,,Latn,Santa Teresa Cora,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +con,,Latn,Cofán,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cop,,Copt,Coptic,Afro-Asiatic,StanzaTokenizer,cop,FALSE,FALSE,FALSE,TRUE +cor,kw,Latn,Cornish,Indo-European,StanzaTokenizer,cy,TRUE,TRUE,TRUE,TRUE +cos,co,Latn,Corsican,Indo-European,SpaCyTokenizer,es,TRUE,TRUE,TRUE,TRUE +cot,,Latn,Caquinte,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cou,,Latn,Wamey,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +cpa,,Latn,Palantla Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cpb,,Latn,Ucayali-Yurúa Ashéninka,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cpc,,Latn,Ajyíninka Apurucayali,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cpu,,Latn,Pichis Ashéninka,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cpy,,Latn,South Ucayali Ashéninka,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cre,cr,Cans,Cree,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +cre,cr,Latn,Cree,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +crh,,Cyrl,Crimean Tatar,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +crh,,Latn,Crimean Tatar,Turkic,SpaCyTokenizer,tr,TRUE,TRUE,FALSE,TRUE +cri,,Latn,Sãotomense,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +crj,cr,Cans,Southern East Cree,Algic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +crk,cr,Cans,Plains Cree,Algic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +crk,cr,Latn,Plains Cree,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +crl,cr,Cans,Northern East Cree,Algic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE 
+crm,cr,Cans,Moose Cree,Algic,WhitespaceTokenizer,,TRUE,TRUE,TRUE,TRUE +crn,,Latn,El Nayar Cora,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +crs,,Latn,Seselwa Creole French,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +crt,,Latn,Iyojwa'ja Chorote,Matacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +crx,,Latn,Carrier,Eyak-Athabaskan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +csb,,Latn,Kashubian,Indo-European,SpaCyTokenizer,pl,TRUE,TRUE,FALSE,TRUE +csk,,Latn,Jola-Kasa,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +cso,,Latn,Sochiapam Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +csw,cr,Latn,Swampy Cree,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +csy,,Latn,Siyin Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cta,,Latn,Tataltepec Chatino,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ctd,,Latn,Tedim Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cto,,Latn,Emberá-Catío,Chocoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ctp,,Latn,Western Highland Chatino,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ctu,,Latn,Chol,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cub,,Latn,Cubeo,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cuc,,Latn,Usila Chinantec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cui,,Latn,Cuiba,Guajiboan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cuk,,Latn,San Blas Kuna,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cul,,Latn,Culina,Arauan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cut,,Latn,Teutila Cuicatec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cux,,Latn,Tepeuxila Cuicatec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cwd,cr,Cans,Woods Cree,Algic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +cwe,,Latn,Kwere,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +cwt,,Latn,Kuwaataay,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +cya,,Latn,Nopala Chatino,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +cym,cy,Latn,Welsh,Indo-European,StanzaTokenizer,cy,FALSE,FALSE,TRUE,TRUE 
+czt,,Latn,Zotung Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +daa,,Latn,Dangaléat,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dad,,Latn,Marik,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +daf,,Latn,,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dag,,Latn,Dagbani,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dah,,Latn,Gwahatike,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dak,,Latn,Dakota,Siouan-Catawban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dan,da,Latn,Danish,Indo-European,SpaCyTokenizer,da,FALSE,FALSE,TRUE,TRUE +dar,,Cyrl,Dargwa,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dbq,,Latn,Daba,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ddg,,Latn,Fataluku,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ddn,,Latn,Dendi (Benin),Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ded,,Latn,Dedua,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +des,,Latn,Desano,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +deu,de,Latn,German,Indo-European,SpaCyTokenizer,de,FALSE,FALSE,TRUE,TRUE +dga,,Latn,Southern Dagaare,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dgc,,Latn,Casiguran Dumagat Agta,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +dgi,,Latn,Northern Dagara,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dgr,,Latn,Tlicho,Eyak-Athabaskan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dgz,,Latn,Daga,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dhg,,Latn,Dhangu-Djangu,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dhm,,Latn,Zemba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dhv,,Latn,Dehu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +did,,Latn,Didinga,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dig,,Latn,Digo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dik,,Latn,Southwestern Dinka,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +din,,Latn,Dinka,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dip,,Latn,Northeastern 
Dinka,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +diq,,Latn,Dimli (individual language),Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +dis,,Latn,Dimasa,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +diu,,Latn,Diriku,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +div,dv,Thaa,Dhivehi,Indo-European,WhitespaceTokenizer,,TRUE,TRUE,TRUE,TRUE +dje,,Latn,Zarma,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +djk,,Latn,Eastern Maroon Creole,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +djr,,Latn,Djambarrpuyngu,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dks,,Latn,Southeastern Dinka,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dln,,Latn,Darlong,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dng,,Cyrl,Dungan,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dnj,,Latn,Dan,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dnw,,Latn,Western Dani,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dob,,Latn,Dobu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +doi,,Deva,Dogri (macrolanguage),,WhitespaceTokenizer,,FALSE,FALSE,FALSE,TRUE +dop,,Latn,Lukpa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dos,,Latn,Dogosé,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dow,,Latn,Doyayo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +drg,,Latn,Rungus,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +dru,,Latn,Rukai,Austronesian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dsb,,Latn,Lower Sorbian,Indo-European,SpaCyTokenizer,dsb,FALSE,FALSE,FALSE,TRUE +dsh,,Latn,Daasanach,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dtb,,Latn,Labuk-Kinabatangan Kadazan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +dtp,,Latn,Kadazan Dusun,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +dts,,Latn,Toro So Dogon,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dty,ne,Deva,Dotyali,Indo-European,IndicNLPTokenizer,ne,FALSE,FALSE,FALSE,TRUE +dua,,Latn,Duala,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+due,,Latn,Umiray Dumaget Agta,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +dug,,Latn,Duruma,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +duo,,Latn,Dupaninan Agta,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +dur,,Latn,Dii,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dwr,,Latn,Dawro,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dww,,Latn,Dawawa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +dyi,,Latn,Djimini Senoufo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +dyo,,Latn,Jola-Fonyi,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +dyu,,Latn,Dyula,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +dzo,dz,Tibt,Dzongkha,Sino-Tibetan,TibetanTokenizer,,FALSE,FALSE,TRUE,TRUE +ebk,,Latn,Eastern Bontok,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +efi,,Latn,Efik,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +eka,,Latn,Ekajuk,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ekk,et,Latn,Standard Estonian,Uralic,SpaCyTokenizer,et,FALSE,FALSE,TRUE,TRUE +eko,,Latn,Koti,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ell,el,Grek,Modern Greek (1453-),Indo-European,SpaCyTokenizer,el,FALSE,FALSE,TRUE,TRUE +eme,,Latn,Emerillon,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +emi,,Latn,Mussau-Emira,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +eml,,Latn,,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +emp,,Latn,Northern Emberá,Chocoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +enb,,Latn,Markweeta,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +eng,en,Latn,English,Indo-European,SpaCyTokenizer,en,FALSE,FALSE,TRUE,TRUE +enl,,Latn,Enlhet,Mascoyan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +enm,,Latn,Middle English (1100-1500),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +enq,,Latn,Enga,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +enx,,Latn,Enxet,Mascoyan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +epo,eo,Latn,Esperanto,Constructed language,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +eri,,Latn,Ogea,Trans-New 
Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ese,,Latn,Ese Ejja,Tacanan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +esi,ik,Latn,North Alaskan Inupiatun,Eskimo-Aleut,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +esk,ik,Latn,Northwest Alaska Inupiatun,Eskimo-Aleut,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +ess,,Latn,Central Siberian Yupik,Eskimo-Aleut,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +est,et,Latn,Estonian,,SpaCyTokenizer,et,FALSE,FALSE,FALSE,TRUE +esu,,Latn,Central Yupik,Eskimo-Aleut,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +eto,,Latn,Eton (Cameroon),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +etr,,Latn,Edolo,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +etu,,Latn,Ejagham,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +eus,eu,Latn,Basque,Language isolate,SpaCyTokenizer,eu,FALSE,FALSE,TRUE,TRUE +eve,,Cyrl,Even,Tungusic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ewe,ee,Latn,Ewe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +ewo,,Latn,Ewondo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ext,,Latn,Extremaduran,Indo-European,SpaCyTokenizer,es,TRUE,TRUE,FALSE,TRUE +eza,,Latn,Ezaa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +faa,,Latn,Fasu,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +fad,,Latn,Wagi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +fai,,Latn,Faiwol,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +fal,,Latn,South Fali,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +fan,,Latn,Fang (Equatorial Guinea),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +fao,fo,Latn,Faroese,Indo-European,SpaCyTokenizer,fo,FALSE,FALSE,TRUE,TRUE +far,,Latn,Fataleka,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +fas,fa,Arab,Persian,Indo-European,SpaCyTokenizer,fa,FALSE,FALSE,TRUE,TRUE +fat,ak,Latn,Fanti,,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +ffm,ff,Latn,Maasina Fulfulde,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +fij,fj,Latn,Fijian,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE 
+fil,,Latn,Filipino,Austronesian,SpaCyTokenizer,tl,FALSE,FALSE,FALSE,TRUE +fin,fi,Latn,Finnish,Uralic,SpaCyTokenizer,fi,FALSE,FALSE,TRUE,TRUE +fit,,Latn,Tornedalen Finnish,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE +fkv,,Latn,Kven Finnish,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE +fmu,,Deva,Far Western Muria,Dravidian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +fon,,Latn,Fon,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +for,,Latn,Fore,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +fra,fr,Latn,French,Indo-European,SpaCyTokenizer,fr,FALSE,FALSE,TRUE,TRUE +frd,,Latn,Fordata,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +fro,,Latn,Old French (842-ca. 1400),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +frp,,Latn,Arpitan,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,FALSE,TRUE +frr,,Latn,Northern Frisian,Indo-European,SpaCyTokenizer,en,TRUE,TRUE,FALSE,TRUE +fry,fy,Latn,Western Frisian,Indo-European,SpaCyTokenizer,en,TRUE,TRUE,TRUE,TRUE +fub,ff,Latn,Adamawa Fulfulde,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +fud,,Latn,East Futuna,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +fue,ff,Latn,Borgu Fulfulde,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +fuf,ff,Latn,Pular,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +fuh,ff,Latn,Western Niger Fulfulde,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +ful,ff,Arab,Fulah,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +ful,ff,Latn,Fulah,,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +fuq,ff,Latn,Central-Eastern Niger Fulfulde,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +fur,,Latn,Friulian,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,FALSE,TRUE +fuv,ff,Arab,Nigerian Fulfulde,Niger-Congo,WhitespaceTokenizer,,TRUE,TRUE,TRUE,FALSE +fuv,ff,Latn,Nigerian Fulfulde,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,TRUE,TRUE +gaa,,Latn,Ga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gag,,Cyrl,Gagauz,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE 
+gag,,Latn,Gagauz,Turkic,SpaCyTokenizer,tr,TRUE,TRUE,FALSE,TRUE +gah,,Latn,Alekano,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gai,,Latn,Borei,Ramu-Lower Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gam,,Latn,Kandawo,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gaw,,Latn,Nobonob,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gaz,om,Latn,West Central Oromo,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +gba,,Latn,Gbaya (Central African Republic),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gbi,,Latn,Galela,West Papuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gbo,,Latn,Northern Grebo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gbr,,Latn,Gbagyi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gcf,,Latn,Guadeloupean Creole French,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gcr,,Latn,Guianese Creole French,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gde,,Latn,Gude,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gdg,,Latn,Ga'dang,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +gdn,,Latn,Umanakaina,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gdr,,Latn,Wipi,Eastern Trans-Fly,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +geb,,Latn,Kire,Ramu-Lower Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gej,,Latn,Gen,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gfk,,Latn,Patpatar,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ghe,,Deva,Southern Ghale,Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +ghs,,Latn,Guhu-Samane,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gid,,Latn,Gidar,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gil,,Latn,Gilbertese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +giz,,Latn,South Giziga,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gjn,,Latn,Gonja,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gkn,,Latn,Gokana,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gla,gd,Latn,Scottish 
Gaelic,Indo-European,StanzaTokenizer,gd,FALSE,FALSE,TRUE,TRUE +gle,ga,Latn,Irish,Indo-European,SpaCyTokenizer,ga,FALSE,FALSE,TRUE,TRUE +glg,gl,Latn,Galician,Indo-European,StanzaTokenizer,gl,FALSE,FALSE,TRUE,TRUE +glk,,Arab,Gilaki,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +glv,gv,Latn,Manx,Indo-European,StanzaTokenizer,gv,FALSE,FALSE,TRUE,TRUE +gmh,,Latn,Middle High German (ca. 1050-1500),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gmv,,Ethi,Gamo,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +gmv,,Latn,Gamo,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gna,,Latn,Kaansa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gnb,,Latn,Gangte,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gnd,,Latn,Zulgo-Gemzek,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gng,,Latn,Ngangam,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gnn,,Latn,Gumatj,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gnw,gn,Latn,Western Bolivian Guaraní,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +goa,,Latn,Guro,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gof,,Ethi,Gofa,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +gof,,Latn,Gofa,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gog,,Latn,Gogo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +goh,,Latn,Old High German (ca. 
750-1050),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gom,,Deva,Goan Konkani,Indo-European,IndicNLPTokenizer,kK,FALSE,FALSE,FALSE,FALSE +gom,,Latn,Goan Konkani,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +gon,,Telu,Gondi,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +gor,,Latn,Gorontalo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +gos,,Latn,Gronings,Indo-European,SpaCyTokenizer,nl,TRUE,TRUE,FALSE,TRUE +got,,Goth,Gothic,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +got,,Latn,Gothic,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gqr,,Latn,Gor,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +grb,,Latn,Grebo,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +grc,,Grek,Ancient Greek (to 1453),Indo-European,SpaCyTokenizer,grc,FALSE,FALSE,FALSE,TRUE +grn,gn,Latn,Guarani,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +grt,,Beng,Garo,Sino-Tibetan,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +gso,,Latn,Southwest Gbaya,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gsw,,Latn,Swiss German,Indo-European,SpaCyTokenizer,de,TRUE,TRUE,FALSE,TRUE +gub,,Latn,Guajajára,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +guc,,Latn,Wayuu,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gud,,Latn,Yocoboué Dida,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gug,gn,Latn,Paraguayan Guaraní,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +guh,,Latn,Guahibo,Guajiboan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gui,gn,Latn,Eastern Bolivian Guaraní,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +guj,gu,Gujr,Gujarati,Indo-European,IndicNLPTokenizer,gu,FALSE,FALSE,TRUE,TRUE +guj,gu,Latn,Gujarati,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +guk,,Ethi,Gumuz,Nilo-Saharan,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +gul,,Latn,Sea Island Creole English,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +gum,,Latn,Guambiano,Paezan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gun,gn,Latn,Mbyá Guaraní,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +guo,,Latn,Guayabero,Guajiboan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+guq,,Latn,Aché,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gur,,Latn,Farefare,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +guu,,Latn,Yanomamö,Yanomaman,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +guw,,Latn,Gun,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gux,,Latn,Gourmanchéma,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +guz,,Latn,Gusii,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +gvc,,Latn,Guanano,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gvf,,Latn,Golin,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gvl,,Latn,Gulay,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gvn,,Latn,Kuku-Yalanji,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gwi,,Latn,Gwichʼin,Eyak-Athabaskan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gwr,,Latn,Gwere,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +gya,,Latn,Northwest Gbaya,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +gym,,Latn,Ngäbere,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +gyr,,Latn,Guarayu,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hac,,Arab,Gurani,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +hae,om,Latn,Eastern Oromo,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hag,,Latn,Hanga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +hak,zh,Hani,Hakka Chinese,Sino-Tibetan,SpaCyTokenizer,zh,FALSE,FALSE,FALSE,TRUE +hak,zh,Latn,Hakka Chinese,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +hat,ht,Latn,Haitian,Creole,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +hau,ha,Latn,Hausa,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +hav,,Latn,Havu,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +haw,,Latn,Hawaiian,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +hay,,Latn,Haya,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +hbo,,Hebr,Ancient Hebrew,Afro-Asiatic,StanzaTokenizer,hbo,FALSE,FALSE,FALSE,TRUE +hbs,sh,Cyrl,Serbo-Croatian,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +hbs,sh,Latn,Serbo-Croatian,,SpaCyTokenizer,sr,TRUE,TRUE,FALSE,TRUE 
+hch,,Latn,Huichol,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +heb,he,Hebr,Hebrew,Afro-Asiatic,SpaCyTokenizer,he,FALSE,FALSE,TRUE,TRUE +heg,,Latn,Helong,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +heh,,Latn,Hehe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +her,hz,Latn,Herero,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +hif,,Latn,Fiji Hindi,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +hig,,Latn,Kamwe,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hil,,Latn,Hiligaynon,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +hin,hi,Deva,Hindi,Indo-European,IndicNLPTokenizer,hi,FALSE,FALSE,TRUE,TRUE +hin,hi,Latn,Hindi,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +hix,,Latn,Hixkaryána,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hla,,Latn,Halia,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +hlt,,Latn,Matu Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hmn,,Latn,Hmong,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hmo,ho,Latn,Hiri Motu,Pidgin,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +hmr,,Latn,Hmar,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hne,,Deva,Chhattisgarhi,Indo-European,IndicNLPTokenizer,ne,TRUE,TRUE,FALSE,TRUE +hnj,,Latn,Hmong Njua,Hmong-Mien,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hnn,,Latn,Hanunoo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +hns,,Latn,Caribbean Hindustani,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +hoc,,Latn,Ho,Austro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hoc,,Wara,Ho,Austro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +hop,,Latn,Hopi,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hot,,Latn,Hote,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +hra,,Latn,Hrangkhol,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hrv,hr,Latn,Croatian,Indo-European,SpaCyTokenizer,hr,FALSE,FALSE,TRUE,TRUE +hrx,,Latn,Hunsrik,Indo-European,SpaCyTokenizer,de,TRUE,TRUE,FALSE,TRUE +hsb,,Latn,Upper 
Sorbian,Indo-European,SpaCyTokenizer,hsb,FALSE,FALSE,FALSE,TRUE +hto,,Latn,Minica Huitoto,Witotoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hub,,Latn,Huambisa,Jivaroan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hui,,Latn,Huli,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hun,hu,Latn,Hungarian,Uralic,SpaCyTokenizer,hu,FALSE,FALSE,TRUE,TRUE +hus,,Latn,Huastec,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +huu,,Latn,Murui Huitoto,Witotoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +huv,,Latn,San Mateo Del Mar Huave,Huavean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +hvn,,Latn,Sabu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +hwc,,Latn,Hawai'i Creole English,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +hye,hy,Armn,Armenian,Indo-European,SpaCyTokenizer,hy,FALSE,FALSE,TRUE,TRUE +hyw,,Armn,Western Armenian,Indo-European,StanzaTokenizer,hyw,FALSE,FALSE,FALSE,TRUE +ian,,Latn,Iatmul,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +iba,,Latn,Iban,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ibg,,Latn,Ibanag,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ibo,ig,Latn,Igbo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +icr,,Latn,Islander Creole English,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +ido,io,Latn,Ido,,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +idu,,Latn,Idoma,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ifa,,Latn,Amganad Ifugao,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ifb,,Latn,Batad Ifugao,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ife,,Latn,Ifè,Niger-Congo,SpaCyTokenizer,yo,TRUE,TRUE,FALSE,TRUE +ifk,,Latn,Tuwali Ifugao,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ifu,,Latn,Mayoyao Ifugao,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ify,,Latn,Keley-I Kallahan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ige,,Latn,Igede,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ign,,Latn,Ignaciano,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ike,iu,Cans,Eastern Canadian 
Inuktitut,Eskimo-Aleut,WhitespaceTokenizer,,TRUE,TRUE,TRUE,TRUE +ikk,,Latn,Ika,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ikt,iu,Latn,Inuinnaqtun,Eskimo-Aleut,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +iku,iu,Cans,Inuktitut,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +iku,iu,Latn,Inuktitut,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +ikw,,Latn,Ikwere,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ilb,,Latn,Ila,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ile,ie,Latn,Interlingue,,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +ilo,,Latn,Iloko,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +imo,,Latn,Imbongu,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ina,ia,Latn,Interlingua (International Auxiliary Language Association),,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +inb,,Latn,Inga,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ind,id,Latn,Indonesian,Austronesian,SpaCyTokenizer,id,FALSE,FALSE,TRUE,TRUE +inh,,Cyrl,Ingush,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ino,,Latn,Inoke-Yate,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +iou,,Latn,Tuma-Irumu,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ipi,,Latn,Ipili,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ipk,ik,Latn,Inupiaq,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +iqw,,Latn,Ikwo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +iri,,Latn,Rigwe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +irk,,Latn,Iraqw,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +iry,,Latn,Iraya,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +isd,,Latn,Isnag,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ish,,Latn,Esan,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +isl,is,Latn,Icelandic,Indo-European,SpaCyTokenizer,is,FALSE,FALSE,TRUE,TRUE +iso,,Latn,Isoko,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ita,it,Latn,Italian,Indo-European,SpaCyTokenizer,it,FALSE,FALSE,TRUE,TRUE 
+itl,,Cyrl,Itelmen,Chukotko-Kamchatkan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +itv,,Latn,Itawit,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ium,,Latn,Iu Mien,Hmong-Mien,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ivb,,Latn,Ibatan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ivv,,Latn,Ivatan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +iws,,Latn,Sepik Iwam,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ixl,,Latn,Ixil,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +izr,,Latn,Izere,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +izz,,Latn,Izii,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +jaa,,Latn,Jamamadí,Arauan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +jac,,Latn,Popti',Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +jae,,Latn,Yabem,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +jam,,Latn,Jamaican Creole English,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +jav,jv,Java,Javanese,Austronesian,,,TRUE,TRUE,TRUE,FALSE +jav,jv,Latn,Javanese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +jbo,,Latn,Lojban,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +jbu,,Latn,Jukun Takum,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +jic,,Latn,Tol,Jicaquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +jiv,,Latn,Shuar,Jivaroan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +jmc,,Latn,Machame,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +jpn,ja,Jpan,Japanese,Japonic,SpaCyTokenizer,ja,FALSE,FALSE,TRUE,TRUE +jra,,Latn,Jarai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +jun,,Orya,Juang,Austro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +jvn,,Latn,Caribbean Javanese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kaa,,Cyrl,Kara-Kalpak,Turkic,StanzaTokenizer,kk,TRUE,TRUE,FALSE,TRUE +kaa,,Latn,Kara-Kalpak,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +kab,,Latn,Kabyle,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kac,,Latn,Kachin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+kak,,Latn,Kalanguya,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kal,kl,Latn,Kalaallisut,Eskimo-Aleut,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +kam,,Latn,Kamba (Kenya),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kan,kn,Knda,Kannada,Dravidian,IndicNLPTokenizer,kn,FALSE,FALSE,TRUE,TRUE +kan,kn,Latn,Kannada,Dravidian,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +kao,,Latn,Xaasongaxango,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kap,,Cyrl,Bezhta,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kaq,,Latn,Capanahua,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kas,ks,Arab,Kashmiri,Indo-European,IndicNLPTokenizer,ur,FALSE,FALSE,TRUE,FALSE +kas,ks,Deva,Kashmiri,Indo-European,IndicNLPTokenizer,mr,TRUE,TRUE,TRUE,TRUE +kas,ks,Latn,Kashmiri,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +kat,ka,Geor,Georgian,Kartvelian,WhitespaceTokenizer,,FALSE,FALSE,TRUE,TRUE +kau,kr,Arab,Kanuri,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +kau,kr,Latn,Kanuri,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +kaz,kk,Cyrl,Kazakh,Turkic,StanzaTokenizer,kk,FALSE,FALSE,TRUE,TRUE +kbc,,Latn,Kadiwéu,Guaykuruan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kbd,,Cyrl,Kabardian,Abkhaz-Adyghe,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kbh,,Latn,Camsá,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kbm,,Latn,Iwal,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kbo,,Latn,Keliko,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kbp,,Latn,Kabiyè,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kbq,,Latn,Kamano,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kbr,,Latn,Kafa,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kby,kr,Latn,Manga Kanuri,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kca,,Cyrl,Khanty,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kcg,,Latn,Tyap,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kck,,Latn,Kalanga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kdc,,Latn,Kutu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+kde,,Latn,Makonde,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kdh,,Latn,Tem,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kdi,,Latn,Kumam,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kdj,,Latn,Karamojong,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kdl,,Latn,Tsikimba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kdp,,Latn,Kaningdon-Nindem,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kdr,,Latn,Karaim,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kea,,Latn,Kabuverdianu,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kei,,Latn,Kei,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kek,,Latn,Kekchí,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ken,,Latn,Kenyang,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +keo,,Latn,Kakwa,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ker,,Latn,Kera,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kew,,Latn,West Kewa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kex,,Deva,Kukna,Indo-European,IndicNLPTokenizer,kK,TRUE,TRUE,FALSE,TRUE +kez,,Latn,Kukele,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kff,,Telu,Koya,Dravidian,IndicNLPTokenizer,te,TRUE,TRUE,FALSE,TRUE +kgf,,Latn,Kube,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kgk,,Latn,Kaiwá,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kgp,,Latn,Kaingang,Jean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kgr,,Latn,Abun,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kha,,Latn,Khasi,Austro-Asiatic,SpaCyTokenizer,vi,TRUE,TRUE,FALSE,TRUE +khk,mn,Cyrl,Halh Mongolian,Mongolic,StanzaTokenizer,bxr,TRUE,TRUE,TRUE,TRUE +khm,km,Khmr,Khmer,Austro-Asiatic,KhmerTokenizer,,FALSE,FALSE,TRUE,TRUE +khq,,Latn,Koyra Chiini Songhay,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +khs,,Latn,Kasua,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +khy,,Latn,Kele (Democratic Republic of Congo),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+khz,,Latn,Keapara,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kia,,Latn,Kim,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kij,,Latn,Kilivila,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kik,ki,Latn,Kikuyu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +kin,rw,Latn,Kinyarwanda,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,TRUE,TRUE +kir,ky,Cyrl,Kirghiz,Turkic,SpaCyTokenizer,ky,FALSE,FALSE,TRUE,TRUE +kiu,,Latn,Kirmanjki (individual language),Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +kix,,Latn,Khiamniungan Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kjb,,Latn,Q'anjob'al,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kje,,Latn,Kisar,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kjh,,Cyrl,Khakas,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kjs,,Latn,East Kewa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kkc,,Latn,Odoodee,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kki,,Latn,Kagulu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kkj,,Latn,Kako,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kkl,,Latn,Kosarek Yale,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kle,,Deva,Kulung (Nepal),Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +kln,,Latn,Kalenjin,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +klt,,Latn,Nukna,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +klv,,Latn,Maskelynes,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kma,,Latn,Konni,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kmb,,Latn,Kimbundu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kmd,,Latn,Majukayang Kalinga,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kmg,,Latn,Kâte,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kmh,,Latn,Kalam,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kmk,,Latn,Limos Kalinga,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kmm,,Latn,Kom (India),Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+kmo,,Latn,Kwoma,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kmr,ku,Cyrl,Northern Kurdish,Indo-European,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +kmr,ku,Latn,Northern Kurdish,Indo-European,StanzaTokenizer,kmr,FALSE,FALSE,FALSE,TRUE +kms,,Latn,Kamasau,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kmu,,Latn,Kanite,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kmy,,Latn,Koma,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +knc,kr,Arab,Central Kanuri,Nilo-Saharan,WhitespaceTokenizer,,TRUE,TRUE,TRUE,TRUE +knc,kr,Latn,Central Kanuri,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +kne,,Latn,Kankanaey,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +knf,,Latn,Mankanya,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +kng,kg,Latn,Koongo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +knj,,Latn,Western Kanjobal,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +knk,,Latn,Kuranko,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kno,,Latn,Kono (Sierra Leone),Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +knv,,Latn,Tabo,South-Central Papuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +knx,,Latn,Kendayan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kny,,Latn,Kanyok,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kog,,Latn,Cogui,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +koi,kv,Cyrl,Komi-Permyak,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kok,,Deva,Konkani (macrolanguage),,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +kok,,Latn,Konkani (macrolanguage),,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +kom,kv,Cyrl,Komi,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kon,kg,Latn,Kongo,,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +koo,,Latn,Konzo,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +kor,ko,Hang,Korean,Koreanic,KiwiTokenizer,,FALSE,FALSE,TRUE,TRUE +kos,,Latn,Kosraean,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kpe,,Latn,Kpelle,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kpf,,Latn,Komba,Trans-New 
Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kpg,,Latn,Kapingamarangi,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kpj,,Latn,Karajá,Karajá,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kpq,,Latn,Korupun-Sela,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kpr,,Latn,Korafe-Yegha,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kpv,kv,Cyrl,Komi-Zyrian,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +kpw,,Latn,Kobon,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kpx,,Latn,Mountain Koiali,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kpz,,Latn,Kupsabiny,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kqa,,Latn,Mum,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kqc,,Latn,Doromu-Koki,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kqe,,Latn,Kalagan,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +kqf,,Latn,Kakabai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kql,,Latn,Kyenele,Yuat,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kqn,,Latn,Kaonde,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kqo,,Latn,Eastern Krahn,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kqp,,Latn,Kimré,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kqs,,Latn,Northern Kissi,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +kqw,,Latn,Kandas,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kqy,,Ethi,Koorete,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +krc,,Cyrl,Karachay-Balkar,Turkic,StanzaTokenizer,kk,TRUE,TRUE,FALSE,TRUE +kri,,Latn,Krio,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +krj,,Latn,Kinaray-A,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +krl,,Latn,Karelian,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE +kru,,Deva,Kurukh,Dravidian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +krx,,Latn,Karon,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +ksb,,Latn,Shambala,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ksc,,Latn,Southern 
Kalinga,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ksd,,Latn,Kuanua,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ksf,,Latn,Bafia,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ksh,,Latn,Kölsch,Indo-European,SpaCyTokenizer,lb,TRUE,TRUE,FALSE,TRUE +ksj,,Latn,Uare,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ksp,,Latn,Kaba,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ksr,,Latn,Borong,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kss,,Latn,Southern Kisi,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +ksw,,Mymr,S'gaw Karen,Sino-Tibetan,BurmeseTokenizer,,TRUE,TRUE,FALSE,TRUE +ktb,,Ethi,Kambaata,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +ktj,,Latn,Plapo Krumen,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ktm,,Latn,Kurti,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kto,,Latn,Kuot,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ktu,,Latn,Kituba (Democratic Republic of Congo),Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ktz,,Latn,Juǀʼhoan,Kx’a,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kua,kj,Latn,Kuanyama,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +kub,,Latn,Kutep,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kud,,Latn,Auhelawa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kue,,Latn,Kuman (Papua New Guinea),Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kuj,,Latn,Kuria,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +kum,,Cyrl,Kumyk,Turkic,StanzaTokenizer,kk,TRUE,TRUE,FALSE,TRUE +kup,,Latn,Kunimaipa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kur,ku,Arab,Kurdish,,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +kur,ku,Cyrl,Kurdish,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +kur,ku,Latn,Kurdish,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +kus,,Latn,Kusaal,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kvg,,Latn,Kuni-Boazi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+kvj,,Latn,Psikye,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kvn,,Latn,Border Kuna,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kwd,,Latn,Kwaio,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kwf,,Latn,Kwara'ae,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kwi,,Latn,Awa-Cuaiquer,Barbacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kwj,,Latn,Kwanga,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kwn,,Latn,Kwangali,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kwy,kg,Latn,San Salvador Kongo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kxc,,Ethi,Konso,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +kxm,,Thai,Northern Khmer,Austro-Asiatic,ThaiTokenizer,,TRUE,TRUE,FALSE,TRUE +kxw,,Latn,Konai,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kyc,,Latn,Kyaka,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kyf,,Latn,Kouya,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +kyg,,Latn,Keyagana,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kyq,,Latn,Kenga,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kyu,,Kali,Western Kayah,Sino-Tibetan,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +kyu,,Latn,Western Kayah,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kyu,,Mymr,Western Kayah,Sino-Tibetan,BurmeseTokenizer,,TRUE,TRUE,FALSE,FALSE +kyz,,Latn,Kayabí,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kze,,Latn,Kosena,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kzf,,Latn,Da'a Kaili,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +kzj,,Latn,,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +kzn,,Latn,Kokola,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lac,,Latn,Lacandon,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lad,,Hebr,Ladino,Indo-European,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +lad,,Latn,Ladino,Indo-European,SpaCyTokenizer,es,TRUE,TRUE,FALSE,TRUE +lah,,Arab,Lahnda,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +lai,,Latn,Lambya,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+laj,,Latn,Lango (Uganda),Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lam,,Latn,Lamba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lao,lo,Laoo,Lao,Kra-Dai,LaoTokenizer,,FALSE,FALSE,TRUE,TRUE +lap,,Latn,Laka (Chad),Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +las,,Latn,Lama (Togo),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lat,la,Latn,Latin,Indo-European,SpaCyTokenizer,la,FALSE,FALSE,TRUE,TRUE +lav,lv,Latn,Latvian,,SpaCyTokenizer,lv,FALSE,FALSE,FALSE,TRUE +law,,Latn,Lauje,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lbb,,Latn,Label,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lbe,,Cyrl,Lak,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lbj,,Tibt,Ladakhi,Sino-Tibetan,TibetanTokenizer,,TRUE,TRUE,FALSE,TRUE +lbk,,Latn,Central Bontok,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lcm,,Latn,Tungag,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lcp,,Thai,Western Lawa,Austro-Asiatic,ThaiTokenizer,,TRUE,TRUE,FALSE,TRUE +ldi,kg,Latn,Laari,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ldn,,Latn,Láadan,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lea,,Latn,Lega-Shabunda,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +led,,Latn,Lendu,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lee,,Latn,Lyélé,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lef,,Latn,Lelemi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +leh,,Latn,Lenje,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lem,,Latn,Nomaande,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +leu,,Latn,Kara (Papua New Guinea),Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lew,,Latn,Ledo Kaili,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lex,,Latn,Luang,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lez,,Cyrl,Lezghian,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lfn,,Cyrl,Lingua Franca Nova,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +lfn,,Latn,Lingua Franca Nova,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+lgg,,Latn,Lugbara,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lgl,,Latn,Wala,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lgm,,Latn,Lega-Mwenga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lhi,,Latn,Lahu Shi,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lhu,,Latn,Lahu,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lia,,Latn,West-Central Limba,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +lid,,Latn,Nyindrou,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lif,,Deva,Limbu,Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +lif,,Limb,Limbu,Sino-Tibetan,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +lij,,Latn,Ligurian,Indo-European,SpaCyTokenizer,lij,FALSE,FALSE,FALSE,TRUE +lim,li,Latn,Limburgan,Indo-European,SpaCyTokenizer,nl,TRUE,TRUE,TRUE,TRUE +lin,ln,Latn,Lingala,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +lip,,Latn,Sekpele,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lis,,Lisu,Lisu,Sino-Tibetan,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +lit,lt,Latn,Lithuanian,Indo-European,SpaCyTokenizer,lt,FALSE,FALSE,TRUE,TRUE +liv,,Latn,Liv,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE +ljp,,Latn,Lampung Api,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lki,,Arab,Laki,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +llb,,Latn,Lolo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lld,,Latn,Ladin,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,FALSE,TRUE +llg,,Latn,Lole,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lln,,Latn,Lele (Chad),Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lmk,,Latn,Lamkang,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lmo,,Latn,Lombard,Indo-European,SpaCyTokenizer,lij,TRUE,TRUE,FALSE,TRUE +lmp,,Latn,Limbum,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lnd,,Latn,Lundayeh,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lob,,Latn,Lobi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+loe,,Latn,Saluan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +log,,Latn,Logo,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lok,,Latn,Loko,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lol,,Latn,Mongo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lom,,Latn,Loma (Liberia),Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +loq,,Latn,Lobala,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +loz,,Latn,Lozi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lrc,,Arab,Northern Luri,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +lsi,,Latn,Lashi,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lsm,,Latn,Saamia,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +ltg,lv,Latn,Latgalian,Indo-European,SpaCyTokenizer,lv,FALSE,FALSE,FALSE,TRUE +ltz,lb,Latn,Luxembourgish,Indo-European,SpaCyTokenizer,lb,FALSE,FALSE,TRUE,TRUE +lua,,Latn,Luba-Lulua,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lub,lu,Latn,Luba-Katanga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +luc,,Latn,Aringa,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lud,,Latn,Ludian,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE +lue,,Latn,Luvale,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +lug,lg,Latn,Ganda,Niger-Congo,SpaCyTokenizer,lg,FALSE,FALSE,TRUE,TRUE +lun,,Latn,Lunda,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +luo,,Latn,Luo (Kenya and Tanzania),Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lus,,Latn,Lushai,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +luy,,Latn,Luyia,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lvs,lv,Latn,Standard Latvian,Indo-European,SpaCyTokenizer,lv,FALSE,FALSE,TRUE,TRUE +lwg,,Latn,Wanga,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +lwo,,Latn,Luwo,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +lww,,Latn,Lewo,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +lzh,zh,Hani,Literary Chinese,Sino-Tibetan,StanzaTokenizer,lzh,FALSE,FALSE,FALSE,TRUE +maa,,Latn,San Jerónimo Tecóatl 
Mazatec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mad,,Latn,Madurese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +maf,,Latn,Mafa,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mag,,Deva,Magahi,Indo-European,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +mah,mh,Latn,Marshallese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +mai,,Deva,Maithili,Indo-European,IndicNLPTokenizer,hi,FALSE,FALSE,FALSE,TRUE +maj,,Latn,Jalapa De Díaz Mazatec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mak,,Latn,Makasar,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mal,ml,Latn,Malayalam,Dravidian,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +mal,ml,Mlym,Malayalam,Dravidian,IndicNLPTokenizer,ml,FALSE,FALSE,TRUE,TRUE +mam,,Latn,Mam,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +man,,Latn,Mandingo,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +maq,,Latn,Chiquihuitlán Mazatec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mar,mr,Deva,Marathi,Indo-European,IndicNLPTokenizer,mr,FALSE,FALSE,TRUE,TRUE +mar,mr,Latn,Marathi,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +mas,,Latn,Masai,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mau,,Latn,Huautla Mazatec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mav,,Latn,Sateré-Mawé,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +maw,,Latn,Mampruli,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +max,ms,Latn,North Moluccan Malay,Creole,SpaCyTokenizer,ms,FALSE,FALSE,FALSE,TRUE +maz,,Latn,Central Mazahua,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mbb,,Latn,Western Bukidnon Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +mbc,,Latn,Macushi,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mbd,,Latn,Dibabawon Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +mbf,,Latn,Baba Malay,Creole,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mbh,,Latn,Mangseng,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mbi,,Latn,Ilianen Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE 
+mbj,,Latn,Nadëb,Puinavean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mbl,,Latn,Maxakalí,Maxakalian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mbs,,Latn,Sarangani Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +mbt,,Latn,Matigsalug Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +mca,,Latn,Maca,Matacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mcb,,Latn,Machiguenga,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mcd,,Latn,Sharanahua,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mcf,,Latn,Matsés,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mck,,Latn,Mbunda,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mcn,,Latn,Masana,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mco,,Latn,Coatlán Mixe,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mcp,,Latn,Makaa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mcq,,Latn,Ese,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mcu,,Latn,Cameroon Mambila,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mda,,Latn,Mada (Nigeria),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mdf,,Cyrl,Moksha,Uralic,StanzaTokenizer,myv,TRUE,TRUE,FALSE,TRUE +mdy,,Ethi,Male (Ethiopia),Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +med,,Latn,Melpa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mee,,Latn,Mengen,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mej,,Latn,Meyah,East Bird’s Head-Sentani,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mek,,Latn,Mekeo,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +men,,Latn,Mende (Sierra Leone),Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +meq,,Latn,Merey,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mer,,Latn,Meru,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +met,,Latn,Mato,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +meu,,Latn,Motu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mev,,Latn,Mano,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+mfe,,Latn,Morisyen,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mfg,,Latn,Mogofin,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mfh,,Latn,Matal,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mfi,,Latn,Wandala,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mfk,,Latn,North Mofu,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mfq,,Latn,Moba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mfy,,Latn,Mayo,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mfz,,Latn,Mabaan,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mgc,,Latn,Morokodo,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mgh,,Latn,Makhuwa-Meetto,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mgm,,Latn,Mambae,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mgo,,Latn,Meta',Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mgr,,Latn,Mambwe-Lungu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mhi,,Latn,Ma'di,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mhl,,Latn,Mauwake,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mhr,,Cyrl,Eastern Mari,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mhw,,Latn,Mbukushu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mhx,,Latn,Maru,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mhy,,Latn,Ma'anyan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mib,,Latn,Atatláhuca Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mic,,Latn,Mi'kmaq,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mie,,Latn,Ocotepec Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mif,,Latn,Mofu-Gudur,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mig,,Latn,San Miguel El Grande Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mih,,Latn,Chayuco Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mil,,Latn,Peñoles Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mim,,Latn,Alacatlatzala Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+min,ms,Arab,Minangkabau,Austronesian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +min,ms,Latn,Minangkabau,Austronesian,SpaCyTokenizer,ms,FALSE,FALSE,FALSE,TRUE +mio,,Latn,Pinotepa Nacional Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mip,,Latn,Apasco-Apoala Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +miq,,Latn,Mískito,Misumalpan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mir,,Latn,Isthmus Mixe,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mit,,Latn,Southern Puebla Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +miy,,Latn,Ayutla Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +miz,,Latn,Coatzospan Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mjc,,Latn,San Juan Colorado Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mjw,,Latn,Karbi,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mkd,mk,Cyrl,Macedonian,Indo-European,SpaCyTokenizer,mk,FALSE,FALSE,TRUE,TRUE +mkl,,Latn,Mokole,Niger-Congo,SpaCyTokenizer,yo,TRUE,TRUE,FALSE,TRUE +mkn,,Latn,Kupang Malay,Creole,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mks,,Latn,Silacayoapan Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mkz,,Latn,Makasae,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mlg,mg,Latn,Malagasy,,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mlh,,Latn,Mape,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mlp,,Latn,Bargam,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mlt,mt,Latn,Maltese,Afro-Asiatic,StanzaTokenizer,mt,FALSE,FALSE,TRUE,TRUE +mlu,,Latn,To'abaita,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mmn,,Latn,Mamanwa,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +mmo,,Latn,Mangga Buang,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mmx,,Latn,Madak,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mna,,Latn,Mbula,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mnb,,Latn,Muna,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE 
+mnf,,Latn,Mundani,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mni,,Beng,Manipuri,Sino-Tibetan,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +mni,,Latn,Manipuri,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mni,,Mtei,Manipuri,Sino-Tibetan,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +mnk,,Latn,Mandinka,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mns,,Cyrl,Mansi,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mnw,,Mymr,Mon,Austro-Asiatic,BurmeseTokenizer,,TRUE,TRUE,FALSE,TRUE +mnx,,Latn,Manikion,East Bird’s Head-Sentani,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mny,,Latn,Manyawa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +moa,,Latn,Mwan,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +moc,,Latn,Mocoví,Guaykuruan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mog,,Latn,Mongondow,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +moh,,Latn,Mohawk,Iroquoian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mon,mn,Cyrl,Mongolian,,StanzaTokenizer,bxr,TRUE,TRUE,FALSE,TRUE +mop,,Latn,Mopán Maya,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mor,,Latn,Moro,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mos,,Latn,Mossi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mox,,Latn,Molima,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mpg,,Latn,Marba,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mph,,Latn,Maung,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mpm,,Latn,Yosondúa Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mpp,,Latn,Migabac,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mps,,Latn,Dadibi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mpt,,Latn,Mian,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mpx,,Latn,Misima-Panaeati,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mqb,,Latn,Mbuko,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mqj,,Latn,Mamasa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mqy,,Latn,Manggarai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE 
+mrg,,Latn,Mising,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mri,mi,Latn,Maori,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +mrj,,Cyrl,Western Mari,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mrq,,Latn,North Marquesan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mrv,,Latn,Mangareva,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mrw,,Latn,Maranao,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +msa,ms,Arab,Malay (macrolanguage),,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +msa,ms,Latn,Malay (macrolanguage),,SpaCyTokenizer,ms,FALSE,FALSE,FALSE,TRUE +msa,ms,Thai,Malay (macrolanguage),,ThaiTokenizer,,TRUE,TRUE,FALSE,FALSE +msb,,Latn,Masbatenyo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +msc,,Latn,Sankaran Maninka,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mse,,Latn,Musey,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +msk,,Latn,Mansaka,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +msm,,Latn,Agusan Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +msy,,Latn,Aruamu,Ramu-Lower Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mta,,Latn,Cotabato Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +mtg,,Latn,Una,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mti,,Latn,Maiwa (Papua New Guinea),Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mtj,,Latn,Moskona,East Bird’s Head-Sentani,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mto,,Latn,Totontepec Mixe,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mtp,,Latn,Wichí Lhamtés Nocten,Matacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mua,,Latn,Mundang,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mug,,Latn,Musgu,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +muh,,Latn,Mündü,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mui,ms,Latn,Musi,Austronesian,SpaCyTokenizer,ms,FALSE,FALSE,FALSE,TRUE +mup,,Deva,Malvi,Indo-European,IndicNLPTokenizer,ne,TRUE,TRUE,FALSE,TRUE 
+mur,,Latn,Murle,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mus,,Latn,Creek,Muskogean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mux,,Latn,Bo-Ung,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +muy,,Latn,Muyang,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mva,,Latn,Manam,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mvn,,Latn,Minaveha,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mvp,,Latn,Duri,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mwc,,Latn,Are,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mwf,,Latn,Murrinh-Patha,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mwl,,Latn,Mirandese,Indo-European,SpaCyTokenizer,es,TRUE,TRUE,FALSE,TRUE +mwm,,Latn,Sar,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mwn,,Latn,Nyamwanga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mwp,,Latn,Kala Lagaw Ya,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mwq,,Latn,Mün Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mwv,,Latn,Mentawai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +mww,,Latn,Hmong Daw,Hmong-Mien,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mxb,,Latn,Tezoatlán Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mxp,,Latn,Tlahuitoltepec Mixe,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mxq,,Latn,Juquila Mixe,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mxt,,Latn,Jamiltepec Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mxv,,Latn,Metlatónoc Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mya,my,Mymr,Burmese,Sino-Tibetan,BurmeseTokenizer,,FALSE,FALSE,TRUE,TRUE +myb,,Latn,Mbay,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +myk,,Latn,Mamara Senoufo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +myu,,Latn,Mundurukú,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +myv,,Cyrl,Erzya,Uralic,StanzaTokenizer,myv,FALSE,FALSE,FALSE,TRUE +myw,,Latn,Muyuw,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE 
+myx,,Latn,Masaaba,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +myy,,Latn,Macuna,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mza,,Latn,Santa María Zacatepec Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mzh,,Latn,Wichí Lhamtés Güisnay,Matacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mzk,,Latn,Nigeria Mambila,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mzl,,Latn,Mazatlán Mixe,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +mzm,,Latn,Mumuye,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mzn,,Arab,Mazanderani,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +mzw,,Latn,Deg,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +mzz,,Latn,Maiadomu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nab,,Latn,Southern Nambikuára,Nambikwara,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +naf,,Latn,Nabak,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nah,,Latn,,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nak,,Latn,Nakanai,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nan,,Hani,Min Nan Chinese,,SpaCyTokenizer,zh,TRUE,TRUE,FALSE,TRUE +nan,,Latn,Min Nan Chinese,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +nap,,Latn,Neapolitan,Indo-European,SpaCyTokenizer,it,TRUE,TRUE,FALSE,TRUE +naq,,Latn,Khoekhoe,Khoe-Kwadi,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nas,,Latn,Naasioi,South Bougainville,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nav,nv,Latn,Navajo,Eyak-Athabaskan,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +naw,,Latn,Nawuri,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nba,,Latn,Nyemba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nbc,,Latn,Chang Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nbe,,Latn,Konyak Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nbl,nr,Latn,South Ndebele,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +nbq,,Latn,Nggem,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nbu,,Latn,Rongmei Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nca,,Latn,Iyo,Trans-New 
Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nch,,Latn,Central Huasteca Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ncj,,Latn,Northern Puebla Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ncl,,Latn,Michoacán Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ncq,,Laoo,Northern Katang,Austro-Asiatic,LaoTokenizer,,TRUE,TRUE,FALSE,TRUE +nct,,Latn,Chothe Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ncu,,Latn,Chumburung,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ncx,,Latn,Central Puebla Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ndc,,Latn,Ndau,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nde,nd,Latn,North Ndebele,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +ndh,,Latn,Ndali,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ndi,,Latn,Samba Leko,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ndj,,Latn,Ndamba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ndo,ng,Latn,Ndonga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +ndp,,Latn,Ndo,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nds,,Latn,Low German,Indo-European,SpaCyTokenizer,nl,TRUE,TRUE,FALSE,TRUE +ndy,,Latn,Lutos,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ndz,,Latn,Ndogo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +neb,,Latn,Toura (Côte d'Ivoire),Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nep,ne,Deva,Nepali (macrolanguage),,IndicNLPTokenizer,ne,FALSE,FALSE,FALSE,TRUE +nep,ne,Latn,Nepali (macrolanguage),,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,FALSE +new,,Deva,Newari,Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +nfa,,Latn,Dhao,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nfr,,Latn,Nafaanra,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ngb,,Latn,Northern Ngbandi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ngc,,Latn,Ngombe (Democratic Republic of Congo),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+ngl,,Latn,Lomwe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ngp,,Latn,Ngulu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ngu,,Latn,Guerrero Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhd,gn,Latn,Chiripá,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhe,,Latn,Eastern Huasteca Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhg,,Latn,Tetelcingo Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhi,,Latn,Zacatlán-Ahuacatlán-Tepetzintla Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhk,,Latn,Isthmus-Cosoleacaque Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nho,,Latn,Takuu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nhr,,Latn,Naro,Khoe-Kwadi,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhu,,Latn,Noone,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nhw,,Latn,Western Huasteca Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhx,,Latn,Isthmus-Mecayapan Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nhy,,Latn,Northern Oaxaca Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nia,,Latn,Nias,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nif,,Latn,Nek,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nii,,Latn,Nii,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nij,,Latn,Ngaju,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nim,,Latn,Nilamba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nin,,Latn,Ninzo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nio,,Cyrl,Nganasan,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +niq,,Latn,Nandi,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +niu,,Latn,Niuean,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +niy,,Latn,Ngiti,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +njb,,Latn,Nocte Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +njm,,Latn,Angami Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +njn,,Latn,Liangmai 
Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +njo,,Latn,Ao Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +njz,,Latn,Nyishi,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nkf,,Latn,Inpui Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nki,,Latn,Thangal Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nko,,Latn,Nkonya,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nla,,Latn,Ngombale,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nlc,,Latn,Nalca,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nld,nl,Latn,Dutch,Indo-European,SpaCyTokenizer,nl,FALSE,FALSE,TRUE,TRUE +nlg,,Latn,Gela,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nma,,Latn,Maram Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nmf,,Latn,Tangkhul Naga (India),Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nmh,,Latn,Monsang Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nmo,,Latn,Moyon Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nmw,,Latn,Nimoa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nmz,,Latn,Nawdm,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nnb,,Latn,Nande,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +nng,,Latn,Maring Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nnh,,Latn,Ngiemboon,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nnl,,Latn,Northern Rengma Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nno,nn,Latn,Norwegian Nynorsk,,SpaCyTokenizer,nb,FALSE,FALSE,TRUE,TRUE +nnp,,Latn,Wancho Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nnq,,Latn,Ngindo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nnw,,Latn,Southern Nuni,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +noa,,Latn,Woun Meu,Chocoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nob,nb,Latn,Norwegian Bokmål,,SpaCyTokenizer,nb,FALSE,FALSE,TRUE,TRUE +nod,,Thai,Northern Thai,Kra-Dai,ThaiTokenizer,,TRUE,TRUE,FALSE,TRUE 
+nog,,Cyrl,Nogai,Turkic,StanzaTokenizer,kk,TRUE,TRUE,FALSE,TRUE +non,,Latn,Old Norse,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nop,,Latn,Numanggang,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nor,no,Latn,Norwegian,Indo-European,SpaCyTokenizer,nb,FALSE,FALSE,FALSE,TRUE +not,,Latn,Nomatsiguenga,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nou,,Latn,Ewage-Notu,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nov,,Latn,Novial,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nph,,Latn,Phom Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +npi,ne,Deva,Nepali (individual language),Indo-European,IndicNLPTokenizer,ne,FALSE,FALSE,TRUE,TRUE +npi,ne,Latn,Nepali (individual language),Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +npl,,Latn,Southeastern Puebla Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +npo,,Latn,Pochuri Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +npy,,Latn,Napu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nqo,,Nkoo,N'Ko,Mixed language,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +nre,,Latn,Southern Rengma Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nrf,,Latn,Jèrriais,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,FALSE,TRUE +nri,,Latn,Chokri Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nrm,,Latn,Jèrriais,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,FALSE,TRUE +nrm_narom,,Latn,Narom,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nsa,,Latn,Sangtam Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nse,,Latn,Nsenga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nsm,,Latn,Sumi Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nsn,,Latn,Nehan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nso,,Latn,Pedi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nss,,Latn,Nali,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nst,,Latn,Tase Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nsu,,Latn,Sierra Negra 
Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ntp,,Latn,Northern Tepehuan,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ntr,,Latn,Delo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ntu,,Latn,Natügu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nuj,,Latn,Nyole,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +nus,,Latn,Nuer,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nuy,,Latn,Nunggubuyu,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nuz,,Latn,Tlamacazapa Nahuatl,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nvm,,Latn,Namiae,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +nwb,,Latn,Nyabwa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nwi,,Latn,Southwest Tanna,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +nwx,,Deva,Middle Newar,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +nxd,,Latn,Ngando (Democratic Republic of Congo),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nya,ny,Latn,Nyanja,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +nyf,,Latn,Giryama,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nyk,,Latn,Nyaneka,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nyn,,Latn,Nyankole,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +nyo,,Latn,Nyoro,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +nyu,,Latn,Nyungwe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nyy,,Latn,Nyakyusa-Ngonde,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nza,,Latn,Tigon Mbembe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nzi,,Latn,Nzima,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +nzm,,Latn,Zeme Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +obo,,Latn,Obo Manobo,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +oci,oc,Latn,Occitan (post 1500),Indo-European,SpaCyTokenizer,es,TRUE,TRUE,TRUE,TRUE +ogo,,Latn,Khana,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ojb,oj,Cans,Northwestern Ojibwa,Algic,WhitespaceTokenizer,,TRUE,TRUE,TRUE,FALSE 
+ojb,oj,Latn,Northwestern Ojibwa,Algic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +oji,oj,Cans,Ojibwa,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +oji,oj,Latn,Ojibwa,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +oke,,Latn,Okpe (Southwestern Edo),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +oku,,Latn,Oku,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +okv,,Latn,Orokaiva,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +old,,Latn,Mochi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +olo,,Latn,Livvi,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE +omb,,Latn,East Ambae,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +omw,,Latn,South Tairora,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ong,,Latn,Olo,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ons,,Latn,Ono,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ood,,Latn,Tohono O'odham,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +opm,,Latn,Oksapmin,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ori,or,Latn,Oriya (macrolanguage),,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,FALSE +ori,or,Orya,Oriya (macrolanguage),,IndicNLPTokenizer,or,FALSE,FALSE,FALSE,TRUE +orm,om,Latn,Oromo,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +orv,,Cyrl,Old Russian,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ory,or,Latn,Odia,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +ory,or,Orya,Odia,Indo-European,IndicNLPTokenizer,or,FALSE,FALSE,TRUE,TRUE +oss,os,Cyrl,Ossetian,Indo-European,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +ota,,Arab,Ottoman Turkish (1500-1928),,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +otd,,Latn,Ot Danum,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ote,,Latn,Mezquital Otomi,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +otm,,Latn,Eastern Highland Otomi,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +otn,,Latn,Tenango Otomi,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +oto,,Latn,,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +otq,,Latn,Querétaro 
Otomi,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ots,,Latn,Estado de México Otomi,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +otw,oj,Latn,Ottawa,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +oym,,Latn,Wayampi,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ozm,,Latn,Koonzime,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +pab,,Latn,Parecís,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pad,,Latn,Paumarí,Arauan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pag,,Latn,Pangasinan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pah,,Latn,Tenharim,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pam,,Latn,Pampanga,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pan,pa,Guru,Panjabi,Indo-European,IndicNLPTokenizer,pa,FALSE,FALSE,TRUE,TRUE +pan,pa,Latn,Panjabi,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +pao,,Latn,Northern Paiute,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pap,,Latn,Papiamento,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pau,,Latn,Palauan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pbb,,Latn,Páez,Paezan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pbc,,Latn,Patamona,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pbi,,Latn,Parkwa,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pbt,ps,Arab,Southern Pashto,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,TRUE,TRUE +pcd,,Latn,Picard,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,FALSE,TRUE +pck,,Latn,Paite Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pcm,,Latn,Nigerian Pidgin,Creole,StanzaTokenizer,pcm,FALSE,FALSE,FALSE,TRUE +pdc,,Latn,Pennsylvania German,Indo-European,SpaCyTokenizer,lb,TRUE,TRUE,FALSE,TRUE +pdt,,Latn,Plautdietsch,Indo-European,SpaCyTokenizer,nl,TRUE,TRUE,FALSE,TRUE +pem,,Latn,Phende,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +pfe,,Latn,Pere,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +pfl,,Latn,Pfaelzisch,Indo-European,SpaCyTokenizer,lb,TRUE,TRUE,FALSE,TRUE 
+phm,,Latn,Phimbi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +pib,,Latn,Yine,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pio,,Latn,Piapoco,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pir,,Latn,Piratapuyo,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pis,,Latn,Pijin,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +pjt,,Latn,Pitjantjatjara,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pkb,,Latn,Pokomo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +plg,,Latn,Pilagá,Guaykuruan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pls,,Latn,San Marcos Tlacoyalco Popoloca,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +plt,mg,Latn,Plateau Malagasy,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +plu,,Latn,Palikúr,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +plw,,Latn,Brooke's Point Palawano,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +pma,,Latn,Paama,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pmf,,Latn,Pamona,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pmq,,Latn,Northern Pame,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pms,,Latn,Piemontese,Indo-European,SpaCyTokenizer,lij,TRUE,TRUE,FALSE,TRUE +pmx,,Latn,Poumei Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pnb,,Arab,Western Panjabi,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +pne,,Latn,Western Penan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pnt,,Grek,Pontic,Indo-European,SpaCyTokenizer,el,TRUE,TRUE,FALSE,TRUE +pny,,Latn,Pinyin,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +poe,,Latn,San Juan Atzingo Popoloca,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +poh,,Latn,Poqomchi',Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +poi,,Latn,Highland Popoluca,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pol,pl,Latn,Polish,Indo-European,SpaCyTokenizer,pl,FALSE,FALSE,TRUE,TRUE +pon,,Latn,Pohnpeian,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE 
+por,pt,Latn,Portuguese,Indo-European,SpaCyTokenizer,pt,FALSE,FALSE,TRUE,TRUE +pos,,Latn,Sayula Popoluca,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pot,,Latn,Potawatomi,Algic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pov,,Latn,Upper Guinea Crioulo,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +poy,,Latn,Pogolo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ppk,,Latn,Uma,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ppo,,Latn,Folopa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pps,,Latn,San Luís Temalacayuca Popoloca,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +prf,,Latn,Paranan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +prg,,Latn,Prussian,Indo-European,SpaCyTokenizer,lv,TRUE,TRUE,FALSE,TRUE +pri,,Latn,Paicî,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +prq,,Latn,Ashéninka Perené,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pse,ms,Latn,Central Malay,Austronesian,SpaCyTokenizer,ms,FALSE,FALSE,FALSE,TRUE +pss,,Latn,Kaulong,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ptp,,Latn,Patep,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ptu,,Latn,Bambam,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pua,,Latn,Western Highland Purepecha,Tarascan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pui,,Latn,Puinave,Puinavean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pus,ps,Arab,Pushto,,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +pwg,,Latn,Gapapaiwa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +pwn,,Latn,Paiwan,Austronesian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +pww,,Thai,Pwo Northern Karen,Sino-Tibetan,ThaiTokenizer,,TRUE,TRUE,FALSE,TRUE +pxm,,Latn,Quetzaltepec Mixe,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qub,qu,Latn,Huallaga Huánuco Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +quc,,Latn,K'iche',Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +que,qu,Latn,Quechua,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +quf,qu,Latn,Lambayeque Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+qug,qu,Latn,Chimborazo Highland Quichua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +quh,qu,Latn,South Bolivian Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +qul,qu,Latn,North Bolivian Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qup,qu,Latn,Southern Pastaza Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qus,qu,Latn,Santiago del Estero Quichua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +quw,qu,Latn,Tena Lowland Quichua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +quy,qu,Latn,Ayacucho Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +quz,qu,Latn,Cusco Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qva,qu,Latn,Ambo-Pasco Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvc,qu,Latn,Cajamarca Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qve,qu,Latn,Eastern Apurímac Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvh,qu,Latn,Huamalíes-Dos de Mayo Huánuco Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvi,qu,Latn,Imbabura Highland Quichua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvm,qu,Latn,Margos-Yarowilca-Lauricocha Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvn,qu,Latn,North Junín Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvo,qu,Latn,Napo Lowland Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvs,qu,Latn,San Martín Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvw,qu,Latn,Huaylla Wanca Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qvz,qu,Latn,Northern Pastaza Quichua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qwh,qu,Latn,Huaylas Ancash Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qxh,qu,Latn,Panao Huánuco Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qxl,qu,Latn,Salasaca Highland Quichua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qxn,qu,Latn,Northern Conchucos Ancash Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qxo,qu,Latn,Southern Conchucos Ancash 
Quechua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +qxr,qu,Latn,Cañar Highland Quichua,Quechuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rad,,Latn,Rade,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rai,,Latn,Ramoaaina,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +raj,,Deva,Rajasthani,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +rap,,Latn,Rapanui,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rar,,Latn,Rarotongan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rav,,Deva,Sampang,Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +raw,,Latn,Rawang,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rcf,,Latn,Réunion Creole French,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rej,,Latn,Rejang,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rel,,Latn,Rendille,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rgu,,Latn,Ringgou,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rhg,,Latn,Rohingya,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +ria,,Latn,Riang (India),Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rim,,Latn,Nyaturu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +rjs,,Deva,Rajbanshi,Indo-European,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +rkb,,Latn,Rikbaktsa,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rmc,,Latn,Carpathian Romani,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +rme,,Latn,Angloromani,Mixed language,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rml,,Latn,Baltic Romani,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +rmn,,Cyrl,Balkan Romani,Indo-European,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +rmn,,Grek,Balkan Romani,Indo-European,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +rmn,,Latn,Balkan Romani,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +rmo,,Latn,Sinte Romani,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +rmq,,Latn,Caló,Mixed language,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rmy,,Cyrl,Vlax 
Romani,Indo-European,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +rmy,,Latn,Vlax Romani,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +rnd,,Latn,Ruund,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +rng,,Latn,Ronga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +rnl,,Latn,Ranglong,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +roh,rm,Latn,Romansh,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,TRUE,TRUE +rom,,Cyrl,Romany,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +rom,,Grek,Romany,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +rom,,Latn,Romany,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ron,ro,Cyrl,Romanian,Indo-European,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +ron,ro,Latn,Romanian,Indo-European,SpaCyTokenizer,ro,FALSE,FALSE,TRUE,TRUE +roo,,Latn,Rotokas,North Bougainville,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +rop,,Latn,Kriol,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +row,,Latn,Dela-Oenale,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rro,,Latn,Waima,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rtm,,Latn,Rotuman,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +rub,,Latn,Gungu,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +rue,,Cyrl,Rusyn,Indo-European,SpaCyTokenizer,ru,TRUE,TRUE,FALSE,TRUE +ruf,,Latn,Luguru,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +rug,,Latn,Roviana,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +run,rn,Latn,Rundi,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,TRUE,TRUE +rup,,Latn,Macedo-Romanian,Indo-European,SpaCyTokenizer,ro,TRUE,TRUE,FALSE,TRUE +rus,ru,Cyrl,Russian,Indo-European,SpaCyTokenizer,ru,FALSE,FALSE,TRUE,TRUE +rwo,,Latn,Rawa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sab,,Latn,Buglere,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sag,sg,Latn,Sango,Creole,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +sah,,Cyrl,Yakut,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +saj,,Latn,Sahu,West Papuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+san,sa,Deva,Sanskrit,Indo-European,IndicNLPTokenizer,sa,FALSE,FALSE,TRUE,TRUE +san,sa,Latn,Sanskrit,Indo-European,SpaCyTokenizer,sq,TRUE,TRUE,TRUE,FALSE +sas,,Latn,Sasak,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sat,,Latn,Santali,Austro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +sat,,Olck,Santali,Austro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +say,,Latn,Saya,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sba,,Latn,Ngambay,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sbd,,Latn,Southern Samo,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sbe,,Latn,Saliba,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sbl,,Latn,Botolan Sambal,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sbs,,Latn,Subiya,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sby,,Latn,Soli,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sck,,Deva,Sadri,Indo-European,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +scn,,Latn,Sicilian,Indo-European,SpaCyTokenizer,it,TRUE,TRUE,FALSE,TRUE +sco,,Latn,Scots,Indo-European,SpaCyTokenizer,en,TRUE,TRUE,FALSE,TRUE +sda,,Latn,Toraja-Sa'dan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sdc,sc,Latn,Sassarese Sardinian,Indo-European,SpaCyTokenizer,es,TRUE,TRUE,FALSE,TRUE +sdh,ku,Arab,Southern Kurdish,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +sdo,,Latn,Bukar-Sadung Bidayuh,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sdq,,Latn,Semandang,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +seh,,Latn,Sena,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sel,,Cyrl,Selkup,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ses,,Latn,Koyraboro Senni Songhai,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sey,,Latn,Secoya,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sfw,,Latn,Sehwi,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sgb,,Latn,Mag-antsi Ayta,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sgc,,Latn,Kipsigis,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+sgh,,Cyrl,Shughni,Indo-European,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sgs,,Latn,Samogitian,Indo-European,SpaCyTokenizer,lv,TRUE,TRUE,FALSE,TRUE +sgw,,Ethi,Sebat Bet Gurage,Afro-Asiatic,SpaCyTokenizer,am,TRUE,TRUE,FALSE,TRUE +sgz,,Latn,Sursurunga,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +shi,,Latn,Tachelhit,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +shk,,Latn,Shilluk,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +shn,,Mymr,Shan,Kra-Dai,BurmeseTokenizer,,TRUE,TRUE,FALSE,TRUE +shp,,Latn,Shipibo-Conibo,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +shr,,Latn,Shi,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +shu,ar,Arab,Chadian Arabic,Afro-Asiatic,SpaCyTokenizer,ar,FALSE,FALSE,FALSE,TRUE +sid,,Latn,Sidamo,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sig,,Latn,Paasaal,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sil,,Latn,Tumulung Sisaala,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sim,,Latn,Mende (Papua New Guinea),Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sin,si,Sinh,Sinhala,Indo-European,IndicNLPTokenizer,si,FALSE,FALSE,TRUE,TRUE +sja,,Latn,Epena,Chocoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sjo,,Mong,Xibe,Tungusic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +sju,,Latn,Ume Sami,Uralic,StanzaTokenizer,se,TRUE,TRUE,FALSE,TRUE +skg,mg,Latn,Sakalava Malagasy,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +skr,,Arab,Saraiki,Indo-European,IndicNLPTokenizer,ur,TRUE,TRUE,FALSE,TRUE +sld,,Latn,Sissala,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +slk,sk,Latn,Slovak,Indo-European,SpaCyTokenizer,sk,FALSE,FALSE,TRUE,TRUE +sll,,Latn,Salt-Yui,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +slv,sl,Latn,Slovenian,Indo-European,SpaCyTokenizer,sl,FALSE,FALSE,TRUE,TRUE +sma,,Latn,Southern Sami,Uralic,StanzaTokenizer,se,TRUE,TRUE,FALSE,TRUE +sme,se,Latn,Northern Sami,Uralic,StanzaTokenizer,se,FALSE,FALSE,TRUE,TRUE +smj,,Latn,Lule Sami,Uralic,StanzaTokenizer,se,TRUE,TRUE,FALSE,TRUE 
+smk,,Latn,Bolinao,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sml,,Latn,Central Sama,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +smn,,Latn,Inari Sami,Uralic,StanzaTokenizer,se,TRUE,TRUE,FALSE,TRUE +smo,sm,Latn,Samoan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +sms,,Latn,Skolt Sami,Uralic,StanzaTokenizer,se,TRUE,TRUE,FALSE,TRUE +smt,,Latn,Simte,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sna,sn,Latn,Shona,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +snc,,Latn,Sinaugoro,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +snd,sd,Arab,Sindhi,Indo-European,IndicNLPTokenizer,ur,FALSE,FALSE,TRUE,TRUE +snd,sd,Deva,Sindhi,Indo-European,IndicNLPTokenizer,mr,TRUE,TRUE,TRUE,FALSE +snd,sd,Latn,Sindhi,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +snf,,Latn,Noon,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +snn,,Latn,Siona,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +snp,,Latn,Siane,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +snw,,Latn,Selee,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sny,,Latn,Saniyo-Hiyewe,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +soe,,Latn,Songomeno,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +som,so,Latn,Somali,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +sop,,Latn,Songe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +soq,,Latn,Kanasi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sot,st,Latn,Southern Sotho,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +soy,,Latn,Miyobe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +spa,es,Latn,Spanish,Indo-European,SpaCyTokenizer,es,FALSE,FALSE,TRUE,TRUE +spl,,Latn,Selepet,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +spm,,Latn,Akukem,Ramu-Lower Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +spp,,Latn,Supyire Senoufo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sps,,Latn,Saposa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE 
+spy,,Latn,Sabaot,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sqi,sq,Latn,Albanian,,SpaCyTokenizer,sq,FALSE,TRUE,FALSE,TRUE +srd,sc,Latn,Sardinian,,SpaCyTokenizer,es,TRUE,TRUE,TRUE,TRUE +sri,,Latn,Siriano,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +srm,,Latn,Saramaccan,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +srn,,Latn,Sranan Tongo,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +srp,sr,Cyrl,Serbian,Indo-European,SpaCyTokenizer,sr,FALSE,FALSE,TRUE,TRUE +srp,sr,Latn,Serbian,Indo-European,SpaCyTokenizer,sr,FALSE,FALSE,TRUE,FALSE +srq,,Latn,Sirionó,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +srr,,Latn,Serer,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +ssd,,Latn,Siroi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ssg,,Latn,Seimat,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ssw,ss,Latn,Swati,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +ssx,,Latn,Samberigi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +stn,,Latn,Owa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +stp,,Latn,Southeastern Tepehuan,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +stq,,Latn,Saterfriesisch,Indo-European,SpaCyTokenizer,en,TRUE,TRUE,FALSE,TRUE +sua,,Latn,Sulka,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +suc,,Latn,Western Subanon,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +sue,,Latn,Suena,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +suk,,Latn,Sukuma,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +sun,su,Latn,Sundanese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +sur,,Latn,Mwaghavul,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +sus,,Arab,Susu,Niger-Congo,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +sus,,Latn,Susu,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +suz,,Deva,Sunwar,Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +swa,sw,Latn,Swahili (macrolanguage),,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +swb,,Latn,Maore 
Comorian,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +swc,sw,Latn,Congo Swahili,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +swe,sv,Latn,Swedish,Indo-European,SpaCyTokenizer,sv,FALSE,FALSE,TRUE,TRUE +swg,,Latn,Swabian,Indo-European,SpaCyTokenizer,de,TRUE,TRUE,FALSE,TRUE +swh,sw,Latn,Swahili (individual language),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +swk,,Latn,Malawi Sena,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +swp,,Latn,Suau,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +sxb,,Latn,Suba,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +sxn,,Latn,Sangir,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +syb,,Latn,Central Subanen,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +syc,,Syrc,Classical Syriac,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +syl,,Beng,Sylheti,Indo-European,IndicNLPTokenizer,bn,TRUE,TRUE,FALSE,FALSE +syl,,Latn,Sylheti,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +syr,,Syrc,Syriac,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +szb,,Latn,Ngalum,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +szl,,Latn,Silesian,Indo-European,SpaCyTokenizer,pl,TRUE,TRUE,FALSE,TRUE +szy,,Latn,Sakizaya,Austronesian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tab,,Cyrl,Tabassaran,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tac,,Latn,Lowland Tarahumara,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tah,ty,Latn,Tahitian,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +taj,,Deva,Eastern Tamang,Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +tam,ta,Latn,Tamil,Dravidian,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +tam,ta,Taml,Tamil,Dravidian,IndicNLPTokenizer,ta,FALSE,FALSE,TRUE,TRUE +tap,,Latn,Taabwa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +taq,,Latn,Tamasheq,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +taq,,Tfng,Tamasheq,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +tar,,Latn,Central 
Tarahumara,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tat,tt,Cyrl,Tatar,Turkic,SpaCyTokenizer,tt,FALSE,FALSE,TRUE,TRUE +tat,tt,Latn,Tatar,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +tav,,Latn,Tatuyo,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +taw,,Latn,Tai,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tay,,Latn,Atayal,Austronesian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tbc,,Latn,Takia,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tbg,,Latn,North Tairora,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tbk,,Latn,Calamian Tagbanwa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tbl,,Latn,Tboli,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tbo,,Latn,Tawala,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tbw,,Latn,Tagbanwa,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +tby,,Latn,Tabaru,West Papuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tbz,,Latn,Ditammari,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tca,,Latn,Ticuna,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tcc,,Latn,Datooga,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tcf,,Latn,Malinaltepec Me'phaa,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tcs,,Latn,Torres Strait Creole,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +tcy,,Knda,Tulu,Dravidian,IndicNLPTokenizer,kn,TRUE,TRUE,FALSE,TRUE +tcz,,Latn,Thado Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tdt,,Latn,Tetun Dili,Creole,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tdx,mg,Latn,Tandroy-Mahafaly Malagasy,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ted,,Latn,Tepo Krumen,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tee,,Latn,Huehuetla Tepehua,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tel,te,Latn,Telugu,Dravidian,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +tel,te,Telu,Telugu,Dravidian,IndicNLPTokenizer,te,FALSE,FALSE,TRUE,TRUE +tem,,Latn,Timne,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE 
+teo,,Latn,Teso,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ter,,Latn,Tereno,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tet,,Latn,Tetum,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tew,,Latn,Tewa (USA),Kiowa-Tanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tfr,,Latn,Teribe,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tgk,tg,Cyrl,Tajik,Indo-European,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +tgo,,Latn,Sudest,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tgp,,Latn,Tangoa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tha,th,Thai,Thai,Kra-Dai,ThaiTokenizer,,FALSE,FALSE,TRUE,TRUE +thk,,Latn,Tharaka,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +thl,,Deva,Dangaura Tharu,Indo-European,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +thv,,Latn,Tahaggart Tamahaq,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tif,,Latn,Tifal,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tig,,Ethi,Tigre,Afro-Asiatic,SpaCyTokenizer,ti,TRUE,TRUE,FALSE,TRUE +tih,,Latn,Timugon Murut,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tik,,Latn,Tikar,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tim,,Latn,Timbe,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tir,ti,Ethi,Tigrinya,Afro-Asiatic,SpaCyTokenizer,ti,FALSE,FALSE,TRUE,TRUE +tiv,,Latn,Tiv,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tiy,,Latn,Tiruray,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tke,,Latn,Takwane,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tkl,,Latn,Tokelau,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tkr,,Cyrl,Tsakhur,Nakh-Daghestanian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tku,,Latn,Upper Necaxa Totonac,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tlb,,Latn,Tobelo,West Papuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tlf,,Latn,Telefol,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tlh,,Latn,Klingon,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tlj,,Latn,Talinga-Bwisi,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE 
+tll,,Latn,Tetela,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tly,,Latn,Talysh,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,FALSE,TRUE +tmc,,Latn,Tumak,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tmd,,Latn,Haruai,Piawi,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tmh,,Latn,Tamashek,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tmh,,Tfng,Tamashek,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +tna,,Latn,Tacana,Tacanan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tnc,,Latn,Tanimuca-Retuarã,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tnk,,Latn,Kwamera,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tnn,,Latn,North Tanna,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tnp,,Latn,Whitesands,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tnr,,Latn,Ménik,Niger-Congo,StanzaTokenizer,wo,TRUE,TRUE,FALSE,TRUE +tob,,Latn,Toba,Guaykuruan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +toc,,Latn,Coyutla Totonac,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tod,,Latn,Toma,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tog,,Latn,Tonga (Nyasa),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +toh,,Latn,Gitonga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +toi,,Latn,Tonga (Zambia),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +toj,,Latn,Tojolabal,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tok,,Latn,Toki Pona,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ton,to,Latn,Tonga (Tonga Islands),Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,TRUE,TRUE +too,,Latn,Xicotepec De Juárez Totonac,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +top,,Latn,Papantla Totonac,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tos,,Latn,Highland Totonac,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tpa,,Latn,Taupota,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tpi,,Latn,Tok Pisin,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +tpm,,Latn,Tampulma,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tpn,,Latn,Tupinambá,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+tpp,,Latn,Pisaflores Tepehua,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tpt,,Latn,Tlachichilco Tepehua,Totonacan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tpw,,Latn,,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tpz,,Latn,Tinputz,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tqo,,Latn,Toaripi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +trc,,Latn,Copala Triqui,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +trn,,Latn,Trinitario,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tro,,Latn,Tarao Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +trp,,Latn,Kok Borok,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +trq,,Latn,San Martín Itunyoso Triqui,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +trs,,Latn,Chicahuaxtla Triqui,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +trv,,Latn,Sediq,Austronesian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tsc,,Latn,Tswa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tsg,,Latn,Tausug,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +tsn,tn,Latn,Tswana,Niger-Congo,SpaCyTokenizer,tn,FALSE,FALSE,TRUE,TRUE +tso,ts,Latn,Tsonga,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +tsw,,Latn,Tsishingini,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tsz,,Latn,Purepecha,Tarascan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ttc,,Latn,Tektiteko,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tte,,Latn,Bwanabwana,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ttj,,Latn,Tooro,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +ttq,,Latn,Tawallammat Tamajaq,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ttq,,Tfng,Tawallammat Tamajaq,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +tuc,,Latn,Mutu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tue,,Latn,Tuyuca,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tuf,,Latn,Central Tunebo,Chibchan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tui,,Latn,Tupuri,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+tuk,tk,Arab,Turkmen,Turkic,WhitespaceTokenizer,,TRUE,TRUE,TRUE,FALSE +tuk,tk,Cyrl,Turkmen,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +tuk,tk,Latn,Turkmen,Turkic,SpaCyTokenizer,tr,TRUE,TRUE,TRUE,TRUE +tul,,Latn,Tula,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tum,,Latn,Tumbuka,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +tuo,,Latn,Tucano,Tucanoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tur,tr,Latn,Turkish,Turkic,SpaCyTokenizer,tr,FALSE,FALSE,TRUE,TRUE +tuv,,Latn,Turkana,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tvk,,Latn,Southeast Ambrym,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +tvl,,Latn,Tuvalu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +twb,,Latn,Western Tawbuid,Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +twi,tw,Latn,Twi,,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +twu,,Latn,Termanu,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +twx,,Latn,Tewe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +txq,,Latn,Tii,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +txu,,Latn,Kayapó,Jean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tyv,,Cyrl,Tuvinian,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tzh,,Latn,Tzeltal,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tzj,,Latn,Tz'utujil,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tzl,,Latn,Talossan,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +tzm,,Tfng,Central Atlas Tamazight,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +tzo,,Latn,Tzotzil,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ubr,,Latn,Ubir,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ubu,,Latn,Umbu-Ungu,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +udm,,Cyrl,Udmurt,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +udu,,Latn,Uduk,Nilo-Saharan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +uig,ug,Arab,Uighur,Turkic,StanzaTokenizer,ug,FALSE,FALSE,TRUE,TRUE +uig,ug,Cyrl,Uighur,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +uig,ug,Latn,Uighur,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE 
+ukr,uk,Cyrl,Ukrainian,Indo-European,SpaCyTokenizer,uk,FALSE,FALSE,TRUE,TRUE +umb,,Latn,Umbundu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +und,,Adlm,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Aghb,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Ahom,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Armi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Armn,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Avst,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Bali,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Bamu,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Bass,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Batk,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Beng,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Bhks,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Bopo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Brah,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Brai,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Bugi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Buhd,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cakm,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cans,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cari,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cham,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cher,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Chrs,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Copt,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cpmn,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cprt,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Cyrl,Undetermined,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +und,,Deva,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Diak,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Dogr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Dsrt,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Dupl,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Egyp,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Elba,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Elym,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Ethi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Geor,Undetermined,,,,TRUE,TRUE,FALSE,FALSE 
+und,,Glag,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Gong,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Gonm,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Goth,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Gran,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Grek,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Gujr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Guru,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hano,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hatr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hebr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hira,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hluw,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hmng,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hmnp,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Hung,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Ital,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Java,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Kali,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Kana,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Kawi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Khar,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Khmr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Khoj,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Kits,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Knda,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Kthi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Lana,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Laoo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Lepc,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Limb,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Lina,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Linb,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Lisu,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Lyci,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Lydi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mahj,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Maka,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mand,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mani,Undetermined,,,,TRUE,TRUE,FALSE,FALSE 
+und,,Marc,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Medf,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mend,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Merc,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mero,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mlym,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Modi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mong,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mroo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mtei,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Mult,Undetermined,,,,TRUE,TRUE,FALSE,TRUE +und,,Mymr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Nagm,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Nand,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Narb,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Nbat,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Newa,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Nkoo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Nshu,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Ogam,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Olck,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Orkh,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Orya,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Osge,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Osma,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Ougr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Palm,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Pauc,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Perm,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Phag,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Phli,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Phlp,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Phnx,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Plrd,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Prti,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Rjng,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Rohg,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Runr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Samr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sarb,Undetermined,,,,TRUE,TRUE,FALSE,FALSE 
+und,,Saur,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sgnw,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Shaw,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Shrd,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sidd,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sind,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sinh,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sogd,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sogo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sora,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Soyo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sund,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Sylo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Syrc,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tagb,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Takr,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tale,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Talu,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Taml,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tang,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tavt,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Telu,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tfng,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tglg,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Thaa,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Thai,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tibt,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tirh,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Tnsa,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Toto,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Ugar,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Vaii,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Vith,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Wara,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Wcho,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Xpeo,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Xsux,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Yezi,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Yiii,Undetermined,,,,TRUE,TRUE,FALSE,FALSE +und,,Zanb,Undetermined,,,,TRUE,TRUE,FALSE,FALSE 
+upv,,Latn,Uripiv-Wala-Rano-Atchin,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +ura,,Latn,Urarina,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +urb,,Latn,Urubú-Kaapor,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +urd,ur,Arab,Urdu,Indo-European,IndicNLPTokenizer,ur,FALSE,FALSE,TRUE,TRUE +urd,ur,Latn,Urdu,Indo-European,StanzaTokenizer,kmr,TRUE,TRUE,TRUE,FALSE +urh,,Latn,Urhobo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +uri,,Latn,Urim,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +urk,ms,Thai,Urak Lawoi',Austronesian,ThaiTokenizer,,TRUE,TRUE,FALSE,TRUE +urt,,Latn,Urat,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +urw,,Latn,Sop,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ury,,Latn,Orya,Tor-Kwerba,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +usa,,Latn,Usarufa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +usp,,Latn,Uspanteco,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +uth,,Latn,ut-Hun,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +uvh,,Latn,Uri,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +uvl,,Latn,Lote,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +uzb,uz,Arab,Uzbek,,StanzaTokenizer,ug,TRUE,TRUE,FALSE,FALSE +uzb,uz,Cyrl,Uzbek,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,FALSE +uzb,uz,Latn,Uzbek,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +uzn,uz,Cyrl,Northern Uzbek,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,FALSE +uzn,uz,Latn,Northern Uzbek,Turkic,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +uzs,uz,Arab,Southern Uzbek,Turkic,StanzaTokenizer,ug,TRUE,TRUE,FALSE,TRUE +vag,,Latn,Vagla,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +vap,,Latn,Vaiphei,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +var,,Latn,Huarijio,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +vec,,Latn,Venetian,Indo-European,SpaCyTokenizer,lij,TRUE,TRUE,FALSE,TRUE +ven,ve,Latn,Venda,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +vep,,Latn,Veps,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE 
+vid,,Latn,Vidunda,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +vie,vi,Latn,Vietnamese,Austro-Asiatic,SpaCyTokenizer,vi,FALSE,FALSE,TRUE,TRUE +viv,,Latn,Iduna,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +vls,,Latn,Vlaams,Indo-European,SpaCyTokenizer,nl,TRUE,TRUE,FALSE,TRUE +vmk,,Latn,Makhuwa-Shirima,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +vmw,,Latn,Makhuwa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +vmy,,Latn,Ayautla Mazatec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +vol,vo,Latn,Volapük,,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +vot,,Latn,Votic,Uralic,SpaCyTokenizer,fi,TRUE,TRUE,FALSE,TRUE +vro,et,Latn,Võro,Uralic,SpaCyTokenizer,et,FALSE,FALSE,FALSE,TRUE +vun,,Latn,Vunjo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +vut,,Latn,Vute,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +waj,,Latn,Waffa,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wal,,Ethi,Wolaytta,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +wal,,Latn,Wolaytta,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wap,,Latn,Wapishana,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +war,,Latn,Waray (Philippines),Austronesian,SpaCyTokenizer,tl,TRUE,TRUE,FALSE,TRUE +wat,,Latn,Kaninuwa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +way,,Latn,Wayana,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wba,,Latn,Warao,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wbm,,Latn,Wa,Austro-Asiatic,SpaCyTokenizer,vi,TRUE,TRUE,FALSE,TRUE +wbp,,Latn,Warlpiri,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wed,,Latn,Wedau,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +wer,,Latn,Weri,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wes,,Latn,Cameroon Pidgin,Creole,StanzaTokenizer,pcm,TRUE,TRUE,FALSE,TRUE +wew,,Latn,Wejewa,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +whg,,Latn,North Wahgi,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +whk,,Latn,Wahau 
Kenyah,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +wib,,Latn,Southern Toussian,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +wim,,Latn,Wik-Mungkan,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wiu,,Latn,Wiru,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wln,wa,Latn,Walloon,Indo-European,SpaCyTokenizer,fr,TRUE,TRUE,TRUE,TRUE +wls,,Latn,Wallisian,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +wlv,,Latn,Wichí Lhamtés Vejoz,Matacoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wlx,,Latn,Wali (Ghana),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +wmt,,Latn,Walmajarri,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wmw,,Latn,Mwani,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +wnc,,Latn,Wantoat,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wnu,,Latn,Usan,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wob,,Latn,Wè Northern,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +wol,wo,Latn,Wolof,Niger-Congo,StanzaTokenizer,wo,FALSE,FALSE,TRUE,TRUE +wos,,Latn,Hanga Hundi,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wrk,,Latn,Garrwa,Australian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wrs,,Latn,Waris,Border,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wsg,,Telu,Adilabad Gondi,Dravidian,IndicNLPTokenizer,te,TRUE,TRUE,FALSE,TRUE +wsk,,Latn,Waskia,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +wuu,zh,Hani,Wu Chinese,Sino-Tibetan,SpaCyTokenizer,zh,FALSE,FALSE,FALSE,TRUE +wuv,,Latn,Wuvulu-Aua,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +wwa,,Latn,Waama,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +xal,,Cyrl,Kalmyk,Mongolic,StanzaTokenizer,bxr,TRUE,TRUE,FALSE,TRUE +xav,,Latn,Xavánte,Jean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xbi,,Latn,Kombio,Torricelli,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xbr,,Latn,Kambera,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +xed,,Latn,Hdi,Afro-Asiatic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xho,xh,Latn,Xhosa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE 
+xla,,Latn,Kamula,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xmf,,Geor,Mingrelian,Kartvelian,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +xmm,ms,Latn,Manado Malay,Creole,SpaCyTokenizer,ms,FALSE,FALSE,FALSE,TRUE +xmv,mg,Latn,Antankarana Malagasy,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +xnn,,Latn,Northern Kankanay,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +xog,,Latn,Soga,Niger-Congo,SpaCyTokenizer,lg,TRUE,TRUE,FALSE,TRUE +xon,,Latn,Konkomba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +xrb,,Latn,Eastern Karaboro,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +xsb,,Latn,Sambal,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +xsi,,Latn,Sio,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +xsm,,Latn,Kasem,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +xsr,,Deva,Sherpa,Sino-Tibetan,IndicNLPTokenizer,hi,TRUE,TRUE,FALSE,TRUE +xsu,,Latn,Sanumá,Yanomaman,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xtd,,Latn,Diuxi-Tilantongo Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xtm,,Latn,Magdalena Peñasco Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xtn,,Latn,Northern Tlaxiaco Mixtec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xum,,Latn,Umbrian,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +xuo,,Latn,Kuo,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +yaa,,Latn,Yaminahua,Panoan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yad,,Latn,Yagua,Yaguan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yal,,Latn,Yalunka,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yam,,Latn,Yamba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +yan,,Latn,Mayangna,Misumalpan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yao,,Latn,Yao,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +yap,,Latn,Yapese,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +yaq,,Latn,Yaqui,Uto-Aztecan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yas,,Latn,Nugunu (Cameroon),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE 
+yat,,Latn,Yambeta,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +yaz,,Latn,Lokaa,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +ybb,,Latn,Yemba,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +yby,,Latn,Yaweyuha,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ycn,,Latn,Yucuna,Maipurean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ydd,yi,Hebr,Eastern Yiddish,Indo-European,WhitespaceTokenizer,,TRUE,TRUE,TRUE,TRUE +yid,yi,Hebr,Yiddish,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +yim,,Latn,Yimchungru Naga,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yka,,Latn,Yakan,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +yle,,Latn,Yele,Yele-West New Britain,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yli,,Latn,Angguruk Yali,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yml,,Latn,Iamalele,Austronesian,SpaCyTokenizer,ms,TRUE,TRUE,FALSE,TRUE +yom,,Latn,Yombe,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +yon,,Latn,Yongkom,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yor,yo,Latn,Yoruba,Niger-Congo,SpaCyTokenizer,yo,FALSE,FALSE,TRUE,TRUE +yrb,,Latn,Yareba,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yre,,Latn,Yaouré,Niger-Congo,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yrk,,Cyrl,Nenets,Uralic,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yrl,,Latn,Nhengatu,Tupian,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yss,,Latn,Yessan-Mayo,Sepik,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yua,,Latn,Yucateco,Mayan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yue,zh,Hani,Yue Chinese,Sino-Tibetan,SpaCyTokenizer,zh,FALSE,FALSE,FALSE,TRUE +yuj,,Latn,Karkar-Yuri,Pauwasi,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yup,,Latn,Yukpa,Cariban,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yut,,Latn,Yopno,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yuw,,Latn,Yau (Morobe Province),Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yuz,,Latn,Yuracare,Language isolate,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +yva,,Latn,Yawa,West 
Papuan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zaa,,Latn,Sierra de Juárez Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zab,,Latn,Western Tlacolula Valley Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zac,,Latn,Ocotlán Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zad,,Latn,Cajonos Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zae,,Latn,Yareni Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zai,,Latn,Isthmus Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zam,,Latn,Miahuatlán Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zao,,Latn,Ozolotepec Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zap,,Latn,Zapotec,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zar,,Latn,Rincón Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zas,,Latn,Santo Domingo Albarradas Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zat,,Latn,Tabaa Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zav,,Latn,Yatzachi Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zaw,,Latn,Mitla Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zca,,Latn,Coatecas Altas Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zdj,,Latn,Ngazidja Comorian,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +zea,,Latn,Zeeuws,Indo-European,SpaCyTokenizer,nl,TRUE,TRUE,FALSE,TRUE +zgh,,Tfng,Standard Moroccan Tamazight,Afro-Asiatic,WhitespaceTokenizer,,TRUE,TRUE,FALSE,TRUE +zha,za,Latn,Zhuang,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zho,zh,Hani,Chinese,,SpaCyTokenizer,zh,FALSE,FALSE,FALSE,TRUE +zho,zh,Latn,Chinese,,WhitespaceTokenizer,,TRUE,TRUE,FALSE,FALSE +zia,,Latn,Zia,Trans-New Guinea,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ziw,,Latn,Zigula,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +zne,,Latn,Zande (individual language),Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,FALSE,TRUE +zoc,,Latn,Copainalá Zoque,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE 
+zom,,Latn,Zou,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zos,,Latn,Francisco León Zoque,Mixe-Zoquean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpa,,Latn,Lachiguiri Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpc,,Latn,Choapan Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpg,,Latn,Guevea De Humboldt Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpi,,Latn,Santa María Quiegolani Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpj,,Latn,Quiavicuzas Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpl,,Latn,Lachixío Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpm,,Latn,Mixtepec Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpo,,Latn,Amatlán Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpq,,Latn,Zoogocho Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpt,,Latn,San Vicente Coatlán Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpu,,Latn,Yalálag Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpv,,Latn,Chichicapan Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zpz,,Latn,Texmelucan Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zsm,ms,Arab,Standard Malay,Austronesian,WhitespaceTokenizer,,TRUE,TRUE,TRUE,FALSE +zsm,ms,Latn,Standard Malay,Austronesian,SpaCyTokenizer,ms,FALSE,FALSE,TRUE,TRUE +zsr,,Latn,Southern Rincon Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +ztq,,Latn,Quioquitani-Quierí Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zty,,Latn,Yatee Zapotec,Otomanguean,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zul,zu,Latn,Zulu,Niger-Congo,SpaCyTokenizer,tn,TRUE,TRUE,TRUE,TRUE +zxx,,Arab,No linguistic content,,,,TRUE,TRUE,FALSE,FALSE +zxx,,Latn,No linguistic content,,,,TRUE,TRUE,FALSE,FALSE +zxx,,Zzzz,No linguistic content,,,,TRUE,TRUE,FALSE,FALSE +zyb,za,Latn,Yongbei Zhuang,Kra-Dai,SpaCyTokenizer,xx,TRUE,TRUE,TRUE,TRUE +zyp,,Latn,Zyphe 
Chin,Sino-Tibetan,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE +zza,,Latn,Zaza,,SpaCyTokenizer,xx,TRUE,TRUE,FALSE,TRUE diff --git a/src/datatrove/assets/url_filterblacklists.tar.gz b/src/datatrove/assets/url_filterblacklistsv0_3_0.tar.gz similarity index 65% rename from src/datatrove/assets/url_filterblacklists.tar.gz rename to src/datatrove/assets/url_filterblacklistsv0_3_0.tar.gz index 52e745a2..85d76bb3 100644 Binary files a/src/datatrove/assets/url_filterblacklists.tar.gz and b/src/datatrove/assets/url_filterblacklistsv0_3_0.tar.gz differ diff --git a/src/datatrove/io.py b/src/datatrove/io.py index dd8dd329..b401a968 100644 --- a/src/datatrove/io.py +++ b/src/datatrove/io.py @@ -162,7 +162,7 @@ def list_files( ] ) - def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]: + def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str] | None: """Fetch a shard (set of files) for a given rank, assuming there are a total of `world_size` shards. This should be deterministic to not have any overlap among different ranks. Will return files [rank, rank+world_size, rank+2*world_size, ...] 
@@ -175,7 +175,10 @@ def get_shard(self, rank: int, world_size: int, **kwargs) -> list[str]: Returns: a list of file paths """ - return self.list_files(**kwargs)[rank::world_size] + all_files = self.list_files(**kwargs) + if len(all_files) == 0: + return None + return all_files[rank::world_size] def resolve_paths(self, paths) -> list[str] | str: """ diff --git a/src/datatrove/pipeline/dedup/minhash.py b/src/datatrove/pipeline/dedup/minhash.py index 8f254754..f6443898 100644 --- a/src/datatrove/pipeline/dedup/minhash.py +++ b/src/datatrove/pipeline/dedup/minhash.py @@ -9,6 +9,7 @@ import numpy as np from fsspec.spec import AbstractBufferedFile +from tqdm import tqdm from datatrove.data import DocumentsPipeline from datatrove.io import DataFolderLike, get_datafolder @@ -132,7 +133,13 @@ class MinhashDedupSignature(PipelineStep): type = "🫂 - DEDUP" name = "🎯 MinHash stage 1" - def __init__(self, output_folder: DataFolderLike, config: MinhashConfig = None, language: str = Languages.english): + def __init__( + self, + output_folder: DataFolderLike, + config: MinhashConfig = None, + language: str = Languages.english, + skip_existing_sigs: bool = False, + ): super().__init__() self.output_folder = get_datafolder(output_folder) self.config = config or MinhashConfig() @@ -141,6 +148,7 @@ def __init__(self, output_folder: DataFolderLike, config: MinhashConfig = None, self._hash_func = create_hash_func(self.config.hash_config) self.language = language self.word_tokenizer = load_word_tokenizer(language) + self.skip_existing_sigs = skip_existing_sigs @property def parameters(self): @@ -200,51 +208,75 @@ def get_shingles(self, text: str) -> np.ndarray: dtype=np.uint64, ).reshape((-1, 1)) - def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1): - buckets = [ - self.output_folder.open(f"bucket_{bi:03d}/{rank:05d}.minhash.sig", mode="wb") + def check_can_skip_sig_writing(self, rank): + if not self.skip_existing_sigs: + return False + + # check if the files 
exist + if any( + not self.output_folder.exists(f"bucket_{bi:03d}/{rank:05d}.minhash.sig") for bi in range(self.config.num_buckets) + ): + return False + + # check if they all have the same size (same nb of docs) + fsizes = [ + self.output_folder.size(f"bucket_{bi:03d}/{rank:05d}.minhash.sig") for bi in range(self.config.num_buckets) ] + if any(fsize != fsizes[0] for fsize in fsizes): + return False + + # check if they aren't empty and if they have a multiple of a full sig + sig_doc_size = struct.calcsize(f"<{self.config.hashes_per_bucket}{self.config.hash_config.struct_format}I") + if fsizes[0] == 0 or fsizes[0] % sig_doc_size != 0: + return False + + logger.info(f"Found existing sig files with {fsizes[0] // sig_doc_size} entries. Skipping sig writing step.") + return True + + def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1): with self.track_time(): - for doc_idx, doc in enumerate(data): - self.stat_update(StatHints.total) - shingles = self.get_shingles(doc.text) - if shingles.size != 0: - sig = self.get_signature(shingles) - for bi, (bucket, bucket_sig) in enumerate(zip(buckets, sig)): - # print(f"{self.hashes_per_bucket=} {bucket_sig=}") - bucket.write( - struct.pack( - f"<{self.config.hashes_per_bucket}{self.config.hash_config.struct_format}I", - *bucket_sig, - doc_idx, + # check if we can skip the sig writing step + if not self.check_can_skip_sig_writing(rank): + buckets = [ + self.output_folder.open(f"bucket_{bi:03d}/{rank:05d}.minhash.sig", mode="wb") + for bi in range(self.config.num_buckets) + ] + for doc_idx, doc in enumerate(data): + self.stat_update(StatHints.total) + shingles = self.get_shingles(doc.text) + if shingles.size != 0: + sig = self.get_signature(shingles) + for bi, (bucket, bucket_sig) in enumerate(zip(buckets, sig)): + # print(f"{self.hashes_per_bucket=} {bucket_sig=}") + bucket.write( + struct.pack( + f"<{self.config.hashes_per_bucket}{self.config.hash_config.struct_format}I", + *bucket_sig, + doc_idx, + ) ) - ) - 
# TODO: prevent these files from being uploaded/redownloaded in the first place - for file in buckets: - file.close() + for file in buckets: + file.close() logger.info("Sorting buckets...") - for bi in range(len(buckets)): - # read one by one, sort and write back - sigs = sorted( - read_sigs( - self.output_folder.open(f"bucket_{bi:03d}/{rank:05d}.minhash.sig", mode="rb"), - -1, - self.config, - ensure_order=False, - lines_to_buffer=-1, # load everything in one go - ) + for bi in range(self.config.num_buckets): + # read all records, sort and write back + dtype = np.dtype( + [ + (f"field{i + 1}", f"<{self.config.hash_config.struct_format}") + for i in range(self.config.hashes_per_bucket) + ] + + [(f"field{self.config.hashes_per_bucket + 1}", "= last, f"Sig queue sort error. {v=} < {last=}" if not v.is_from_index(): - if last and last.sig == v.sig: + if last is not None and last.sig == v.sig: # write (file_id1, doc_id1, file_id2, doc_id2) if last.is_from_index(): # we can't actually write -1, so we use SENTINEL instead @@ -430,6 +462,7 @@ def __init__( output_folder: DataFolderLike, config: MinhashConfig = None, save_cluster_id: bool = False, + save_cluster_size: bool = False, ignore_index_matches: bool = False, lines_to_buffer: int = 5, ): @@ -438,6 +471,7 @@ def __init__( self.output_folder = get_datafolder(output_folder) self.config = config or MinhashConfig() self.save_cluster_id = save_cluster_id + self.save_cluster_size = save_cluster_size self.ignore_index_matches = ignore_index_matches self.lines_to_buffer = lines_to_buffer @@ -448,34 +482,55 @@ def run(self, data: DocumentsPipeline = None, _: int = 0, world_size: int = 1): ) == 0, "Number of .dups files should be divisible by number of buckets" assert world_size == 1, "World size must be 1 for clustering" union_set = {} + set_size = {} def parent(x): if x not in union_set or union_set[x] == x: + union_set[x] = x return x # Path Compression union_set[x] = parent(union_set[x]) return union_set[x] + def 
union(v_a, v_b): + root_a = parent(v_a) + root_b = parent(v_b) + + if root_a != root_b: + # Union by size + size_a = set_size.get(root_a, 1) + size_b = set_size.get(root_b, 1) + if size_a < size_b: + root_a, root_b = root_b, root_a + # #a >= #b + union_set[root_b] = root_a # make the smallest one join the biggest one to keep sets shallow + set_size[root_a] = size_a + size_b + set_size.pop(root_b, None) # clear up space + with self.track_time(): - for dup_file in dup_files: + logger.info("Loading dup files...") + for dup_file in tqdm(dup_files, desc="Reading dup files"): with self.input_folder.open(dup_file, "rb") as dupf: for f1, d1, f2, d2 in read_tuples_from_file(dupf, "4I", lines_to_buffer=self.lines_to_buffer): a, b = (f1, d1), (f2, d2) if self.ignore_index_matches and a == (SENTINEL, SENTINEL): # if we are skipping matches with the index and "a" is from the index continue - union_set[parent(b)] = parent(a) + union(a, b) + logger.info("Finished reading dup files.") ci = 0 cluster_ids = {} with self.output_folder.get_output_file_manager(mode="wb") as output_mg: - for node in sorted(union_set.keys()): + for node in sorted(union_set): self.stat_update("duplicates") file, doc = node p = parent(node) if node != p: output_mg.write(f"{file:06d}.remove", struct.pack(" Do and ( # min num sentences self.config.min_num_sentences <= 0 - or len(split_into_parts(filtered_text, SPLIT_TEXT_SENTENCES, self.language)) + or len(split_into_sentences(filtered_text, self.language)) >= self.config.min_num_sentences ) ) diff --git a/src/datatrove/pipeline/filters/c4_filters.py b/src/datatrove/pipeline/filters/c4_filters.py index 34d27191..f2d71a85 100644 --- a/src/datatrove/pipeline/filters/c4_filters.py +++ b/src/datatrove/pipeline/filters/c4_filters.py @@ -7,8 +7,8 @@ from datatrove.io import cached_asset_path_or_download from datatrove.pipeline.filters.base_filter import BaseFilter from datatrove.pipeline.writers.disk_base import DiskWriter +from datatrove.utils.text import 
split_into_sentences from datatrove.utils.typeshelper import Languages -from datatrove.utils.word_tokenizers import load_word_tokenizer CITATION_REGEX = re.compile(r"\[\d*]|\[edit]|\[citation needed]") @@ -83,10 +83,10 @@ def __init__( self.filter_javascript = filter_javascript self.filter_curly_bracket = filter_curly_bracket self.filter_policy = filter_policy - self.tokenizer = load_word_tokenizer(language) + self.language = language def filter(self, doc: Document) -> bool | tuple[bool, str]: - lines = doc.text.splitlines() if self.split_paragraph else self.tokenizer.sent_tokenize(doc.text) + lines = doc.text.splitlines() if self.split_paragraph else split_into_sentences(doc.text, self.language) num_sentences = 0 kept_lines = [] @@ -126,7 +126,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]: self.stat_update("line-filter-policy") continue if self.min_num_sentences != -1: - num_sentences += len(self.tokenizer.sent_tokenize(line)) if self.split_paragraph else 1 + num_sentences += len(split_into_sentences(line, self.language)) if self.split_paragraph else 1 kept_lines.append(line) self.stat_update("line-kept") if num_sentences < self.min_num_sentences: diff --git a/src/datatrove/pipeline/filters/fineweb_quality_filter.py b/src/datatrove/pipeline/filters/fineweb_quality_filter.py index 0d40f785..f37fee61 100644 --- a/src/datatrove/pipeline/filters/fineweb_quality_filter.py +++ b/src/datatrove/pipeline/filters/fineweb_quality_filter.py @@ -1,8 +1,8 @@ from datatrove.pipeline.filters.base_filter import BaseFilter from datatrove.pipeline.filters.gopher_repetition_filter import find_duplicates from datatrove.pipeline.writers.disk_base import DiskWriter +from datatrove.utils.text import TERMINAL_PUNCTUATION, split_into_words from datatrove.utils.typeshelper import Languages -from datatrove.utils.word_tokenizers import load_word_tokenizer class FineWebQualityFilter(BaseFilter): @@ -13,6 +13,7 @@ def __init__( exclusion_writer: DiskWriter = None, 
line_punct_thr: float = 0.12, line_punct_exclude_zero: bool = False, + stop_chars: tuple[str] | None = None, short_line_thr: float = 0.67, short_line_length: int = 30, char_duplicates_ratio: float = 0.01, @@ -22,31 +23,32 @@ def __init__( super().__init__(exclusion_writer) self.line_punct_thr = line_punct_thr self.line_punct_exclude_zero = line_punct_exclude_zero + self.stop_chars = stop_chars if stop_chars is not None else tuple(TERMINAL_PUNCTUATION) self.short_line_threshold = short_line_thr self.short_line_length = short_line_length self.char_duplicates_ratio = char_duplicates_ratio self.new_line_ratio = new_line_ratio - self.tokenizer = load_word_tokenizer(language) + self.language = language def filter(self, doc) -> bool | tuple[bool, str]: - stop_chars = (".", "'", '"', "!", "?") - lines = doc.text.split("\n") - ratio = sum(1 for line in lines if line.endswith(stop_chars)) / len(lines) - if ratio <= self.line_punct_thr and not (ratio == 0 and self.line_punct_exclude_zero): + lines = [line for line in lines if line.strip() != ""] + if len(lines) == 0: + return False, "empty" + ratio = sum(1 for line in lines if line.endswith(self.stop_chars)) / len(lines) + if ratio < self.line_punct_thr and not (ratio == 0 and self.line_punct_exclude_zero): return False, "line_punct_ratio" ratio = sum(1 for line in lines if len(line) <= self.short_line_length) / len(lines) - if ratio >= self.short_line_threshold: + if ratio > self.short_line_threshold: return False, "short_line_ratio" - non_empty_lines = [line for line in lines if line.strip() != ""] - ratio = find_duplicates(non_empty_lines)[1] / len(doc.text.replace("\n", "")) + ratio = find_duplicates(lines)[1] / len(doc.text.replace("\n", "")) - if ratio >= self.char_duplicates_ratio: + if ratio > self.char_duplicates_ratio: return False, "char_dup_ratio" - words = self.tokenizer.word_tokenize(doc.text) + words = split_into_words(doc.text, self.language) new_line = doc.text.count("\n") if new_line / len(words) > 
self.new_line_ratio: return False, "list_ratio" diff --git a/src/datatrove/pipeline/filters/gopher_quality_filter.py b/src/datatrove/pipeline/filters/gopher_quality_filter.py index aaa530d3..9e5e68b6 100644 --- a/src/datatrove/pipeline/filters/gopher_quality_filter.py +++ b/src/datatrove/pipeline/filters/gopher_quality_filter.py @@ -3,9 +3,8 @@ from datatrove.data import Document from datatrove.pipeline.filters.base_filter import BaseFilter from datatrove.pipeline.writers.disk_base import DiskWriter -from datatrove.utils.text import PUNCTUATION_SET +from datatrove.utils.text import PUNCTUATION_SET, split_into_words from datatrove.utils.typeshelper import Languages -from datatrove.utils.word_tokenizers import load_word_tokenizer STOP_WORDS = ["the", "be", "to", "of", "and", "that", "have", "with"] @@ -54,10 +53,10 @@ def __init__( self.max_symbol_word_ratio = max_symbol_word_ratio self.max_bullet_lines_ratio = max_bullet_lines_ratio self.max_ellipsis_lines_ratio = max_ellipsis_lines_ratio - self.max_non_alpha_words_ratio = max_non_alpha_words_ratio + self.max_non_alpha_words_ratio = max_non_alpha_words_ratio # TODO rename to min_alpha_words_ratio self.min_stop_words = min_stop_words self.stop_words = set(STOP_WORDS if stop_words is None else stop_words) - self.tokenizer = load_word_tokenizer(language) + self.language = language def filter(self, doc: Document) -> bool | tuple[bool, str]: """ @@ -70,7 +69,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]: """ text = doc.text - words = self.tokenizer.word_tokenize(text) + words = split_into_words(text, self.language) n_words = len(words) non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)] @@ -114,6 +113,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]: # that 80 % of words in a document contain at least one alphabetic character if ( self.max_non_alpha_words_ratio + # nb of words with at least 1 alpha char < 0.8 and sum([any((c.isalpha() for c in w)) for w in 
words]) / n_words < self.max_non_alpha_words_ratio ): return False, "gopher_below_alpha_threshold" diff --git a/src/datatrove/pipeline/filters/gopher_repetition_filter.py b/src/datatrove/pipeline/filters/gopher_repetition_filter.py index 318c33da..249bf964 100644 --- a/src/datatrove/pipeline/filters/gopher_repetition_filter.py +++ b/src/datatrove/pipeline/filters/gopher_repetition_filter.py @@ -4,8 +4,8 @@ from datatrove.data import Document from datatrove.pipeline.filters.base_filter import BaseFilter from datatrove.pipeline.writers.disk_base import DiskWriter +from datatrove.utils.text import split_into_words from datatrove.utils.typeshelper import Languages -from datatrove.utils.word_tokenizers import load_word_tokenizer """ @@ -105,7 +105,7 @@ def __init__( self.dup_n_grams = dup_n_grams self.paragraph_exp = re.compile(r"\n{2,}") self._line_splitter = re.compile("\n+") - self.tokenizer = load_word_tokenizer(language) + self.language = language def filter(self, doc: Document) -> bool | tuple[bool, str]: text = doc.text @@ -124,7 +124,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]: if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac: return False, "dup_line_char_frac" - words = self.tokenizer.word_tokenize(text) + words = split_into_words(text, self.language) for n, n_frac in self.top_n_grams: n_grams = get_n_grams(words, n) diff --git a/src/datatrove/pipeline/filters/unigram_log_probs.py b/src/datatrove/pipeline/filters/unigram_log_probs.py index af42e096..cd2e735b 100644 --- a/src/datatrove/pipeline/filters/unigram_log_probs.py +++ b/src/datatrove/pipeline/filters/unigram_log_probs.py @@ -9,8 +9,8 @@ from datatrove.pipeline.filters.base_filter import BaseFilter from datatrove.pipeline.writers.disk_base import DiskWriter from datatrove.utils.logging import logger +from datatrove.utils.text import split_into_words from datatrove.utils.typeshelper import Languages -from datatrove.utils.word_tokenizers import 
load_word_tokenizer UNIGRAM_DOWNLOAD = "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/lucas/google-1T-unigram/unigram_freq.csv" @@ -38,7 +38,7 @@ def __init__( super().__init__(exclusion_writer) self.logprobs_threshold = logprobs_threshold self.unigram_frequencies = self.get_frequencies() - self.tokenizer = load_word_tokenizer(language) + self.language = language def get_frequencies(self): download_dir = cached_assets_path( @@ -60,7 +60,7 @@ def get_frequencies(self): return {word: count / total_count for word, count in zip(words, counts)} def get_logprob(self, doc): - words = self.tokenizer.word_tokenize(doc.text) + words = split_into_words(doc.text, self.language) freqs = [self.unigram_frequencies.get(word.lower(), 1e-9) for word in words] if len(freqs) == 0: diff --git a/src/datatrove/pipeline/filters/url_filter.py b/src/datatrove/pipeline/filters/url_filter.py index 0675a010..c4f6f25f 100644 --- a/src/datatrove/pipeline/filters/url_filter.py +++ b/src/datatrove/pipeline/filters/url_filter.py @@ -81,20 +81,18 @@ def download_data(self): if self._downloaded or not self.use_integrated_lists: return download_dir = cached_assets_path(library_name="datatrove", namespace="filters", subfolder="url_filter") - file_to_lock = os.path.join(download_dir, "url_filterblacklists.tar.gz") + file_to_lock = os.path.join(download_dir, "url_filterblacklistsv0_3_0.tar.gz") def do_extract(): logger.info("💥 Extracting url filter blacklists...") - with tarfile.open(os.path.join(ASSETS_PATH, "url_filterblacklists.tar.gz"), "r:gz") as tar: + with tarfile.open(os.path.join(ASSETS_PATH, "url_filterblacklistsv0_3_0.tar.gz"), "r:gz") as tar: tar.extractall(download_dir) logger.info("💥 Extracted url filter blacklists.") safely_create_file(file_to_lock, do_extract) - self.block_listed_domains = get_list( - download_dir, "adult/domains", self.block_listed_domains, do_normalize=False - ) - self.block_listed_url = get_list(download_dir, "adult/urls", self.block_listed_url, 
do_normalize=False) + self.block_listed_domains = get_list(download_dir, "domains", self.block_listed_domains, do_normalize=False) + self.block_listed_url = get_list(download_dir, "urls", self.block_listed_url, do_normalize=False) self.banned_words = get_list(ASSETS_PATH, "banned_words.txt", self.banned_words) self.banned_subwords = get_list(ASSETS_PATH, "banned_subwords.txt", self.banned_subwords) self.soft_banned_words = get_list(ASSETS_PATH, "soft_banned_words.txt", self.soft_banned_words) diff --git a/src/datatrove/pipeline/formatters/symbol_lines_remover.py b/src/datatrove/pipeline/formatters/symbol_lines_remover.py index 57903ffb..463e002d 100644 --- a/src/datatrove/pipeline/formatters/symbol_lines_remover.py +++ b/src/datatrove/pipeline/formatters/symbol_lines_remover.py @@ -11,10 +11,12 @@ class SymbolLinesFormatter(BaseFormatter): def __init__( self, + symbols_to_remove: list[str] | None = None, replace_char: str = "", # change to \n to replace with a paragraph ): super().__init__() self.replace_char = replace_char + self.symbols_to_remove = set(symbols_to_remove) if symbols_to_remove is not None else PUNCTUATION_SET # loop actually seems faster # puncts = "".join(map(re.escape, PUNCTUATION)) # self.symbol_regex = re.compile(rf"(^(([{puncts}]+[^\S\r\n]*)+\n?)+$((? 
str: formatted = [] in_removed_span = False for line in text.splitlines(): - chars_line = line.strip() != "" and all(c in PUNCTUATION_SET or c == " " for c in line) + chars_line = line.strip() != "" and all(c in self.symbols_to_remove or c == " " for c in line) if chars_line and not in_removed_span: if self.replace_char: formatted.append(self.replace_char) diff --git a/src/datatrove/pipeline/readers/base.py b/src/datatrove/pipeline/readers/base.py index e7f4569b..ed2cf4d5 100644 --- a/src/datatrove/pipeline/readers/base.py +++ b/src/datatrove/pipeline/readers/base.py @@ -224,11 +224,12 @@ def run(self, data: DocumentsPipeline = None, rank: int = 0, world_size: int = 1 if not self.paths_file else list(get_shard_from_paths_file(self.paths_file, rank, world_size)) ) - if len(files_shard) == 0: - if rank == 0: - raise RuntimeError(f"No files found on {self.data_folder.path}!") + if files_shard is None: + raise RuntimeError(f"No files found on {self.data_folder.path}!") + elif len(files_shard) == 0: # otherwise just a warning logger.warning(f"No files found on {self.data_folder.path} for {rank=}") + if self.shuffle_files: random.shuffle(files_shard) for doc in self.read_files_shard(files_shard): diff --git a/src/datatrove/pipeline/readers/huggingface.py b/src/datatrove/pipeline/readers/huggingface.py index c031cab9..8b09d6d8 100644 --- a/src/datatrove/pipeline/readers/huggingface.py +++ b/src/datatrove/pipeline/readers/huggingface.py @@ -56,10 +56,10 @@ def __init__( self.streaming = streaming self.shuffle_files = shuffle_files - def get_document_from_dict(self, data: dict, source: str, id_in_file: int | str): - document = super().get_document_from_dict(data, source, id_in_file) + def get_document_from_dict(self, data: dict, source_file: str, id_in_file: int | str): + document = super().get_document_from_dict(data, source_file, id_in_file) if document: - document.metadata.setdefault("dataset", source) + document.metadata.setdefault("dataset", source_file) return 
document def _get_dataset_shard(self, dst, rank: int, world_size: int): @@ -76,7 +76,7 @@ def _get_dataset_shard(self, dst, rank: int, world_size: int): f"Requested shard {rank} of a streaming dataset, but it only has {dst.n_shards} shards." ) return None - ex_iterable = dst._ex_iterable.shard_data_sources(rank, world_size) + ex_iterable = dst._ex_iterable.shard_data_sources(index=rank, num_shards=world_size, contiguous=False) return IterableDataset( ex_iterable=ex_iterable, info=dst._info.copy(), diff --git a/src/datatrove/tools/fast_mh3/Cargo.lock b/src/datatrove/tools/fast_mh3/Cargo.lock new file mode 100644 index 00000000..3a0df8a9 --- /dev/null +++ b/src/datatrove/tools/fast_mh3/Cargo.lock @@ -0,0 +1,2379 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "allocator-api2" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +dependencies = [ + "anstyle", + "windows-sys 0.59.0", +] + +[[package]] +name = "anyhow" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "aws-config" +version = "1.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "hex", + "http 0.2.12", + "ring", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-runtime" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http-body 0.4.6", + "once_cell", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.63.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43850204a109a5eea1ea93951cf0440268cef98b0d27dfef4534949e23735f7" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "fastrand", + "hex", + "hmac", + "http 0.2.12", + "http-body 0.4.6", + "lru", + "once_cell", + "percent-encoding", + "regex-lite", + "sha2", + "tracing", + "url", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.49.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09677244a9da92172c8dc60109b4a9658597d4d298b188dd0018b6a66b410ca4" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.50.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fea2f3a8bb3bd10932ae7ad59cc59f65f270fc9183a7e91f501dc5efbef7ee" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ada54e5f26ac246dc79727def52f7f8ed38915cb47781e2a72213957dc3a7d5" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5619742a0d8f253be760bfbb8e8e8368c69e3587e4637af5754e488a611499b1" +dependencies = [ + "aws-credential-types", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "crypto-bigint 0.5.5", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.1.0", + "once_cell", + "p256", + "percent-encoding", + "ring", + "sha2", + "subtle", + "time", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-checksums" +version = "0.60.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ba1a71073fca26775c8b5189175ea8863afb1c9ea2cceb02a5de5ad9dfbaa795" +dependencies = [ + "aws-smithy-http", + "aws-smithy-types", + "bytes", + "crc32c", + "crc32fast", + "hex", + "http 0.2.12", + "http-body 0.4.6", + "md-5", + "pin-project-lite", + "sha1", + "sha2", + "tracing", +] + +[[package]] +name = "aws-smithy-eventstream" +version = "0.60.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" +dependencies = [ + "aws-smithy-types", + "bytes", + "crc32fast", +] + +[[package]] +name = "aws-smithy-http" +version = "0.60.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" +dependencies = [ + "aws-smithy-eventstream", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http-body 0.4.6", + "once_cell", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.60.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be28bd063fa91fd871d131fc8b68d7cd4c5fa0869bea68daca50dcb1cbd76be2" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "h2", + "http 0.2.12", + "http-body 0.4.6", + "http-body 1.0.1", + "httparse", + "hyper", + 
"hyper-rustls", + "once_cell", + "pin-project-lite", + "pin-utils", + "rustls", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.1.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http 1.1.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", + "tokio", + "tokio-util", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + +[[package]] +name = "backtrace" +version = "0.3.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets", +] + +[[package]] +name = "base16ct" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + +[[package]] +name = "bitflags" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" + +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + +[[package]] +name = "cc" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +dependencies = [ + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + 
"unicode-width 0.1.14", + "windows-sys 0.52.0", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crypto-bigint" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +dependencies = [ + "generic-array", + "rand_core", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "rand_core", + "subtle", +] + +[[package]] +name = "crypto-common" 
+version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "ecdsa" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der", + "elliptic-curve", + "rfc6979", + "signature", +] + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct", + "crypto-bigint 0.4.9", + "der", + "digest", + "ff", + "generic-array", 
+ "group", + "pkcs8", + "rand_core", + "sec1", + "subtle", + "zeroize", +] + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "fastrand" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" + +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "rand_core", + "subtle", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "pin-utils", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff", + "rand_core", + "subtle", +] + +[[package]] +name = "h2" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + 
"futures-util", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = 
"1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.1", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c08302e8fa335b151b788c775ff56e7a03ae64ff85c548ee820fecb70356e85" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http 0.2.12", + "http-body 0.4.6", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http 0.2.12", + "hyper", + "log", + "rustls", + "rustls-native-certs", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + 
+[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "indicatif" +version = "0.17.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width 0.2.0", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "js-sys" +version = "0.3.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.165" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb4d3d38eab6c5239a362fa8bae48c03baf980a6e7079f063942d563ef3533e" + +[[package]] +name = "litemap" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + 
"digest", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +dependencies = [ + "hermit-abi", + "libc", + "wasi", + "windows-sys 0.52.0", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "object" +version = "0.36.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "outref" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" + +[[package]] +name = "p256" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +dependencies = [ + "ecdsa", + "elliptic-curve", + "sha2", +] + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" +dependencies = [ + "proc-macro2", + "quote", + 
"syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs8" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +dependencies = [ + "der", + "spki", +] + +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + 
"rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex-lite" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" + +[[package]] +name = "rfc6979" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +dependencies = [ + "crypto-bigint 0.4.9", + "hmac", + "zeroize", +] + +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = 
"rustls" +version = "0.21.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "s3_union_find" +version = "0.1.0" +dependencies = [ + "anyhow", + "aws-config", + "aws-sdk-s3", + "byteorder", + "clap", + "indicatif", + "tokio", + "tokio-retry", +] + +[[package]] +name = "schannel" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "sec1" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +dependencies = [ + "base16ct", + "der", + "generic-array", + "pkcs8", + "subtle", + "zeroize", +] + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + +[[package]] +name = "serde" +version = "1.0.215" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.215" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = 
"0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + +[[package]] +name = "signature" +version = "1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +dependencies = [ + "digest", + "rand_core", +] + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "spki" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" +dependencies = [ + "base64ct", + 
"der", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "time" +version = "0.3.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" + +[[package]] +name = "time-macros" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinystr" +version = "0.7.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.41.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.52.0", +] + +[[package]] +name = "tokio-macros" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-retry" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f57eb36ecbe0fc510036adff84824dd3c24bb781e21bfa67b69d556aa85214f" +dependencies = [ + "pin-project", + "rand", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unicode-ident" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] 
+name = "wasm-bindgen" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + 
"quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/src/datatrove/tools/fast_mh3/Cargo.toml b/src/datatrove/tools/fast_mh3/Cargo.toml new file mode 100644 index 00000000..016d401d --- /dev/null +++ b/src/datatrove/tools/fast_mh3/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "s3_union_find" +version = "0.1.0" +edition = "2021" + +[dependencies] +# AWS SDK +aws-config = { version = "1.1.1", features = ["behavior-version-latest"] } +aws-sdk-s3 = "1.1.1" + +# CLI argument parsing +clap = { version = "4.4.2", features = ["derive"] } + +# Error handling +anyhow = "1.0.75" + +# Byte reading/writing +byteorder = "1.5.0" + +# Progress bars +indicatif = "0.17.7" + +# Async runtime and utilities +tokio = { version = "1.33.0", features = ["full"] } + +# Retries +tokio-retry 
= "0.3" + +[[bin]] +name = "s3" +path = "src/s3_union_find.rs" + +[[bin]] +name = "local" +path = "src/local_union_find.rs" \ No newline at end of file diff --git a/src/datatrove/tools/fast_mh3/README.md b/src/datatrove/tools/fast_mh3/README.md new file mode 100644 index 00000000..843e2c59 --- /dev/null +++ b/src/datatrove/tools/fast_mh3/README.md @@ -0,0 +1,42 @@ +This is a fast and memory-efficient implementation of MinHash step 3 written in Rust. + +Build and run with +``` +cargo build --release +./target/release/s3 --help +``` + +Two versions are available: +- `s3` reads and writes the data directly to S3 +- `local` reads and writes the data from/to the local filesystem + +Here's an example of a config with the Python version and the equivalent Rust command: +```python +BASE_PATH = "s3://some-bucket/minhash" +s3 = SlurmPipelineExecutor( + job_name=f"mh3", + pipeline=[ + MinhashDedupCluster( + input_folder=f"{BASE_PATH}/buckets", + output_folder=f"{BASE_PATH}/remove_ids", + save_cluster_size=True + ), + ], + tasks=1, + cpus_per_task=2, + mem_per_cpu_gb=450, + logging_dir=f"logs/clusters", + partition="hopper-cpu", + time="100:00:00" +).run() +``` + +Assuming step 2 was run with `minhash_config.num_buckets * 50 = 700` tasks + +``` +./target/release/s3 --input-folder s3://some-bucket/minhash/buckets/ --output-folder s3://some-bucket/minhash/remove_ids/ --total-files 700 --downloads 20 +``` +Or if running locally: +``` +./target/release/local --input-folder /fsx/some-path/minhash/buckets/ --output-folder /fsx/some-path/minhash/remove_ids/ --total-files 700 --concurrent-ops 20 +``` \ No newline at end of file diff --git a/src/datatrove/tools/fast_mh3/src/local_union_find.rs b/src/datatrove/tools/fast_mh3/src/local_union_find.rs new file mode 100644 index 00000000..e068eb85 --- /dev/null +++ b/src/datatrove/tools/fast_mh3/src/local_union_find.rs @@ -0,0 +1,376 @@ +use std::fs::{self, File}; +use std::io::{BufReader, BufWriter, Write}; +use std::path::{Path, PathBuf}; +use
std::collections::HashMap; +use anyhow::{Result}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use clap::Parser; +use indicatif::{ProgressBar, ProgressStyle}; +use tokio::task; +use std::sync::{Arc, Mutex}; +// use tokio::time::{Duration, sleep}; +use tokio::sync::Semaphore; + +// fn format_duration(duration: Duration) -> String { +// let secs = duration.as_secs(); +// let hours = secs / 3600; +// let minutes = (secs % 3600) / 60; +// let seconds = secs % 60; +// format!("{:02}:{:02}:{:02}", hours, minutes, seconds) +// } + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Input folder path + #[arg(long)] + input_folder: String, + + /// Output folder path + #[arg(long)] + output_folder: String, + + /// Total number of files to process + #[arg(long)] + total_files: usize, + + /// Total number of concurrent operations + #[arg(long, default_value = "0")] + concurrent_ops: usize, +} + +#[derive(Debug)] +struct UnionFindData { + union_set: HashMap<(u32, u32), (u32, u32)>, + set_size: HashMap<(u32, u32), usize>, +} + +#[derive(Debug)] +struct UnionFind { + data: Arc>, +} + +impl UnionFind { + fn new() -> Self { + UnionFind { + data: Arc::new(Mutex::new(UnionFindData { + union_set: HashMap::new(), + set_size: HashMap::new(), + })), + } + } +} + +async fn list_files(input_folder: &str, total_files: usize) -> Result> { + let mut files: Vec = fs::read_dir(input_folder)? 
+ .filter_map(|entry| entry.ok()) + .map(|entry| entry.path()) + .filter(|path| path.is_file()) + .collect(); + + files.sort(); + + if files.len() != total_files { + anyhow::bail!( + "Expected {} files, found {} in {}", + total_files, + files.len(), + input_folder + ); + } + + Ok(files) +} + +fn read_and_parse_file(file_path: &Path) -> Result> { + let file = File::open(file_path)?; + let mut reader = BufReader::new(file); + let mut tuples = Vec::new(); + + while let (Ok(f1), Ok(d1), Ok(f2), Ok(d2)) = ( + reader.read_u32::(), + reader.read_u32::(), + reader.read_u32::(), + reader.read_u32::(), + ) { + tuples.push((f1, d1, f2, d2)); + } + + Ok(tuples) +} + +async fn process_single_file( + output_folder: &Path, + file_number: u32, + union_find: &Arc, + pb: &ProgressBar, +) -> Result<(usize, usize)> { + let mut to_remove = 0; + let mut clusters = 0; + + let nodes_data = { + let mut docs = union_find.union_set.keys() + .filter(|(f, _)| *f == file_number) + .map(|(_, d)| *d) + .collect::>(); + docs.sort_unstable(); + + docs.into_iter().map(|doc| { + let node = (file_number, doc); + let mut current = node; + while let Some(&parent) = union_find.union_set.get(¤t) { + if parent == current { + break; + } + current = parent; + } + let root = current; + let size = *union_find.set_size.get(&root).unwrap_or(&1); + (doc, root, size) + }).collect::>() + }; + + let sizes_path = output_folder.join(format!("{:06}.sizes", file_number)); + let remove_path = output_folder.join(format!("{:06}.remove", file_number)); + + let mut sizes_writer = BufWriter::new(File::create(sizes_path)?); + let mut remove_writer = BufWriter::new(File::create(remove_path)?); + + for (doc, root, size) in nodes_data { + let node = (file_number, doc); + + // Write sizes + sizes_writer.write_u32::(doc)?; + sizes_writer.write_u32::(size as u32)?; + + // Handle removal markers + if node != root { + remove_writer.write_u32::(doc)?; + to_remove += 1; + } + + if node == root { + clusters += 1; + } + + pb.inc(1); + } + 
+ sizes_writer.flush()?; + remove_writer.flush()?; + + Ok((to_remove, clusters)) +} + +async fn process_post_union( + output_folder: &Path, + union_find: UnionFind, +) -> Result<(usize, usize)> { + let data = union_find.data.lock().unwrap(); + let mut files: Vec<_> = data.union_set.keys() + .map(|(f, _)| *f) + .collect::>() + .into_iter() + .collect(); + let total_nodes = data.union_set.len(); + drop(data); + + let union_find_data = Arc::new(Arc::try_unwrap(union_find.data) + .expect("All threads should be finished") + .into_inner() + .unwrap()); + + files.sort_unstable(); + + println!("Processing {} files in parallel...", files.len()); + let pb = Arc::new(ProgressBar::new(total_nodes as u64)); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta})") + .unwrap() + .progress_chars("#>-"), + ); + +// let pb_clone = Arc::clone(&pb); +// tokio::spawn(async move { +// while !pb_clone.is_finished() { +// let elapsed = pb_clone.elapsed(); +// let eta = pb_clone.eta(); +// // eprintln!( +// "Progress: {}/{} | Elapsed: {} | Remaining: {}", +// pb_clone.position(), +// pb_clone.length().unwrap_or(0), +// format_duration(elapsed), +// format_duration(eta) +// ); +// sleep(Duration::from_secs(5)).await; +// } +// }); + + let semaphore = Arc::new(Semaphore::new(100)); + let mut handles = Vec::new(); + + for file_number in files { + let output_folder = output_folder.to_path_buf(); + let union_find_data = Arc::clone(&union_find_data); + let pb = pb.clone(); + let semaphore = Arc::clone(&semaphore); + + let handle = task::spawn(async move { + let _permit = semaphore.acquire().await?; + process_single_file( + &output_folder, + file_number, + &union_find_data, + &pb, + ).await + }); + handles.push(handle); + } + + let mut total_to_remove = 0; + let mut total_clusters = 0; + + for handle in handles { + let (to_remove, clusters) = handle.await??; + total_to_remove += to_remove; + total_clusters += 
clusters; + } + + pb.finish_with_message("Output writing complete"); + + Ok((total_to_remove, total_clusters)) +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + // Create output directory if it doesn't exist + fs::create_dir_all(&args.output_folder)?; + + let files = list_files(&args.input_folder, args.total_files).await?; + + let union_find = UnionFind::new(); + let semaphore = Arc::new(if args.concurrent_ops == 0 { + Semaphore::new(args.total_files) // Effectively unlimited + } else { + Semaphore::new(args.concurrent_ops) + }); + + println!("Processing {} input files...", files.len()); + let pb = Arc::new(ProgressBar::new(files.len() as u64)); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta})") + .unwrap() + .progress_chars("#>-"), + ); + +// let pb_clone = Arc::clone(&pb); +// tokio::spawn(async move { +// while !pb_clone.is_finished() { +// let elapsed = pb_clone.elapsed(); +// let eta = pb_clone.eta(); +// eprintln!( +// "Progress: {}/{} | Elapsed: {} | Remaining: {}", +// pb_clone.position(), +// pb_clone.length().unwrap_or(0), +// format_duration(elapsed), +// format_duration(eta) +// ); +// sleep(Duration::from_secs(5)).await; +// } +// }); + + let mut handles = Vec::new(); + + for file_path in files { + let union_find = Arc::clone(&union_find.data); + let pb = pb.clone(); + let semaphore = Arc::clone(&semaphore); + let file_path = file_path.clone(); + + let handle = task::spawn(async move { + let _permit = semaphore.acquire().await?; + let tuples = read_and_parse_file(&file_path)?; + + let mut data = union_find.lock().unwrap(); + for (f1, d1, f2, d2) in tuples { + let v_a = (f1, d1); + let v_b = (f2, d2); + + let root_a = { + let mut current = v_a; + let mut path = Vec::new(); + while let Some(&parent) = data.union_set.get(¤t) { + if parent == current { + break; + } + path.push(current); + current = parent; + } + if 
!data.union_set.contains_key(¤t) { + data.union_set.insert(current, current); + } + for node in path { + data.union_set.insert(node, current); + } + current + }; + + let root_b = { + let mut current = v_b; + let mut path = Vec::new(); + while let Some(&parent) = data.union_set.get(¤t) { + if parent == current { + break; + } + path.push(current); + current = parent; + } + if !data.union_set.contains_key(¤t) { + data.union_set.insert(current, current); + } + for node in path { + data.union_set.insert(node, current); + } + current + }; + + if root_a != root_b { + let size_a = *data.set_size.get(&root_a).unwrap_or(&1); + let size_b = *data.set_size.get(&root_b).unwrap_or(&1); + + let (big_root, small_root) = if size_a >= size_b { + (root_a, root_b) + } else { + (root_b, root_a) + }; + + data.union_set.insert(small_root, big_root); + data.set_size.insert(big_root, size_a + size_b); + data.set_size.remove(&small_root); + } + } + drop(data); + pb.inc(1); + Ok::<(), anyhow::Error>(()) + }); + + handles.push(handle); + } + + for handle in handles { + handle.await??; + } + pb.finish_with_message("File processing complete"); + + let (to_remove, clusters) = process_post_union(Path::new(&args.output_folder), union_find).await?; + + println!("Processing complete:"); + println!(" Total clusters: {}", clusters); + println!(" Documents to remove: {}", to_remove); + + Ok(()) +} \ No newline at end of file diff --git a/src/datatrove/tools/fast_mh3/src/s3_union_find.rs b/src/datatrove/tools/fast_mh3/src/s3_union_find.rs new file mode 100644 index 00000000..9b95b1ea --- /dev/null +++ b/src/datatrove/tools/fast_mh3/src/s3_union_find.rs @@ -0,0 +1,568 @@ +use std::io::Cursor; +use std::collections::HashMap; +use anyhow::{Context, Result}; +use aws_sdk_s3::Client; +use aws_sdk_s3::primitives::ByteStream; +use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use clap::Parser; +use indicatif::{ProgressBar, 
ProgressStyle}; +use tokio::task; +use std::sync::{Arc, Mutex}; +use tokio_retry::Retry; +use tokio_retry::strategy::{ExponentialBackoff, jitter}; +use tokio::time::{Duration, sleep}; +use tokio::sync::Semaphore; + +fn format_duration(duration: Duration) -> String { + let secs = duration.as_secs(); + let hours = secs / 3600; + let minutes = (secs % 3600) / 60; + let seconds = secs % 60; + format!("{:02}:{:02}:{:02}", hours, minutes, seconds) +} + +async fn with_retry(f: F) -> Result +where + F: Fn() -> Fut, + Fut: std::future::Future>, +{ + let retry_strategy = ExponentialBackoff::from_millis(1000) + .max_delay(Duration::from_secs(300)) + .map(jitter) + .take(3); + + Retry::spawn(retry_strategy, || async { + f().await + }).await +} + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// Input S3 folder path (e.g., s3://bucket/input/) + #[arg(long)] + input_folder: String, + + /// Output S3 folder path (e.g., s3://bucket/output/) + #[arg(long)] + output_folder: String, + + /// Total number of files to process + #[arg(long)] + total_files: usize, + + /// Total number of concurrent downloads + #[arg(long, default_value = "0")] + downloads: usize, +} + +#[derive(Debug, Clone)] +struct S3Path { + bucket: String, + prefix: String, +} + +impl S3Path { + fn from_path(path: &str) -> Result { + let parts: Vec<&str> = path.trim_start_matches("s3://").split('/').collect(); + if parts.len() < 2 { + anyhow::bail!("Invalid S3 path: {}", path); + } + Ok(S3Path { + bucket: parts[0].to_string(), + prefix: parts[1..].join("/"), + }) + } + + fn with_key(&self, key: &str) -> String { + format!("{}/{}", self.prefix.trim_end_matches('/'), key) + } +} + +#[derive(Debug)] +struct UnionFindData { + union_set: HashMap<(u32, u32), (u32, u32)>, + set_size: HashMap<(u32, u32), usize>, +} + +#[derive(Debug)] +struct UnionFind { + data: Arc>, +} + +impl UnionFind { + fn new() -> Self { + UnionFind { + data: Arc::new(Mutex::new(UnionFindData { + union_set: 
HashMap::new(), + set_size: HashMap::new(), + })), + } + } +} + +struct S3StreamWriter { + client: Client, + bucket: String, + key: String, + upload_id: String, + buffer: Vec, + part_number: i32, + completed_parts: Vec, + buffer_threshold: usize, +} + +impl S3StreamWriter { + async fn new( + client: &Client, + bucket: &str, + key: &str, + buffer_threshold: usize, + ) -> Result { + let create_multipart_upload_output = with_retry(|| async { + client + .create_multipart_upload() + .bucket(bucket) + .key(key) + .send() + .await + .context("Failed to create multipart upload") + }).await?; + + Ok(Self { + client: client.clone(), + bucket: bucket.to_string(), + key: key.to_string(), + upload_id: create_multipart_upload_output.upload_id().unwrap().to_string(), + buffer: Vec::new(), + part_number: 1, + completed_parts: Vec::new(), + buffer_threshold, + }) + } + + async fn write(&mut self, data: &[u8]) -> Result<()> { + self.buffer.extend_from_slice(data); + + if self.buffer.len() >= self.buffer_threshold { + self.flush().await?; + } + + Ok(()) + } + + async fn flush(&mut self) -> Result<()> { + if self.buffer.is_empty() { + return Ok(()); + } + + let buffer_clone = self.buffer.clone(); + let upload_part_output = with_retry(|| async { + let part_body = ByteStream::from(buffer_clone.clone()); + self.client + .upload_part() + .bucket(&self.bucket) + .key(&self.key) + .upload_id(&self.upload_id) + .part_number(self.part_number) + .body(part_body) + .send() + .await + .context("Failed to upload part") + }).await?; + + let completed_part = CompletedPart::builder() + .e_tag(upload_part_output.e_tag().unwrap_or_default()) + .part_number(self.part_number) + .build(); + + self.completed_parts.push(completed_part); + self.part_number += 1; + self.buffer.clear(); + + Ok(()) + } + + async fn finalize(mut self) -> Result<()> { + self.flush().await?; + + let completed_multipart_upload = CompletedMultipartUpload::builder() + .set_parts(Some(self.completed_parts.clone())) + .build(); + + 
with_retry(|| async { + self.client + .complete_multipart_upload() + .bucket(&self.bucket) + .key(&self.key) + .upload_id(&self.upload_id) + .multipart_upload(completed_multipart_upload.clone()) + .send() + .await + .context("Failed to complete multipart upload") + }).await?; + + Ok(()) + } +} + +async fn list_s3_files(client: &Client, s3_path: &S3Path, total_files: usize) -> Result> { + let resp = with_retry(|| async { + client + .list_objects_v2() + .bucket(&s3_path.bucket) + .prefix(&s3_path.prefix) + .send() + .await + .context("Failed to list S3 objects") + }).await?; + + let mut files: Vec = resp + .contents() + .iter() + .filter_map(|obj| obj.key() + .map(|key| format!("s3://{}/{}", s3_path.bucket, key))) + .collect(); + + files.sort(); + + if files.len() != total_files { + anyhow::bail!( + "Expected {} files, found {} in s3://{}/{}", + total_files, + files.len(), + s3_path.bucket, + s3_path.prefix + ); + } + + Ok(files) +} + +async fn download_and_parse_file(client: &Client, file_path: &str) -> Result> { + let s3_path = S3Path::from_path(file_path)?; + + let resp = with_retry(|| async { + client + .get_object() + .bucket(&s3_path.bucket) + .key(&s3_path.prefix) + .send() + .await + .context("Failed to download S3 object") + }).await?; + + let body = resp.body.collect().await?.into_bytes(); + let mut reader = Cursor::new(body); + let mut tuples = Vec::new(); + + while let (Ok(f1), Ok(d1), Ok(f2), Ok(d2)) = ( + reader.read_u32::(), + reader.read_u32::(), + reader.read_u32::(), + reader.read_u32::(), + ) { + tuples.push((f1, d1, f2, d2)); + } + + Ok(tuples) +} + +async fn process_single_file( + client: &Client, + output_path: &S3Path, + file_number: u32, + union_find: &Arc, + pb: &ProgressBar, +) -> Result<(usize, usize)> { + let mut to_remove = 0; + let mut clusters = 0; + const BUFFER_THRESHOLD: usize = 5 * 1024 * 1024; + + let nodes_data = { + let mut docs = union_find.union_set.keys() + .filter(|(f, _)| *f == file_number) + .map(|(_, d)| *d) + 
.collect::>(); + docs.sort_unstable(); + + docs.into_iter().map(|doc| { + let node = (file_number, doc); + let mut current = node; + while let Some(&parent) = union_find.union_set.get(¤t) { + if parent == current { + break; + } + current = parent; + } + let root = current; + let size = *union_find.set_size.get(&root).unwrap_or(&1); + (doc, root, size) + }).collect::>() + }; + + let mut sizes_writer = S3StreamWriter::new( + client, + &output_path.bucket, + &output_path.with_key(&format!("{:06}.sizes", file_number)), + BUFFER_THRESHOLD, + ).await?; + + let mut remove_writer = S3StreamWriter::new( + client, + &output_path.bucket, + &output_path.with_key(&format!("{:06}.remove", file_number)), + BUFFER_THRESHOLD, + ).await?; + + for (doc, root, size) in nodes_data { + let node = (file_number, doc); + + // Write sizes + let mut buffer = Vec::new(); + buffer.write_u32::(doc)?; + buffer.write_u32::(size as u32)?; + sizes_writer.write(&buffer).await?; + + // Handle removal markers + if node != root { + let mut remove_buffer = Vec::new(); + remove_buffer.write_u32::(doc)?; + remove_writer.write(&remove_buffer).await?; + to_remove += 1; + } + + if node == root { + clusters += 1; + } + + pb.inc(1); + } + + sizes_writer.finalize().await?; + remove_writer.finalize().await?; + + Ok((to_remove, clusters)) +} + +async fn process_post_union( + client: &Client, + output_path: &S3Path, + union_find: UnionFind, // Changed from &UnionFind to take ownership +) -> Result<(usize, usize)> { + let data = union_find.data.lock().unwrap(); + let mut files: Vec<_> = data.union_set.keys() + .map(|(f, _)| *f) + .collect::>() + .into_iter() + .collect(); + let total_nodes = data.union_set.len(); + drop(data); + + // Convert to immutable Arc + let union_find_data = Arc::new(Arc::try_unwrap(union_find.data) + .expect("All threads should be finished") + .into_inner() + .unwrap()); + + files.sort_unstable(); + + println!("Processing {} files in parallel...", files.len()); + let pb = 
Arc::new(ProgressBar::new(total_nodes as u64)); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta})") + .unwrap() + .progress_chars("#>-"), + ); + + let pb_clone = Arc::clone(&pb); + tokio::spawn(async move { + while !pb_clone.is_finished() { + let elapsed = pb_clone.elapsed(); // Time elapsed since progress bar creation + let eta = pb_clone.eta(); // Estimated time remaining + eprintln!( + "Progress: {}/{} | Elapsed: {} | Remaining: {}", + pb_clone.position(), + pb_clone.length().unwrap_or(0), + format_duration(elapsed), + format_duration(eta) + ); + sleep(Duration::from_secs(5)).await; + } + }); + + + let semaphore = Arc::new(Semaphore::new(100)); + + + let mut handles = Vec::new(); + for file_number in files { + let client = client.clone(); + let output_path = output_path.clone(); + let union_find_data = Arc::clone(&union_find_data); + let pb = pb.clone(); + let semaphore = Arc::clone(&semaphore); + + let handle = task::spawn(async move { + let _permit = semaphore.acquire().await?; + process_single_file( + &client, + &output_path, + file_number, + &union_find_data, + &pb, + ).await + }); + handles.push(handle); + } + + let mut total_to_remove = 0; + let mut total_clusters = 0; + + for handle in handles { + let (to_remove, clusters) = handle.await??; + total_to_remove += to_remove; + total_clusters += clusters; + } + + pb.finish_with_message("Output writing complete"); + + Ok((total_to_remove, total_clusters)) +} + +#[tokio::main] +async fn main() -> Result<()> { + let args = Args::parse(); + + let config = aws_config::load_defaults(aws_config::BehaviorVersion::latest()).await; + let client = Client::new(&config); + + let input_path = S3Path::from_path(&args.input_folder)?; + let output_path = S3Path::from_path(&args.output_folder)?; + + let files = list_s3_files(&client, &input_path, args.total_files).await?; + + let union_find = UnionFind::new(); + let semaphore = Arc::new(if 
args.downloads == 0 { + Semaphore::new(args.total_files) // Effectively unlimited + } else { + Semaphore::new(args.downloads) + }); + + println!("Processing {} input files...", files.len()); + let pb = Arc::new(ProgressBar::new(files.len() as u64)); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} ({eta})") + .unwrap() + .progress_chars("#>-"), + ); + + let pb_clone = Arc::clone(&pb); + tokio::spawn(async move { + while !pb_clone.is_finished() { + let elapsed = pb_clone.elapsed(); // Time elapsed since progress bar creation + let eta = pb_clone.eta(); // Estimated time remaining + eprintln!( + "Progress: {}/{} | Elapsed: {} | Remaining: {}", + pb_clone.position(), + pb_clone.length().unwrap_or(0), + format_duration(elapsed), + format_duration(eta) + ); + sleep(Duration::from_secs(5)).await; + } + }); + let mut handles = Vec::new(); + + for file_path in files { + let client = client.clone(); + let union_find = Arc::clone(&union_find.data); + let pb = pb.clone(); + let semaphore = Arc::clone(&semaphore); + + let handle = task::spawn(async move { + let _permit = semaphore.acquire().await?; + let tuples = download_and_parse_file(&client, &file_path).await?; + + let mut data = union_find.lock().unwrap(); + for (f1, d1, f2, d2) in tuples { + let v_a = (f1, d1); + let v_b = (f2, d2); + + let root_a = { + let mut current = v_a; + let mut path = Vec::new(); + while let Some(&parent) = data.union_set.get(&current) { + if parent == current { + break; + } + path.push(current); + current = parent; + } + if !data.union_set.contains_key(&current) { + data.union_set.insert(current, current); + } + for node in path { + data.union_set.insert(node, current); + } + current + }; + + let root_b = { + let mut current = v_b; + let mut path = Vec::new(); + while let Some(&parent) = data.union_set.get(&current) { + if parent == current { + break; + } + path.push(current); + current = parent; + } + if
!data.union_set.contains_key(&current) { + data.union_set.insert(current, current); + } + for node in path { + data.union_set.insert(node, current); + } + current + }; + + if root_a != root_b { + let size_a = *data.set_size.get(&root_a).unwrap_or(&1); + let size_b = *data.set_size.get(&root_b).unwrap_or(&1); + + let (big_root, small_root) = if size_a >= size_b { + (root_a, root_b) + } else { + (root_b, root_a) + }; + + data.union_set.insert(small_root, big_root); + data.set_size.insert(big_root, size_a + size_b); + data.set_size.remove(&small_root); + } + } + drop(data); + pb.inc(1); + Ok::<(), anyhow::Error>(()) + }); + + handles.push(handle); + } + + for handle in handles { + handle.await??; + } + pb.finish_with_message("File processing complete"); + + let (to_remove, clusters) = process_post_union(&client, &output_path, union_find).await?; + + println!("Processing complete:"); + println!(" Total clusters: {}", clusters); + println!(" Documents to remove: {}", to_remove); + + Ok(()) +} \ No newline at end of file diff --git a/src/datatrove/utils/japanese_tokenizer.py b/src/datatrove/utils/japanese_tokenizer.py new file mode 100644 index 00000000..ce1f4808 --- /dev/null +++ b/src/datatrove/utils/japanese_tokenizer.py @@ -0,0 +1,310 @@ +""" +Temporary fix for the memory leak in spacy's Japanese tokenizer. +Once the fix for https://github.com/explosion/spaCy/issues/13684 is released, +this can be removed. 
+""" + +import re +from collections import namedtuple +from pathlib import Path +from typing import Any, Dict, Optional, Union + +import srsly +from spacy import util +from spacy.errors import Errors +from spacy.lang.ja.stop_words import STOP_WORDS +from spacy.lang.ja.syntax_iterators import SYNTAX_ITERATORS +from spacy.lang.ja.tag_bigram_map import TAG_BIGRAM_MAP +from spacy.lang.ja.tag_map import TAG_MAP +from spacy.lang.ja.tag_orth_map import TAG_ORTH_MAP +from spacy.language import BaseDefaults, Language +from spacy.scorer import Scorer +from spacy.symbols import POS +from spacy.tokens import Doc +from spacy.training import validate_examples +from spacy.util import DummyTokenizer, load_config_from_str, registry +from spacy.vocab import Vocab + + +DEFAULT_CONFIG = """ +[nlp] + +[nlp.tokenizer] +@tokenizers = "spacy.ja.JapaneseTokenizer" +split_mode = null +""" + + +@registry.tokenizers("spacy.ja.JapaneseTokenizer") +def create_tokenizer(split_mode: Optional[str] = None): + def japanese_tokenizer_factory(nlp): + return JapaneseTokenizer(nlp.vocab, split_mode=split_mode) + + return japanese_tokenizer_factory + + +class JapaneseTokenizer(DummyTokenizer): + def __init__(self, vocab: Vocab, split_mode: Optional[str] = None) -> None: + self.vocab = vocab + self.split_mode = split_mode + self.tokenizer = try_sudachi_import(self.split_mode) + # if we're using split mode A we don't need subtokens + self.need_subtokens = not (split_mode is None or split_mode == "A") + + def __reduce__(self): + return JapaneseTokenizer, (self.vocab, self.split_mode) + + def __call__(self, text: str) -> Doc: + # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces + sudachipy_tokens = self.tokenizer.tokenize(text) + dtokens = self._get_dtokens(sudachipy_tokens) + dtokens, spaces = get_dtokens_and_spaces(dtokens, text) + + # create Doc with tag bi-gram based part-of-speech identification rules + words, tags, inflections, lemmas, norms, readings, sub_tokens_list = 
zip(*dtokens) if dtokens else [[]] * 7 + sub_tokens_list = list(sub_tokens_list) + doc = Doc(self.vocab, words=words, spaces=spaces) + next_pos = None # for bi-gram rules + for idx, (token, dtoken) in enumerate(zip(doc, dtokens)): + token.tag_ = dtoken.tag + if next_pos: # already identified in previous iteration + token.pos = next_pos + next_pos = None + else: + token.pos, next_pos = resolve_pos( + token.orth_, + dtoken.tag, + tags[idx + 1] if idx + 1 < len(tags) else None, + ) + # if there's no lemma info (it's an unk) just use the surface + token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface + morph = {} + if dtoken.inf: + # it's normal for this to be empty for non-inflecting types + morph["Inflection"] = dtoken.inf + token.norm_ = dtoken.norm + if dtoken.reading: + # punctuation is its own reading, but we don't want values like + # "=" here + morph["Reading"] = re.sub("[=|]", "_", dtoken.reading) + # ! Important to prevent the memory leak ! + # token.morph = MorphAnalysis(self.vocab, morph) + if self.need_subtokens: + doc.user_data["sub_tokens"] = sub_tokens_list + return doc + + def _get_dtokens(self, sudachipy_tokens, need_sub_tokens: bool = True): + sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None + dtokens = [ + DetailedToken( + token.surface(), # orth + "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"]), # tag + ";".join([xx for xx in token.part_of_speech()[4:] if xx != "*"]), # inf + token.dictionary_form(), # lemma + token.normalized_form(), + token.reading_form(), + sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens'] + ) + for idx, token in enumerate(sudachipy_tokens) + if len(token.surface()) > 0 + # remove empty tokens which can be produced with characters like … that + ] + # Sudachi normalizes internally and outputs each space char as a token. 
+ # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens + return [ + t + for idx, t in enumerate(dtokens) + if idx == 0 + or not t.surface.isspace() + or t.tag != "空白" + or not dtokens[idx - 1].surface.isspace() + or dtokens[idx - 1].tag != "空白" + ] + + def _get_sub_tokens(self, sudachipy_tokens): + # do nothing for default split mode + if not self.need_subtokens: + return None + + sub_tokens_list = [] # list of (list of list of DetailedToken | None) + for token in sudachipy_tokens: + sub_a = token.split(self.tokenizer.SplitMode.A) + if len(sub_a) == 1: # no sub tokens + sub_tokens_list.append(None) + elif self.split_mode == "B": + sub_tokens_list.append([self._get_dtokens(sub_a, False)]) + else: # "C" + sub_b = token.split(self.tokenizer.SplitMode.B) + if len(sub_a) == len(sub_b): + dtokens = self._get_dtokens(sub_a, False) + sub_tokens_list.append([dtokens, dtokens]) + else: + sub_tokens_list.append( + [ + self._get_dtokens(sub_a, False), + self._get_dtokens(sub_b, False), + ] + ) + return sub_tokens_list + + def score(self, examples): + validate_examples(examples, "JapaneseTokenizer.score") + return Scorer.score_tokenization(examples) + + def _get_config(self) -> Dict[str, Any]: + return {"split_mode": self.split_mode} + + def _set_config(self, config: Dict[str, Any] = {}) -> None: + self.split_mode = config.get("split_mode", None) + + def to_bytes(self, **kwargs) -> bytes: + serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())} + return util.to_bytes(serializers, []) + + def from_bytes(self, data: bytes, **kwargs) -> "JapaneseTokenizer": + deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))} + util.from_bytes(data, deserializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + return self + + def to_disk(self, path: Union[str, Path], **kwargs) -> None: + path = util.ensure_path(path) + serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())} + util.to_disk(path, 
serializers, []) + + def from_disk(self, path: Union[str, Path], **kwargs) -> "JapaneseTokenizer": + path = util.ensure_path(path) + serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))} + util.from_disk(path, serializers, []) + self.tokenizer = try_sudachi_import(self.split_mode) + return self + + +class JapaneseDefaults(BaseDefaults): + config = load_config_from_str(DEFAULT_CONFIG) + stop_words = STOP_WORDS + syntax_iterators = SYNTAX_ITERATORS + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + +class Japanese(Language): + lang = "ja" + Defaults = JapaneseDefaults + + +# Hold the attributes we need with convenient names +DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "norm", "reading", "sub_tokens"]) + + +def try_sudachi_import(split_mode="A"): + """SudachiPy is required for Japanese support, so check for it. + If it's not available, blow up and explain how to fix it. + split_mode should be one of these values: "A", "B", "C", None->"A".""" + try: + from sudachipy import dictionary, tokenizer + + split_mode = { + None: tokenizer.Tokenizer.SplitMode.A, + "A": tokenizer.Tokenizer.SplitMode.A, + "B": tokenizer.Tokenizer.SplitMode.B, + "C": tokenizer.Tokenizer.SplitMode.C, + }[split_mode] + tok = dictionary.Dictionary().create(mode=split_mode) + return tok + except ImportError: + raise ImportError( + "Japanese support requires SudachiPy and SudachiDict-core " + "(https://github.com/WorksApplications/SudachiPy). " + "Install with `pip install sudachipy sudachidict_core` or " + "install spaCy with `pip install spacy[ja]`." + ) from None + + +def resolve_pos(orth, tag, next_tag): + """If necessary, add a field to the POS tag for UD mapping. + Under Universal Dependencies, sometimes the same Unidic POS tag can + be mapped differently depending on the literal token or its context + in the sentence. This function returns resolved POSs for both token + and next_token by tuple. 
+ """ + + # Some tokens have their UD tag decided based on the POS of the following + # token. + + # apply orth based mapping + if tag in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[tag] + if orth in orth_map: + return orth_map[orth], None # current_pos, next_pos + + # apply tag bi-gram mapping + if next_tag: + tag_bigram = tag, next_tag + if tag_bigram in TAG_BIGRAM_MAP: + current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram] + if current_pos is None: # apply tag uni-gram mapping for current_pos + return ( + TAG_MAP[tag][POS], + next_pos, + ) # only next_pos is identified by tag bi-gram mapping + else: + return current_pos, next_pos + + # apply tag uni-gram mapping + return TAG_MAP[tag][POS], None + + +def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): + # Compare the content of tokens and text, first + words = [x.surface for x in dtokens] + if "".join("".join(words).split()) != "".join(text.split()): + raise ValueError(Errors.E194.format(text=text, words=words)) + + text_dtokens = [] + text_spaces = [] + text_pos = 0 + # handle empty and whitespace-only texts + if len(words) == 0: + return text_dtokens, text_spaces + elif len([word for word in words if not word.isspace()]) == 0: + assert text.isspace() + text_dtokens = [DetailedToken(text, gap_tag, "", text, text, None, None)] + text_spaces = [False] + return text_dtokens, text_spaces + + # align words and dtokens by referring text, and insert gap tokens for the space char spans + for i, (word, dtoken) in enumerate(zip(words, dtokens)): + # skip all space tokens + if word.isspace(): + continue + try: + word_start = text[text_pos:].index(word) + except ValueError: + raise ValueError(Errors.E194.format(text=text, words=words)) from None + + # space token + if word_start > 0: + w = text[text_pos : text_pos + word_start] + text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None)) + text_spaces.append(False) + text_pos += word_start + + # content word + text_dtokens.append(dtoken) + text_spaces.append(False) 
+ text_pos += len(word) + # poll a space char after the word + if i + 1 < len(dtokens) and dtokens[i + 1].surface == " ": + text_spaces[-1] = True + text_pos += 1 + + # trailing space token + if text_pos < len(text): + w = text[text_pos:] + text_dtokens.append(DetailedToken(w, gap_tag, "", w, w, None, None)) + text_spaces.append(False) + + return text_dtokens, text_spaces + + +__all__ = ["Japanese"] diff --git a/src/datatrove/utils/text.py b/src/datatrove/utils/text.py index a2d713bc..7ab7d3d4 100644 --- a/src/datatrove/utils/text.py +++ b/src/datatrove/utils/text.py @@ -1,11 +1,13 @@ import re import unicodedata from dataclasses import dataclass +from functools import lru_cache from itertools import tee from typing import Iterable +import regex + from datatrove.utils.typeshelper import Languages -from datatrove.utils.word_tokenizers import load_word_tokenizer PUNCTUATION = "!/—”:%1〈&(、━\\【#%「」,】;+^]~“《„';’{|∶´[=-`*.(–?!:$~«〉,><》)?)。…@_.\"}►»" + "".join( @@ -14,7 +16,169 @@ (x for a, b in ((0, 9), (11, 13), (13, 32), (127, 160)) for x in range(a, b)), ) ) -PUNCTUATION_SET = set(PUNCTUATION) +TERMINAL_PUNCTUATION = { + "᪩", + "?", + "⁈", + "𑩂", + ".", + "꩞", + "𑅃", + "﹗", + "𑂾", + "\u1b7d", + "፧", + "𑅂", + "꡶", + "꘎", + "⁉", + "࠾", + "᪨", + "𑊩", + "𑱂", + "᱿", + "𖩮", + "᥅", + "\U00011f43", + "\U00011f44", + "﹒", + "𑈹", + "𑈸", + "።", + "܂", + "؞", + "꛳", + "\U00010f88", + "𑗍", + "𐩖", + "𑙂", + "\u061d", + "꩟", + "᠉", + "\u1b7e", + "𑗗", + "᰼", + "𑻸", + "؟", + "𑪜", + "꧉", + "𑗉", + "𐽙", + "𖫵", + "𖬷", + "܀", + "꓿", + "᜵", + "𑗏", + "𑁇", + "𑗓", + "𑥄", + "៖", + "𑥆", + "𑗑", + "𑗒", + "꯫", + "۔", + "𐩗", + "\U00010f86", + "꡷", + "\u2e54", + "。", + "៕", + "߹", + "⸮", + ".", + "𑇅", + "࠹", + "𛲟", + "꫰", + "꤯", + "𐽗", + "᭞", + "𑜼", + "፨", + "𑃁", + "꣏", + "𑇟", + "𖬸", + "𑪛", + "𑜾", + "࠷", + "𝪈", + "?", + "𑃀", + "𑗃", + "!", + "։", + "꣎", + "॥", + "𑗖", + "᭛", + "᠃", + "!", + "၊", + "𖺘", + "⁇", + "𑗌", + "𑑋", + "𖭄", + "᭟", + "𑅁", + "𑙁", + "⸼", + "꩝", + "𑗋", + "。", + "꧈", + "꫱", + "𑜽", + 
"𐽖", + "𑂿", + "᙮", + "។", + "꛷", + "\U00010f89", + "៚", + "᥄", + "𑗕", + "𑗎", + "᪪", + "᭚", + "࠽", + "𑇞", + "𑗊", + "𐽘", + "\u2e53", + "𑗔", + "𖩯", + "𑇍", + "𑻷", + "𐽕", + "𑩃", + "।", + "𑗂", + "𑇆", + "𑁈", + "။", + "᱾", + "𑱁", + "꘏", + "܁", + "᜶", + "‼", + "𑈻", + "‽", + "᪫", + "﹖", + "𑑌", + "𑈼", + "\U00010f87", + "𑗐", + "៙", + "᰻", +} +# add other scripts +PUNCTUATION_SET = set(PUNCTUATION).union(TERMINAL_PUNCTUATION) PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) @@ -30,7 +194,15 @@ class TextNormConfig: DEF_TEXT_NORM_CONFIG = TextNormConfig() -NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") +# Match digits in any script, allowing for different decimal separators +# One or more digits in any script +# Common decimal separators (period, comma, Arabic decimal, etc) +# Optional decimal part with digits +# we need regex and not re for this one to match unicode +NUMBERS_PATTERN = regex.compile( + r"\p{Nd}+([.,،٫⎖⎗⎘]{1}\p{Nd}+)?", + regex.VERBOSE | regex.UNICODE, +) WHITESPACE_PATTERN = re.compile(r"\s+") # WARNING: english specific WEEKDAYS_PATTERN = re.compile(r"monday|tuesday|wednesday|thursday|friday|saturday|sunday") @@ -98,15 +270,22 @@ def ngrams(sequence: Iterable, n: int): SPLIT_TEXT_DOCUMENTS = "DOCUMENT" SPLIT_TEXT_SENTENCES = "SENTENCE" SPLIT_TEXT_PARAGRAPHS = "PARAGRAPH" +SPLIT_TEXT_WORDS = "WORDS" +@lru_cache(5) def split_into_parts(text, mode="DOCUMENT", language=Languages.english): + from datatrove.utils.word_tokenizers import load_word_tokenizer + if mode == SPLIT_TEXT_DOCUMENTS: return [text] elif mode == SPLIT_TEXT_SENTENCES: tokenizer = load_word_tokenizer(language) spans = [b for _, b in tokenizer.span_tokenize(text)] return [text[a:b] for a, b in zip([0] + spans[:-1], spans[:-1] + [len(text)])] + elif mode == SPLIT_TEXT_WORDS: + tokenizer = load_word_tokenizer(language) + return tokenizer.word_tokenize(text) elif mode == SPLIT_TEXT_PARAGRAPHS: # merge whitespace with prev line og_lines = text.splitlines() @@ -124,3 +303,15 @@ def 
split_into_parts(text, mode="DOCUMENT", language=Languages.english): return lines else: raise ValueError(f"Unknown {mode=}") + + +def split_into_words(text, language=Languages.english): + return split_into_parts(text, mode=SPLIT_TEXT_WORDS, language=language) + + +def split_into_sentences(text, language=Languages.english): + return split_into_parts(text, mode=SPLIT_TEXT_SENTENCES, language=language) + + +def split_into_paragraphs(text, language=Languages.english): + return split_into_parts(text, mode=SPLIT_TEXT_PARAGRAPHS, language=language) diff --git a/src/datatrove/utils/typeshelper.py b/src/datatrove/utils/typeshelper.py index 5ea348b8..9259c8b3 100644 --- a/src/datatrove/utils/typeshelper.py +++ b/src/datatrove/utils/typeshelper.py @@ -1,121 +1,4095 @@ -from dataclasses import dataclass - - -@dataclass -class NiceRepr: - emoji: str - name: str - - def __post_init__(self): - self.name = self.name.capitalize() - - def get_name(self): - return f"---> {self.emoji} {self.name}\n" - - class Languages: - english = "en" - spanish = "es" - portuguese = "pt" - italian = "it" - french = "fr" - romanian = "ro" - german = "de" - latin = "la" - czech = "cs" - danish = "da" - finnish = "fi" - greek = "el" - norwegian = "no" - polish = "pl" - russian = "ru" - slovenian = "sl" - swedish = "sv" - turkish = "tr" - dutch = "nl" - chinese = "zh" - japanese = "ja" - vietnamese = "vi" - indonesian = "id" - persian = "fa" - korean = "ko" - arabic = "ar" - thai = "th" - hindi = "hi" - bengali = "bn" - tamil = "ta" - hungarian = "hu" - ukrainian = "uk" - slovak = "sk" - bulgarian = "bg" - catalan = "ca" - croatian = "hr" - serbian = "sr" - lithuanian = "lt" - estonian = "et" - hebrew = "he" - latvian = "lv" - serbocroatian = "sh" # Deprecated - albanian = "sq" - azerbaijani = "az" - icelandic = "is" - macedonian = "mk" - georgian = "ka" - galician = "gl" - armenian = "hy" - basque = "eu" - swahili = "sw" - malay = "ms" - tagalog = "tl" - javanese = "jv" - punjabi = "pa" - bihari = "bh" 
# Deprecated - gujarati = "gu" - yoruba = "yo" - marathi = "mr" - urdu = "ur" - amharic = "am" - telugu = "te" - malayalam = "ml" - kannada = "kn" - nepali = "ne" - kazakh = "kk" - belarusian = "be" - burmese = "my" - esperanto = "eo" - uzbek = "uz" - khmer = "km" - tajik = "tg" - welsh = "cy" - norwegian_nynorsk = "nn" - bosnian = "bs" - sinhala = "si" - tatar = "tt" - afrikaans = "af" - oriya = "or" - kirghiz = "ky" - irish = "ga" - occitan = "oc" - kurdish = "ku" - lao = "lo" - luxembourgish = "lb" - bashkir = "ba" - western_frisian = "fy" - pashto = "ps" - maltese = "mt" - breton = "bt" - assamese = "as" - malagasy = "mg" - divehi = "dv" - yiddish = "yi" - somali = "so" - sanskrit = "sa" - sindhi = "sd" - turkmen = "tk" + arifama_miniafia = "aai" + arifama_miniafia__latn = "aai_Latn" + ankave = "aak" + ankave__latn = "aak_Latn" + abau = "aau" + abau__latn = "aau_Latn" + amarasi = "aaz" + amarasi__latn = "aaz_Latn" + abe = "aba" + abe__latn = "aba_Latn" + abidji = "abi" + abidji__latn = "abi_Latn" + abkhazian = "abk" + abkhazian__cyrl = "abk_Cyrl" + abua = "abn" + abua__latn = "abn_Latn" + abaza = "abq" + abaza__cyrl = "abq_Cyrl" + ambonese_malay = "abs" + ambonese_malay__latn = "abs_Latn" + ambulas = "abt" + ambulas__latn = "abt_Latn" + inabaknon = "abx" + inabaknon__latn = "abx_Latn" + aneme_wake = "aby" + aneme_wake__latn = "aby_Latn" + abui = "abz" + abui__latn = "abz_Latn" + achagua = "aca" + achagua__latn = "aca_Latn" + gikyode = "acd" + gikyode__latn = "acd_Latn" + achinese = "ace" + achinese__latn = "ace_Latn" + achinese__arab = "ace_Arab" + saint_lucian_creole_french = "acf" + saint_lucian_creole_french__latn = "acf_Latn" + acoli = "ach" + acoli__latn = "ach_Latn" + mesopotamian_arabic = "acm" + mesopotamian_arabic__arab = "acm_Arab" + achang = "acn" + achang__latn = "acn_Latn" + achi = "acr" + achi__latn = "acr_Latn" + achuar_shiwiar = "acu" + achuar_shiwiar__latn = "acu_Latn" + adangme = "ada" + adangme__latn = "ada_Latn" + adele = "ade" + adele__latn 
= "ade_Latn" + adhola = "adh" + adhola__latn = "adh_Latn" + adi = "adi" + adi__latn = "adi_Latn" + adioukrou = "adj" + adioukrou__latn = "adj_Latn" + galo = "adl" + galo__latn = "adl_Latn" + adyghe = "ady" + adyghe__cyrl = "ady_Cyrl" + adzera = "adz" + adzera__latn = "adz_Latn" + tunisian_arabic = "aeb" + tunisian_arabic__arab = "aeb_Arab" + eastern_arrernte = "aer" + eastern_arrernte__latn = "aer_Latn" + akeu = "aeu" + akeu__latn = "aeu_Latn" + amele = "aey" + amele__latn = "aey_Latn" + afrikaans = "afr" + afrikaans__latn = "afr_Latn" + agarabi = "agd" + agarabi__latn = "agd_Latn" + angor = "agg" + angor__latn = "agg_Latn" + angaataha = "agm" + angaataha__latn = "agm_Latn" + agutaynen = "agn" + agutaynen__latn = "agn_Latn" + aguaruna = "agr" + aguaruna__latn = "agr_Latn" + central_cagayan_agta = "agt" + central_cagayan_agta__latn = "agt_Latn" + aguacateco = "agu" + aguacateco__latn = "agu_Latn" + kahua = "agw" + kahua__latn = "agw_Latn" + aghul = "agx" + aghul__cyrl = "agx_Cyrl" + ahanta = "aha" + ahanta__latn = "aha_Latn" + akha = "ahk" + akha__latn = "ahk_Latn" + arosi = "aia" + arosi__latn = "aia_Latn" + assyrian_neo_aramaic = "aii" + assyrian_neo_aramaic__syrc = "aii_Syrc" + aimol = "aim" + aimol__latn = "aim_Latn" + ainu_japan = "ain" + ainu_japan__latn = "ain_Latn" + aja_benin = "ajg" + aja_benin__latn = "ajg_Latn" + ajie = "aji" + ajie__latn = "aji_Latn" + amri_karbi = "ajz" + amri_karbi__latn = "ajz_Latn" + akan = "aka" + akan__latn = "aka_Latn" + batak_angkola = "akb" + batak_angkola__latn = "akb_Latn" + akawaio = "ake" + akawaio__latn = "ake_Latn" + angal_heneng = "akh" + angal_heneng__latn = "akh_Latn" + siwu = "akp" + siwu__latn = "akp_Latn" + alladian = "ald" + alladian__latn = "ald_Latn" + alangan = "alj" + alangan__latn = "alj_Latn" + gheg_albanian = "aln" + gheg_albanian__latn = "aln_Latn" + alune = "alp" + alune__latn = "alp_Latn" + algonquin = "alq" + algonquin__latn = "alq_Latn" + tosk_albanian = "als" + tosk_albanian__latn = "als_Latn" + 
southern_altai = "alt" + southern_altai__cyrl = "alt_Cyrl" + alyawarr = "aly" + alyawarr__latn = "aly_Latn" + alur = "alz" + alur__latn = "alz_Latn" + yanesha_ = "ame" + yanesha___latn = "ame_Latn" + hamer_banna = "amf" + hamer_banna__latn = "amf_Latn" + amharic = "amh" + amharic__ethi = "amh_Ethi" + amis = "ami" + amis__latn = "ami_Latn" + ambai = "amk" + ambai__latn = "amk_Latn" + ama_papua_new_guinea = "amm" + ama_papua_new_guinea__latn = "amm_Latn" + amanab = "amn" + amanab__latn = "amn_Latn" + alamblak = "amp" + alamblak__latn = "amp_Latn" + amarakaeri = "amr" + amarakaeri__latn = "amr_Latn" + guerrero_amuzgo = "amu" + guerrero_amuzgo__latn = "amu_Latn" + anmatyerre = "amx" + anmatyerre__latn = "amx_Latn" + old_english_ca_450_1100 = "ang" + old_english_ca_450_1100__latn = "ang_Latn" + anal = "anm" + anal__latn = "anm_Latn" + obolo = "ann" + obolo__latn = "ann_Latn" + angika = "anp" + angika__deva = "anp_Deva" + denya = "anv" + denya__latn = "anv_Latn" + anyin = "any" + anyin__latn = "any_Latn" + anindilyakwa = "aoi" + anindilyakwa__latn = "aoi_Latn" + mufian = "aoj" + mufian__latn = "aoj_Latn" + omie = "aom" + omie__latn = "aom_Latn" + uab_meto = "aoz" + uab_meto__latn = "aoz_Latn" + sa_a = "apb" + sa_a__latn = "apb_Latn" + levantine_arabic = "apc" + levantine_arabic__arab = "apc_Arab" + bukiyip = "ape" + bukiyip__latn = "ape_Latn" + apinaye = "apn" + apinaye__latn = "apn_Latn" + arop_lokep = "apr" + arop_lokep__latn = "apr_Latn" + apatani = "apt" + apatani__latn = "apt_Latn" + apurina = "apu" + apurina__latn = "apu_Latn" + western_apache = "apw" + western_apache__latn = "apw_Latn" + apalai = "apy" + apalai__latn = "apy_Latn" + safeyoka = "apz" + safeyoka__latn = "apz_Latn" + akuntsu = "aqz" + akuntsu__latn = "aqz_Latn" + arabic = "ara" + arabic__arab = "ara_Arab" + arabic__latn = "ara_Latn" + standard_arabic = "arb" + standard_arabic__arab = "arb_Arab" + standard_arabic__latn = "arb_Latn" + western_arrarnta = "are" + western_arrarnta__latn = "are_Latn" + 
aragonese = "arg" + aragonese__latn = "arg_Latn" + arabela = "arl" + arabela__latn = "arl_Latn" + mapudungun = "arn" + mapudungun__latn = "arn_Latn" + arapaho = "arp" + arapaho__latn = "arp_Latn" + algerian_arabic = "arq" + algerian_arabic__arab = "arq_Arab" + karo_brazil = "arr" + karo_brazil__latn = "arr_Latn" + najdi_arabic = "ars" + najdi_arabic__arab = "ars_Arab" + moroccan_arabic = "ary" + moroccan_arabic__arab = "ary_Arab" + egyptian_arabic = "arz" + egyptian_arabic__arab = "arz_Arab" + cishingini = "asg" + cishingini__latn = "asg_Latn" + assamese = "asm" + assamese__beng = "asm_Beng" + assamese__latn = "asm_Latn" + dano = "aso" + dano__latn = "aso_Latn" + asturian = "ast" + asturian__latn = "ast_Latn" + pele_ata = "ata" + pele_ata__latn = "ata_Latn" + zaiwa = "atb" + zaiwa__latn = "atb_Latn" + ata_manobo = "atd" + ata_manobo__latn = "atd_Latn" + ivbie_north_okpela_arhe = "atg" + ivbie_north_okpela_arhe__latn = "atg_Latn" + attie = "ati" + attie__latn = "ati_Latn" + atikamekw = "atj" + atikamekw__latn = "atj_Latn" + aralle_tabulahan = "atq" + aralle_tabulahan__latn = "atq_Latn" + pamplona_atta = "att" + pamplona_atta__latn = "att_Latn" + waorani = "auc" + waorani__latn = "auc_Latn" + anuki = "aui" + anuki__latn = "aui_Latn" + awiyaana = "auy" + awiyaana__latn = "auy_Latn" + avaric = "ava" + avaric__cyrl = "ava_Cyrl" + kotava = "avk" + kotava__latn = "avk_Latn" + avatime = "avn" + avatime__latn = "avn_Latn" + au = "avt" + au__latn = "avt_Latn" + avokaya = "avu" + avokaya__latn = "avu_Latn" + awadhi = "awa" + awadhi__deva = "awa_Deva" + awa_papua_new_guinea = "awb" + awa_papua_new_guinea__latn = "awb_Latn" + aekyom = "awi" + aekyom__latn = "awi_Latn" + awara = "awx" + awara__latn = "awx_Latn" + aymara = "aym" + aymara__latn = "aym_Latn" + ayoreo = "ayo" + ayoreo__latn = "ayo_Latn" + north_mesopotamian_arabic = "ayp" + north_mesopotamian_arabic__arab = "ayp_Arab" + central_aymara = "ayr" + central_aymara__latn = "ayr_Latn" south_azerbaijani = "azb" - sorani = 
"ckb" + south_azerbaijani__arab = "azb_Arab" + azerbaijani = "aze" + azerbaijani__latn = "aze_Latn" + azerbaijani__arab = "aze_Arab" + azerbaijani__cyrl = "aze_Cyrl" + san_pedro_amuzgos_amuzgo = "azg" + san_pedro_amuzgos_amuzgo__latn = "azg_Latn" + north_azerbaijani = "azj" + north_azerbaijani__latn = "azj_Latn" + north_azerbaijani__cyrl = "azj_Cyrl" + highland_puebla_nahuatl = "azz" + highland_puebla_nahuatl__latn = "azz_Latn" + bashkir = "bak" + bashkir__cyrl = "bak_Cyrl" + baluchi = "bal" + baluchi__arab = "bal_Arab" + bambara = "bam" + bambara__latn = "bam_Latn" + balinese = "ban" + balinese__latn = "ban_Latn" + waimaha = "bao" + waimaha__latn = "bao_Latn" + bavarian = "bar" + bavarian__latn = "bar_Latn" + basa_cameroon = "bas" + basa_cameroon__latn = "bas_Latn" + vengo = "bav" + vengo__latn = "bav_Latn" + baatonum = "bba" + baatonum__latn = "bba_Latn" + barai = "bbb" + barai__latn = "bbb_Latn" + batak_toba = "bbc" + batak_toba__latn = "bbc_Latn" + ghomala_ = "bbj" + ghomala___latn = "bbj_Latn" + babanki = "bbk" + babanki__latn = "bbk_Latn" + northern_bobo_madare = "bbo" + northern_bobo_madare__latn = "bbo_Latn" + girawa = "bbr" + girawa__latn = "bbr_Latn" + southern_balochi = "bcc" + southern_balochi__arab = "bcc_Arab" + bariai = "bch" + bariai__latn = "bch_Latn" + baoule = "bci" + baoule__latn = "bci_Latn" + central_bikol = "bcl" + central_bikol__latn = "bcl_Latn" + kaluli = "bco" + kaluli__latn = "bco_Latn" + bana = "bcw" + bana__latn = "bcw_Latn" + bunama = "bdd" + bunama__latn = "bdd_Latn" + baka_south_sudan = "bdh" + baka_south_sudan__latn = "bdh_Latn" + bahnar = "bdq" + bahnar__latn = "bdq_Latn" + beaver = "bea" + beaver__latn = "bea_Latn" + benabena = "bef" + benabena__latn = "bef_Latn" + belarusian = "bel" + belarusian__cyrl = "bel_Cyrl" + bemba_zambia = "bem" + bemba_zambia__latn = "bem_Latn" + bengali = "ben" + bengali__beng = "ben_Beng" + bengali__latn = "ben_Latn" + beembe = "beq" + beembe__latn = "beq_Latn" + betawi = "bew" + betawi__latn = 
"bew_Latn" + jur_modo = "bex" + jur_modo__latn = "bex_Latn" + bafut = "bfd" + bafut__latn = "bfd_Latn" + malba_birifor = "bfo" + malba_birifor__latn = "bfo_Latn" + bawm_chin = "bgr" + bawm_chin__latn = "bgr_Latn" + tagabawa = "bgs" + tagabawa__latn = "bgs_Latn" + bughotu = "bgt" + bughotu__latn = "bgt_Latn" + banggai = "bgz" + banggai__latn = "bgz_Latn" + binandere = "bhg" + binandere__latn = "bhg_Latn" + bimin = "bhl" + bimin__latn = "bhl_Latn" + bhojpuri = "bho" + bhojpuri__deva = "bho_Deva" + bima = "bhp" + bima__latn = "bhp_Latn" + biak = "bhw" + biak__latn = "bhw_Latn" + bada_indonesia = "bhz" + bada_indonesia__latn = "bhz_Latn" + bissa = "bib" + bissa__latn = "bib_Latn" + biangai = "big" + biangai__latn = "big_Latn" + bikol = "bik" + bikol__latn = "bik_Latn" + bimoba = "bim" + bimoba__latn = "bim_Latn" + bini = "bin" + bini__latn = "bin_Latn" + bislama = "bis" + bislama__latn = "bis_Latn" + biete = "biu" + biete__latn = "biu_Latn" + southern_birifor = "biv" + southern_birifor__latn = "biv_Latn" + banjar = "bjn" + banjar__latn = "bjn_Latn" + banjar__arab = "bjn_Arab" + fanamaket = "bjp" + fanamaket__latn = "bjp_Latn" + binumarien = "bjr" + binumarien__latn = "bjr_Latn" + bedjond = "bjv" + bedjond__latn = "bjv_Latn" + binukid = "bkd" + binukid__latn = "bkd_Latn" + berik = "bkl" + berik__latn = "bkl_Latn" + bakairi = "bkq" + bakairi__latn = "bkq_Latn" + buhid = "bku" + buhid__latn = "bku_Latn" + bekwarra = "bkv" + bekwarra__latn = "bkv_Latn" + siksika = "bla" + siksika__latn = "bla_Latn" + kuwaa = "blh" + kuwaa__latn = "blh_Latn" + pa_o_karen = "blk" + pa_o_karen__mymr = "blk_Mymr" + tai_dam = "blt" + tai_dam__latn = "blt_Latn" + balangao = "blw" + balangao__latn = "blw_Latn" + balantak = "blz" + balantak__latn = "blz_Latn" + kein = "bmh" + kein__latn = "bmh_Latn" + ghayavi = "bmk" + ghayavi__latn = "bmk_Latn" + bomu = "bmq" + bomu__latn = "bmq_Latn" + muinane = "bmr" + muinane__latn = "bmr_Latn" + somba_siawari = "bmu" + somba_siawari__latn = "bmu_Latn" + bum = 
"bmv" + bum__latn = "bmv_Latn" + bontok = "bnc" + bontok__latn = "bnc_Latn" + eastern_tawbuid = "bnj" + eastern_tawbuid__latn = "bnj_Latn" + bantoanon = "bno" + bantoanon__latn = "bno_Latn" + bola = "bnp" + bola__latn = "bnp_Latn" + bora = "boa" + bora__latn = "boa_Latn" + tibetan = "bod" + tibetan__tibt = "bod_Tibt" + anjam = "boj" + anjam__latn = "boj_Latn" + berom = "bom" + berom__latn = "bom_Latn" + bine = "bon" + bine__latn = "bon_Latn" + bororo = "bor" + bororo__latn = "bor_Latn" + bosnian = "bos" + bosnian__latn = "bos_Latn" + tuwuli = "bov" + tuwuli__latn = "bov_Latn" + buamu = "box" + buamu__latn = "box_Latn" + koronadal_blaan = "bpr" + koronadal_blaan__latn = "bpr_Latn" + sarangani_blaan = "bps" + sarangani_blaan__latn = "bps_Latn" + bishnupriya = "bpy" + bishnupriya__beng = "bpy_Beng" + boko_benin = "bqc" + boko_benin__latn = "bqc_Latn" + bandial = "bqj" + bandial__latn = "bqj_Latn" + busa = "bqp" + busa__latn = "bqp_Latn" + breton = "bre" + breton__latn = "bre_Latn" + brahui = "brh" + brahui__arab = "brh_Arab" + eastern_bru = "bru" + eastern_bru__latn = "bru_Latn" + bodo_india = "brx" + bodo_india__latn = "brx_Latn" + bodo_india__deva = "brx_Deva" + bassari = "bsc" + bassari__latn = "bsc_Latn" + barasana_eduria = "bsn" + barasana_eduria__latn = "bsn_Latn" + baga_sitemu = "bsp" + baga_sitemu__latn = "bsp_Latn" + bassa = "bsq" + bassa__latn = "bsq_Latn" + akoose = "bss" + akoose__latn = "bss_Latn" + batak_dairi = "btd" + batak_dairi__latn = "btd_Latn" + biatah_bidayuh = "bth" + biatah_bidayuh__latn = "bth_Latn" + batak_simalungun = "bts" + batak_simalungun__latn = "bts_Latn" + bete_bendi = "btt" + bete_bendi__latn = "btt_Latn" + batak_karo = "btx" + batak_karo__latn = "btx_Latn" + buriat = "bua" + buriat__cyrl = "bua_Cyrl" + ntcham = "bud" + ntcham__latn = "bud_Latn" + buginese = "bug" + buginese__latn = "bug_Latn" + bugawac = "buk" + bugawac__latn = "buk_Latn" + bulgarian = "bul" + bulgarian__cyrl = "bul_Cyrl" + bulu_cameroon = "bum" + 
bulu_cameroon__latn = "bum_Latn" + bokobaru = "bus" + bokobaru__latn = "bus_Latn" + baelelea = "bvc" + baelelea__latn = "bvc_Latn" + baeggu = "bvd" + baeggu__latn = "bvd_Latn" + burarra = "bvr" + burarra__latn = "bvr_Latn" + bauzi = "bvz" + bauzi__latn = "bvz_Latn" + bwaidoka = "bwd" + bwaidoka__latn = "bwd_Latn" + baniwa = "bwi" + baniwa__latn = "bwi_Latn" + southern_bobo_madare = "bwq" + southern_bobo_madare__latn = "bwq_Latn" + buli_ghana = "bwu" + buli_ghana__latn = "bwu_Latn" + buhutu = "bxh" + buhutu__latn = "bxh_Latn" + russia_buriat = "bxr" + russia_buriat__cyrl = "bxr_Cyrl" + baruya = "byr" + baruya__latn = "byr_Latn" + medumba = "byv" + medumba__latn = "byv_Latn" + qaqet = "byx" + qaqet__latn = "byx_Latn" + bribri = "bzd" + bribri__latn = "bzd_Latn" + mapos_buang = "bzh" + mapos_buang__latn = "bzh_Latn" + bisu = "bzi" + bisu__thai = "bzi_Thai" + belize_kriol_english = "bzj" + belize_kriol_english__latn = "bzj_Latn" + chorti = "caa" + chorti__latn = "caa_Latn" + garifuna = "cab" + garifuna__latn = "cab_Latn" + chuj = "cac" + chuj__latn = "cac_Latn" + southern_carrier = "caf" + southern_carrier__latn = "caf_Latn" + nivacle = "cag" + nivacle__latn = "cag_Latn" + kaqchikel = "cak" + kaqchikel__latn = "cak_Latn" + chacobo = "cao" + chacobo__latn = "cao_Latn" + chipaya = "cap" + chipaya__latn = "cap_Latn" + car_nicobarese = "caq" + car_nicobarese__latn = "caq_Latn" + galibi_carib = "car" + galibi_carib__latn = "car_Latn" + tsimane = "cas" + tsimane__latn = "cas_Latn" + catalan = "cat" + catalan__latn = "cat_Latn" + cavinena = "cav" + cavinena__latn = "cav_Latn" + chiquitano = "cax" + chiquitano__latn = "cax_Latn" + carapana = "cbc" + carapana__latn = "cbc_Latn" + chachi = "cbi" + chachi__latn = "cbi_Latn" + chavacano = "cbk" + chavacano__latn = "cbk_Latn" + cashibo_cacataibo = "cbr" + cashibo_cacataibo__latn = "cbr_Latn" + cashinahua = "cbs" + cashinahua__latn = "cbs_Latn" + chayahuita = "cbt" + chayahuita__latn = "cbt_Latn" + candoshi_shapra = "cbu" + 
candoshi_shapra__latn = "cbu_Latn" + cacua = "cbv" + cacua__latn = "cbv_Latn" + chopi = "cce" + chopi__latn = "cce_Latn" + comaltepec_chinantec = "cco" + comaltepec_chinantec__latn = "cco_Latn" + chakma = "ccp" + chakma__latn = "ccp_Latn" + chiru = "cdf" + chiru__latn = "cdf_Latn" cebuano = "ceb" - war = "war" + cebuano__latn = "ceb_Latn" + chamacoco = "ceg" + chamacoco__latn = "ceg_Latn" + eastern_khumi_chin = "cek" + eastern_khumi_chin__latn = "cek_Latn" + czech = "ces" + czech__latn = "ces_Latn" + falam_chin = "cfm" + falam_chin__latn = "cfm_Latn" + kagayanen = "cgc" + kagayanen__latn = "cgc_Latn" + chiga = "cgg" + chiga__latn = "cgg_Latn" + chamorro = "cha" + chamorro__latn = "cha_Latn" + highland_oaxaca_chontal = "chd" + highland_oaxaca_chontal__latn = "chd_Latn" + chechen = "che" + chechen__cyrl = "che_Cyrl" + tabasco_chontal = "chf" + tabasco_chontal__latn = "chf_Latn" + ojitlan_chinantec = "chj" + ojitlan_chinantec__latn = "chj_Latn" + chuukese = "chk" + chuukese__latn = "chk_Latn" + mari_russia = "chm" + mari_russia__cyrl = "chm_Cyrl" + choctaw = "cho" + choctaw__latn = "cho_Latn" + quiotepec_chinantec = "chq" + quiotepec_chinantec__latn = "chq_Latn" + cherokee = "chr" + cherokee__cher = "chr_Cher" + cherokee__latn = "chr_Latn" + church_slavic = "chu" + church_slavic__cyrl = "chu_Cyrl" + chuvash = "chv" + chuvash__cyrl = "chv_Cyrl" + chuwabu = "chw" + chuwabu__latn = "chw_Latn" + ozumacin_chinantec = "chz" + ozumacin_chinantec__latn = "chz_Latn" + chokwe = "cjk" + chokwe__latn = "cjk_Latn" + asheninka_pajonal = "cjo" + asheninka_pajonal__latn = "cjo_Latn" + cabecar = "cjp" + cabecar__latn = "cjp_Latn" + shor = "cjs" + shor__cyrl = "cjs_Cyrl" + chuave = "cjv" + chuave__latn = "cjv_Latn" + central_kurdish = "ckb" + central_kurdish__arab = "ckb_Arab" + chakavian = "ckm" + chakavian__latn = "ckm_Latn" + anufo = "cko" + anufo__latn = "cko_Latn" + chukot = "ckt" + chukot__cyrl = "ckt_Cyrl" + lealao_chinantec = "cle" + lealao_chinantec__latn = "cle_Latn" + 
caluyanun = "clu" + caluyanun__latn = "clu_Latn" + eastern_highland_chatino = "cly" + eastern_highland_chatino__latn = "cly_Latn" + cerma = "cme" + cerma__latn = "cme_Latn" + mandarin_chinese = "cmn" + mandarin_chinese__hani = "cmn_Hani" + central_mnong = "cmo" + central_mnong__latn = "cmo_Latn" + central_mnong__khmr = "cmo_Khmr" + mro_khimi_chin = "cmr" + mro_khimi_chin__latn = "cmr_Latn" + hakha_chin = "cnh" + hakha_chin__latn = "cnh_Latn" + ashaninka = "cni" + ashaninka__latn = "cni_Latn" + khumi_chin = "cnk" + khumi_chin__latn = "cnk_Latn" + lalana_chinantec = "cnl" + lalana_chinantec__latn = "cnl_Latn" + montenegrin = "cnr" + montenegrin__latn = "cnr_Latn" + tepetotutla_chinantec = "cnt" + tepetotutla_chinantec__latn = "cnt_Latn" + ngawn_chin = "cnw" + ngawn_chin__latn = "cnw_Latn" + koreguaje = "coe" + koreguaje__latn = "coe_Latn" + colorado = "cof" + colorado__latn = "cof_Latn" + santa_teresa_cora = "cok" + santa_teresa_cora__latn = "cok_Latn" + cofan = "con" + cofan__latn = "con_Latn" + coptic = "cop" + coptic__copt = "cop_Copt" + cornish = "cor" + cornish__latn = "cor_Latn" + corsican = "cos" + corsican__latn = "cos_Latn" + caquinte = "cot" + caquinte__latn = "cot_Latn" + wamey = "cou" + wamey__latn = "cou_Latn" + palantla_chinantec = "cpa" + palantla_chinantec__latn = "cpa_Latn" + ucayali_yurua_asheninka = "cpb" + ucayali_yurua_asheninka__latn = "cpb_Latn" + ajyininka_apurucayali = "cpc" + ajyininka_apurucayali__latn = "cpc_Latn" + pichis_asheninka = "cpu" + pichis_asheninka__latn = "cpu_Latn" + south_ucayali_asheninka = "cpy" + south_ucayali_asheninka__latn = "cpy_Latn" + cree = "cre" + cree__cans = "cre_Cans" + cree__latn = "cre_Latn" + crimean_tatar = "crh" + crimean_tatar__latn = "crh_Latn" + crimean_tatar__cyrl = "crh_Cyrl" + saotomense = "cri" + saotomense__latn = "cri_Latn" + southern_east_cree = "crj" + southern_east_cree__cans = "crj_Cans" + plains_cree = "crk" + plains_cree__latn = "crk_Latn" + plains_cree__cans = "crk_Cans" + northern_east_cree 
= "crl" + northern_east_cree__cans = "crl_Cans" + moose_cree = "crm" + moose_cree__cans = "crm_Cans" + el_nayar_cora = "crn" + el_nayar_cora__latn = "crn_Latn" + seselwa_creole_french = "crs" + seselwa_creole_french__latn = "crs_Latn" + iyojwa_ja_chorote = "crt" + iyojwa_ja_chorote__latn = "crt_Latn" + carrier = "crx" + carrier__latn = "crx_Latn" + kashubian = "csb" + kashubian__latn = "csb_Latn" + jola_kasa = "csk" + jola_kasa__latn = "csk_Latn" + sochiapam_chinantec = "cso" + sochiapam_chinantec__latn = "cso_Latn" + swampy_cree = "csw" + swampy_cree__latn = "csw_Latn" + siyin_chin = "csy" + siyin_chin__latn = "csy_Latn" + tataltepec_chatino = "cta" + tataltepec_chatino__latn = "cta_Latn" + tedim_chin = "ctd" + tedim_chin__latn = "ctd_Latn" + embera_catio = "cto" + embera_catio__latn = "cto_Latn" + western_highland_chatino = "ctp" + western_highland_chatino__latn = "ctp_Latn" + chol = "ctu" + chol__latn = "ctu_Latn" + cubeo = "cub" + cubeo__latn = "cub_Latn" + usila_chinantec = "cuc" + usila_chinantec__latn = "cuc_Latn" + cuiba = "cui" + cuiba__latn = "cui_Latn" + san_blas_kuna = "cuk" + san_blas_kuna__latn = "cuk_Latn" + culina = "cul" + culina__latn = "cul_Latn" + teutila_cuicatec = "cut" + teutila_cuicatec__latn = "cut_Latn" + tepeuxila_cuicatec = "cux" + tepeuxila_cuicatec__latn = "cux_Latn" + woods_cree = "cwd" + woods_cree__cans = "cwd_Cans" + kwere = "cwe" + kwere__latn = "cwe_Latn" + kuwaataay = "cwt" + kuwaataay__latn = "cwt_Latn" + nopala_chatino = "cya" + nopala_chatino__latn = "cya_Latn" + welsh = "cym" + welsh__latn = "cym_Latn" + zotung_chin = "czt" + zotung_chin__latn = "czt_Latn" + dangaleat = "daa" + dangaleat__latn = "daa_Latn" + marik = "dad" + marik__latn = "dad_Latn" + dagbani = "dag" + dagbani__latn = "dag_Latn" + gwahatike = "dah" + gwahatike__latn = "dah_Latn" + dakota = "dak" + dakota__latn = "dak_Latn" + danish = "dan" + danish__latn = "dan_Latn" + dargwa = "dar" + dargwa__cyrl = "dar_Cyrl" + daba = "dbq" + daba__latn = "dbq_Latn" + 
fataluku = "ddg" + fataluku__latn = "ddg_Latn" + dendi_benin = "ddn" + dendi_benin__latn = "ddn_Latn" + dedua = "ded" + dedua__latn = "ded_Latn" + desano = "des" + desano__latn = "des_Latn" + german = "deu" + german__latn = "deu_Latn" + southern_dagaare = "dga" + southern_dagaare__latn = "dga_Latn" + casiguran_dumagat_agta = "dgc" + casiguran_dumagat_agta__latn = "dgc_Latn" + northern_dagara = "dgi" + northern_dagara__latn = "dgi_Latn" + tlicho = "dgr" + tlicho__latn = "dgr_Latn" + daga = "dgz" + daga__latn = "dgz_Latn" + dhangu_djangu = "dhg" + dhangu_djangu__latn = "dhg_Latn" + zemba = "dhm" + zemba__latn = "dhm_Latn" + dehu = "dhv" + dehu__latn = "dhv_Latn" + didinga = "did" + didinga__latn = "did_Latn" + digo = "dig" + digo__latn = "dig_Latn" + southwestern_dinka = "dik" + southwestern_dinka__latn = "dik_Latn" + dinka = "din" + dinka__latn = "din_Latn" + northeastern_dinka = "dip" + northeastern_dinka__latn = "dip_Latn" + dimli_individual_language = "diq" + dimli_individual_language__latn = "diq_Latn" + dimasa = "dis" + dimasa__latn = "dis_Latn" + diriku = "diu" + diriku__latn = "diu_Latn" + dhivehi = "div" + dhivehi__thaa = "div_Thaa" + zarma = "dje" + zarma__latn = "dje_Latn" + eastern_maroon_creole = "djk" + eastern_maroon_creole__latn = "djk_Latn" + djambarrpuyngu = "djr" + djambarrpuyngu__latn = "djr_Latn" + southeastern_dinka = "dks" + southeastern_dinka__latn = "dks_Latn" + darlong = "dln" + darlong__latn = "dln_Latn" + dungan = "dng" + dungan__cyrl = "dng_Cyrl" + dan = "dnj" + dan__latn = "dnj_Latn" + western_dani = "dnw" + western_dani__latn = "dnw_Latn" + dobu = "dob" + dobu__latn = "dob_Latn" + dogri_macrolanguage = "doi" + dogri_macrolanguage__deva = "doi_Deva" + lukpa = "dop" + lukpa__latn = "dop_Latn" + dogose = "dos" + dogose__latn = "dos_Latn" + doyayo = "dow" + doyayo__latn = "dow_Latn" + rungus = "drg" + rungus__latn = "drg_Latn" + rukai = "dru" + rukai__latn = "dru_Latn" + lower_sorbian = "dsb" + lower_sorbian__latn = "dsb_Latn" + daasanach = 
"dsh" + daasanach__latn = "dsh_Latn" + labuk_kinabatangan_kadazan = "dtb" + labuk_kinabatangan_kadazan__latn = "dtb_Latn" + kadazan_dusun = "dtp" + kadazan_dusun__latn = "dtp_Latn" + toro_so_dogon = "dts" + toro_so_dogon__latn = "dts_Latn" + dotyali = "dty" + dotyali__deva = "dty_Deva" + duala = "dua" + duala__latn = "dua_Latn" + umiray_dumaget_agta = "due" + umiray_dumaget_agta__latn = "due_Latn" + duruma = "dug" + duruma__latn = "dug_Latn" + dupaninan_agta = "duo" + dupaninan_agta__latn = "duo_Latn" + dii = "dur" + dii__latn = "dur_Latn" + dawro = "dwr" + dawro__latn = "dwr_Latn" + dawawa = "dww" + dawawa__latn = "dww_Latn" + djimini_senoufo = "dyi" + djimini_senoufo__latn = "dyi_Latn" + jola_fonyi = "dyo" + jola_fonyi__latn = "dyo_Latn" + dyula = "dyu" + dyula__latn = "dyu_Latn" + dzongkha = "dzo" + dzongkha__tibt = "dzo_Tibt" + eastern_bontok = "ebk" + eastern_bontok__latn = "ebk_Latn" + efik = "efi" + efik__latn = "efi_Latn" + ekajuk = "eka" + ekajuk__latn = "eka_Latn" + standard_estonian = "ekk" + standard_estonian__latn = "ekk_Latn" + koti = "eko" + koti__latn = "eko_Latn" + modern_greek_1453_ = "ell" + modern_greek_1453___grek = "ell_Grek" + emerillon = "eme" + emerillon__latn = "eme_Latn" + mussau_emira = "emi" + mussau_emira__latn = "emi_Latn" + northern_embera = "emp" + northern_embera__latn = "emp_Latn" + markweeta = "enb" + markweeta__latn = "enb_Latn" + english = "eng" + english__latn = "eng_Latn" + enlhet = "enl" + enlhet__latn = "enl_Latn" + middle_english_1100_1500 = "enm" + middle_english_1100_1500__latn = "enm_Latn" + enga = "enq" + enga__latn = "enq_Latn" + enxet = "enx" + enxet__latn = "enx_Latn" + esperanto = "epo" + esperanto__latn = "epo_Latn" + ogea = "eri" + ogea__latn = "eri_Latn" + ese_ejja = "ese" + ese_ejja__latn = "ese_Latn" + north_alaskan_inupiatun = "esi" + north_alaskan_inupiatun__latn = "esi_Latn" + northwest_alaska_inupiatun = "esk" + northwest_alaska_inupiatun__latn = "esk_Latn" + central_siberian_yupik = "ess" + 
central_siberian_yupik__latn = "ess_Latn" + estonian = "est" + estonian__latn = "est_Latn" + central_yupik = "esu" + central_yupik__latn = "esu_Latn" + eton_cameroon = "eto" + eton_cameroon__latn = "eto_Latn" + edolo = "etr" + edolo__latn = "etr_Latn" + ejagham = "etu" + ejagham__latn = "etu_Latn" + basque = "eus" + basque__latn = "eus_Latn" + even = "eve" + even__cyrl = "eve_Cyrl" + ewe = "ewe" + ewe__latn = "ewe_Latn" + ewondo = "ewo" + ewondo__latn = "ewo_Latn" + extremaduran = "ext" + extremaduran__latn = "ext_Latn" + ezaa = "eza" + ezaa__latn = "eza_Latn" + fasu = "faa" + fasu__latn = "faa_Latn" + wagi = "fad" + wagi__latn = "fad_Latn" + faiwol = "fai" + faiwol__latn = "fai_Latn" + south_fali = "fal" + south_fali__latn = "fal_Latn" + fang_equatorial_guinea = "fan" + fang_equatorial_guinea__latn = "fan_Latn" + faroese = "fao" + faroese__latn = "fao_Latn" + fataleka = "far" + fataleka__latn = "far_Latn" + persian = "fas" + persian__arab = "fas_Arab" + fanti = "fat" + fanti__latn = "fat_Latn" + maasina_fulfulde = "ffm" + maasina_fulfulde__latn = "ffm_Latn" + fijian = "fij" + fijian__latn = "fij_Latn" + filipino = "fil" + filipino__latn = "fil_Latn" + finnish = "fin" + finnish__latn = "fin_Latn" + tornedalen_finnish = "fit" + tornedalen_finnish__latn = "fit_Latn" + kven_finnish = "fkv" + kven_finnish__latn = "fkv_Latn" + far_western_muria = "fmu" + far_western_muria__deva = "fmu_Deva" + fon = "fon" + fon__latn = "fon_Latn" + fore = "for" + fore__latn = "for_Latn" + french = "fra" + french__latn = "fra_Latn" + fordata = "frd" + fordata__latn = "frd_Latn" + old_french_842_ca_1400 = "fro" + old_french_842_ca_1400__latn = "fro_Latn" + arpitan = "frp" + arpitan__latn = "frp_Latn" + northern_frisian = "frr" + northern_frisian__latn = "frr_Latn" + western_frisian = "fry" + western_frisian__latn = "fry_Latn" + adamawa_fulfulde = "fub" + adamawa_fulfulde__latn = "fub_Latn" + east_futuna = "fud" + east_futuna__latn = "fud_Latn" + borgu_fulfulde = "fue" + 
borgu_fulfulde__latn = "fue_Latn" + pular = "fuf" + pular__latn = "fuf_Latn" + western_niger_fulfulde = "fuh" + western_niger_fulfulde__latn = "fuh_Latn" + fulah = "ful" + fulah__latn = "ful_Latn" + fulah__arab = "ful_Arab" + central_eastern_niger_fulfulde = "fuq" + central_eastern_niger_fulfulde__latn = "fuq_Latn" + friulian = "fur" + friulian__latn = "fur_Latn" + nigerian_fulfulde = "fuv" + nigerian_fulfulde__latn = "fuv_Latn" + nigerian_fulfulde__arab = "fuv_Arab" + ga = "gaa" + ga__latn = "gaa_Latn" + gagauz = "gag" + gagauz__latn = "gag_Latn" + gagauz__cyrl = "gag_Cyrl" + alekano = "gah" + alekano__latn = "gah_Latn" + borei = "gai" + borei__latn = "gai_Latn" + kandawo = "gam" + kandawo__latn = "gam_Latn" + nobonob = "gaw" + nobonob__latn = "gaw_Latn" + west_central_oromo = "gaz" + west_central_oromo__latn = "gaz_Latn" + gbaya_central_african_republic = "gba" + gbaya_central_african_republic__latn = "gba_Latn" + galela = "gbi" + galela__latn = "gbi_Latn" + northern_grebo = "gbo" + northern_grebo__latn = "gbo_Latn" + gbagyi = "gbr" + gbagyi__latn = "gbr_Latn" + guadeloupean_creole_french = "gcf" + guadeloupean_creole_french__latn = "gcf_Latn" + guianese_creole_french = "gcr" + guianese_creole_french__latn = "gcr_Latn" + gude = "gde" + gude__latn = "gde_Latn" + ga_dang = "gdg" + ga_dang__latn = "gdg_Latn" + umanakaina = "gdn" + umanakaina__latn = "gdn_Latn" + wipi = "gdr" + wipi__latn = "gdr_Latn" + kire = "geb" + kire__latn = "geb_Latn" + gen = "gej" + gen__latn = "gej_Latn" + patpatar = "gfk" + patpatar__latn = "gfk_Latn" + southern_ghale = "ghe" + southern_ghale__deva = "ghe_Deva" + guhu_samane = "ghs" + guhu_samane__latn = "ghs_Latn" + gidar = "gid" + gidar__latn = "gid_Latn" + gilbertese = "gil" + gilbertese__latn = "gil_Latn" + south_giziga = "giz" + south_giziga__latn = "giz_Latn" + gonja = "gjn" + gonja__latn = "gjn_Latn" + gokana = "gkn" + gokana__latn = "gkn_Latn" + scottish_gaelic = "gla" + scottish_gaelic__latn = "gla_Latn" + irish = "gle" + 
irish__latn = "gle_Latn" + galician = "glg" + galician__latn = "glg_Latn" + gilaki = "glk" + gilaki__arab = "glk_Arab" + manx = "glv" + manx__latn = "glv_Latn" + middle_high_german_ca_1050_1500 = "gmh" + middle_high_german_ca_1050_1500__latn = "gmh_Latn" + gamo = "gmv" + gamo__latn = "gmv_Latn" + gamo__ethi = "gmv_Ethi" + kaansa = "gna" + kaansa__latn = "gna_Latn" + gangte = "gnb" + gangte__latn = "gnb_Latn" + zulgo_gemzek = "gnd" + zulgo_gemzek__latn = "gnd_Latn" + ngangam = "gng" + ngangam__latn = "gng_Latn" + gumatj = "gnn" + gumatj__latn = "gnn_Latn" + western_bolivian_guarani = "gnw" + western_bolivian_guarani__latn = "gnw_Latn" + guro = "goa" + guro__latn = "goa_Latn" + gofa = "gof" + gofa__latn = "gof_Latn" + gofa__ethi = "gof_Ethi" + gogo = "gog" + gogo__latn = "gog_Latn" + old_high_german_ca_750_1050 = "goh" + old_high_german_ca_750_1050__latn = "goh_Latn" + goan_konkani = "gom" + goan_konkani__latn = "gom_Latn" + goan_konkani__deva = "gom_Deva" + gondi = "gon" + gondi__telu = "gon_Telu" + gorontalo = "gor" + gorontalo__latn = "gor_Latn" + gronings = "gos" + gronings__latn = "gos_Latn" + gothic = "got" + gothic__latn = "got_Latn" + gothic__goth = "got_Goth" + gor = "gqr" + gor__latn = "gqr_Latn" + grebo = "grb" + grebo__latn = "grb_Latn" + ancient_greek_to_1453 = "grc" + ancient_greek_to_1453__grek = "grc_Grek" + guarani = "grn" + guarani__latn = "grn_Latn" + garo = "grt" + garo__beng = "grt_Beng" + southwest_gbaya = "gso" + southwest_gbaya__latn = "gso_Latn" + swiss_german = "gsw" + swiss_german__latn = "gsw_Latn" + guajajara = "gub" + guajajara__latn = "gub_Latn" + wayuu = "guc" + wayuu__latn = "guc_Latn" + yocoboue_dida = "gud" + yocoboue_dida__latn = "gud_Latn" + paraguayan_guarani = "gug" + paraguayan_guarani__latn = "gug_Latn" + guahibo = "guh" + guahibo__latn = "guh_Latn" + eastern_bolivian_guarani = "gui" + eastern_bolivian_guarani__latn = "gui_Latn" + gujarati = "guj" + gujarati__gujr = "guj_Gujr" + gujarati__latn = "guj_Latn" + gumuz = "guk" + 
gumuz__ethi = "guk_Ethi" + sea_island_creole_english = "gul" + sea_island_creole_english__latn = "gul_Latn" + guambiano = "gum" + guambiano__latn = "gum_Latn" + mbya_guarani = "gun" + mbya_guarani__latn = "gun_Latn" + guayabero = "guo" + guayabero__latn = "guo_Latn" + ache = "guq" + ache__latn = "guq_Latn" + farefare = "gur" + farefare__latn = "gur_Latn" + yanomamo = "guu" + yanomamo__latn = "guu_Latn" + gun = "guw" + gun__latn = "guw_Latn" + gourmanchema = "gux" + gourmanchema__latn = "gux_Latn" + gusii = "guz" + gusii__latn = "guz_Latn" + guanano = "gvc" + guanano__latn = "gvc_Latn" + golin = "gvf" + golin__latn = "gvf_Latn" + gulay = "gvl" + gulay__latn = "gvl_Latn" + kuku_yalanji = "gvn" + kuku_yalanji__latn = "gvn_Latn" + gwichʼin = "gwi" + gwichʼin__latn = "gwi_Latn" + gwere = "gwr" + gwere__latn = "gwr_Latn" + northwest_gbaya = "gya" + northwest_gbaya__latn = "gya_Latn" + ngabere = "gym" + ngabere__latn = "gym_Latn" + guarayu = "gyr" + guarayu__latn = "gyr_Latn" + gurani = "hac" + gurani__arab = "hac_Arab" + eastern_oromo = "hae" + eastern_oromo__latn = "hae_Latn" + hanga = "hag" + hanga__latn = "hag_Latn" + hakka_chinese = "hak" + hakka_chinese__hani = "hak_Hani" + hakka_chinese__latn = "hak_Latn" + haitian = "hat" + haitian__latn = "hat_Latn" + hausa = "hau" + hausa__latn = "hau_Latn" + havu = "hav" + havu__latn = "hav_Latn" + hawaiian = "haw" + hawaiian__latn = "haw_Latn" + haya = "hay" + haya__latn = "hay_Latn" + ancient_hebrew = "hbo" + ancient_hebrew__hebr = "hbo_Hebr" + serbo_croatian = "hbs" + serbo_croatian__latn = "hbs_Latn" + serbo_croatian__cyrl = "hbs_Cyrl" + huichol = "hch" + huichol__latn = "hch_Latn" + hebrew = "heb" + hebrew__hebr = "heb_Hebr" + helong = "heg" + helong__latn = "heg_Latn" + hehe = "heh" + hehe__latn = "heh_Latn" + herero = "her" + herero__latn = "her_Latn" + fiji_hindi = "hif" + fiji_hindi__latn = "hif_Latn" + kamwe = "hig" + kamwe__latn = "hig_Latn" + hiligaynon = "hil" + hiligaynon__latn = "hil_Latn" + hindi = "hin" + 
hindi__deva = "hin_Deva" + hindi__latn = "hin_Latn" + hixkaryana = "hix" + hixkaryana__latn = "hix_Latn" + halia = "hla" + halia__latn = "hla_Latn" + matu_chin = "hlt" + matu_chin__latn = "hlt_Latn" + hmong = "hmn" + hmong__latn = "hmn_Latn" + hiri_motu = "hmo" + hiri_motu__latn = "hmo_Latn" + hmar = "hmr" + hmar__latn = "hmr_Latn" + chhattisgarhi = "hne" + chhattisgarhi__deva = "hne_Deva" + hmong_njua = "hnj" + hmong_njua__latn = "hnj_Latn" + hanunoo = "hnn" + hanunoo__latn = "hnn_Latn" + caribbean_hindustani = "hns" + caribbean_hindustani__latn = "hns_Latn" + ho = "hoc" + ho__latn = "hoc_Latn" + ho__wara = "hoc_Wara" + hopi = "hop" + hopi__latn = "hop_Latn" + hote = "hot" + hote__latn = "hot_Latn" + hrangkhol = "hra" + hrangkhol__latn = "hra_Latn" + croatian = "hrv" + croatian__latn = "hrv_Latn" + hunsrik = "hrx" + hunsrik__latn = "hrx_Latn" + upper_sorbian = "hsb" + upper_sorbian__latn = "hsb_Latn" + minica_huitoto = "hto" + minica_huitoto__latn = "hto_Latn" + huambisa = "hub" + huambisa__latn = "hub_Latn" + huli = "hui" + huli__latn = "hui_Latn" + hungarian = "hun" + hungarian__latn = "hun_Latn" + huastec = "hus" + huastec__latn = "hus_Latn" + murui_huitoto = "huu" + murui_huitoto__latn = "huu_Latn" + san_mateo_del_mar_huave = "huv" + san_mateo_del_mar_huave__latn = "huv_Latn" + sabu = "hvn" + sabu__latn = "hvn_Latn" + hawai_i_creole_english = "hwc" + hawai_i_creole_english__latn = "hwc_Latn" + armenian = "hye" + armenian__armn = "hye_Armn" + western_armenian = "hyw" + western_armenian__armn = "hyw_Armn" + iatmul = "ian" + iatmul__latn = "ian_Latn" + iban = "iba" + iban__latn = "iba_Latn" + ibanag = "ibg" + ibanag__latn = "ibg_Latn" + igbo = "ibo" + igbo__latn = "ibo_Latn" + islander_creole_english = "icr" + islander_creole_english__latn = "icr_Latn" + ido = "ido" + ido__latn = "ido_Latn" + idoma = "idu" + idoma__latn = "idu_Latn" + amganad_ifugao = "ifa" + amganad_ifugao__latn = "ifa_Latn" + batad_ifugao = "ifb" + batad_ifugao__latn = "ifb_Latn" + ife = "ife" 
+ ife__latn = "ife_Latn" + tuwali_ifugao = "ifk" + tuwali_ifugao__latn = "ifk_Latn" + mayoyao_ifugao = "ifu" + mayoyao_ifugao__latn = "ifu_Latn" + keley_i_kallahan = "ify" + keley_i_kallahan__latn = "ify_Latn" + igede = "ige" + igede__latn = "ige_Latn" + ignaciano = "ign" + ignaciano__latn = "ign_Latn" + eastern_canadian_inuktitut = "ike" + eastern_canadian_inuktitut__cans = "ike_Cans" + ika = "ikk" + ika__latn = "ikk_Latn" + inuinnaqtun = "ikt" + inuinnaqtun__latn = "ikt_Latn" + inuktitut = "iku" + inuktitut__cans = "iku_Cans" + inuktitut__latn = "iku_Latn" + ikwere = "ikw" + ikwere__latn = "ikw_Latn" + ila = "ilb" + ila__latn = "ilb_Latn" + interlingue = "ile" + interlingue__latn = "ile_Latn" + iloko = "ilo" + iloko__latn = "ilo_Latn" + imbongu = "imo" + imbongu__latn = "imo_Latn" + interlingua_international_auxiliary_language_association = "ina" + interlingua_international_auxiliary_language_association__latn = "ina_Latn" + inga = "inb" + inga__latn = "inb_Latn" + indonesian = "ind" + indonesian__latn = "ind_Latn" + ingush = "inh" + ingush__cyrl = "inh_Cyrl" + inoke_yate = "ino" + inoke_yate__latn = "ino_Latn" + tuma_irumu = "iou" + tuma_irumu__latn = "iou_Latn" + ipili = "ipi" + ipili__latn = "ipi_Latn" + inupiaq = "ipk" + inupiaq__latn = "ipk_Latn" + ikwo = "iqw" + ikwo__latn = "iqw_Latn" + rigwe = "iri" + rigwe__latn = "iri_Latn" + iraqw = "irk" + iraqw__latn = "irk_Latn" + iraya = "iry" + iraya__latn = "iry_Latn" + isnag = "isd" + isnag__latn = "isd_Latn" + esan = "ish" + esan__latn = "ish_Latn" + icelandic = "isl" + icelandic__latn = "isl_Latn" + isoko = "iso" + isoko__latn = "iso_Latn" + italian = "ita" + italian__latn = "ita_Latn" + itelmen = "itl" + itelmen__cyrl = "itl_Cyrl" + itawit = "itv" + itawit__latn = "itv_Latn" + iu_mien = "ium" + iu_mien__latn = "ium_Latn" + ibatan = "ivb" + ibatan__latn = "ivb_Latn" + ivatan = "ivv" + ivatan__latn = "ivv_Latn" + sepik_iwam = "iws" + sepik_iwam__latn = "iws_Latn" + ixil = "ixl" + ixil__latn = "ixl_Latn" + izere 
= "izr" + izere__latn = "izr_Latn" + izii = "izz" + izii__latn = "izz_Latn" + jamamadi = "jaa" + jamamadi__latn = "jaa_Latn" + popti_ = "jac" + popti___latn = "jac_Latn" + yabem = "jae" + yabem__latn = "jae_Latn" + jamaican_creole_english = "jam" + jamaican_creole_english__latn = "jam_Latn" + javanese = "jav" + javanese__latn = "jav_Latn" + javanese__java = "jav_Java" + lojban = "jbo" + lojban__latn = "jbo_Latn" + jukun_takum = "jbu" + jukun_takum__latn = "jbu_Latn" + tol = "jic" + tol__latn = "jic_Latn" + shuar = "jiv" + shuar__latn = "jiv_Latn" + machame = "jmc" + machame__latn = "jmc_Latn" + japanese = "jpn" + japanese__jpan = "jpn_Jpan" + jarai = "jra" + jarai__latn = "jra_Latn" + juang = "jun" + juang__orya = "jun_Orya" + caribbean_javanese = "jvn" + caribbean_javanese__latn = "jvn_Latn" + kara_kalpak = "kaa" + kara_kalpak__cyrl = "kaa_Cyrl" + kara_kalpak__latn = "kaa_Latn" + kabyle = "kab" + kabyle__latn = "kab_Latn" + kachin = "kac" + kachin__latn = "kac_Latn" + kalanguya = "kak" + kalanguya__latn = "kak_Latn" + kalaallisut = "kal" + kalaallisut__latn = "kal_Latn" + kamba_kenya = "kam" + kamba_kenya__latn = "kam_Latn" + kannada = "kan" + kannada__knda = "kan_Knda" + kannada__latn = "kan_Latn" + xaasongaxango = "kao" + xaasongaxango__latn = "kao_Latn" + bezhta = "kap" + bezhta__cyrl = "kap_Cyrl" + capanahua = "kaq" + capanahua__latn = "kaq_Latn" + kashmiri = "kas" + kashmiri__deva = "kas_Deva" + kashmiri__latn = "kas_Latn" + kashmiri__arab = "kas_Arab" + georgian = "kat" + georgian__geor = "kat_Geor" + kanuri = "kau" + kanuri__arab = "kau_Arab" + kanuri__latn = "kau_Latn" + kazakh = "kaz" + kazakh__cyrl = "kaz_Cyrl" + kadiweu = "kbc" + kadiweu__latn = "kbc_Latn" + kabardian = "kbd" + kabardian__cyrl = "kbd_Cyrl" + camsa = "kbh" + camsa__latn = "kbh_Latn" + iwal = "kbm" + iwal__latn = "kbm_Latn" + keliko = "kbo" + keliko__latn = "kbo_Latn" + kabiye = "kbp" + kabiye__latn = "kbp_Latn" + kamano = "kbq" + kamano__latn = "kbq_Latn" + kafa = "kbr" + kafa__latn = 
"kbr_Latn" + manga_kanuri = "kby" + manga_kanuri__latn = "kby_Latn" + khanty = "kca" + khanty__cyrl = "kca_Cyrl" + tyap = "kcg" + tyap__latn = "kcg_Latn" + kalanga = "kck" + kalanga__latn = "kck_Latn" + kutu = "kdc" + kutu__latn = "kdc_Latn" + makonde = "kde" + makonde__latn = "kde_Latn" + tem = "kdh" + tem__latn = "kdh_Latn" + kumam = "kdi" + kumam__latn = "kdi_Latn" + karamojong = "kdj" + karamojong__latn = "kdj_Latn" + tsikimba = "kdl" + tsikimba__latn = "kdl_Latn" + kaningdon_nindem = "kdp" + kaningdon_nindem__latn = "kdp_Latn" + karaim = "kdr" + karaim__latn = "kdr_Latn" + kabuverdianu = "kea" + kabuverdianu__latn = "kea_Latn" + kei = "kei" + kei__latn = "kei_Latn" + kekchi = "kek" + kekchi__latn = "kek_Latn" + kenyang = "ken" + kenyang__latn = "ken_Latn" + kakwa = "keo" + kakwa__latn = "keo_Latn" + kera = "ker" + kera__latn = "ker_Latn" + west_kewa = "kew" + west_kewa__latn = "kew_Latn" + kukna = "kex" + kukna__deva = "kex_Deva" + kukele = "kez" + kukele__latn = "kez_Latn" + koya = "kff" + koya__telu = "kff_Telu" + kube = "kgf" + kube__latn = "kgf_Latn" + kaiwa = "kgk" + kaiwa__latn = "kgk_Latn" + kaingang = "kgp" + kaingang__latn = "kgp_Latn" + abun = "kgr" + abun__latn = "kgr_Latn" + khasi = "kha" + khasi__latn = "kha_Latn" + halh_mongolian = "khk" + halh_mongolian__cyrl = "khk_Cyrl" + khmer = "khm" + khmer__khmr = "khm_Khmr" + koyra_chiini_songhay = "khq" + koyra_chiini_songhay__latn = "khq_Latn" + kasua = "khs" + kasua__latn = "khs_Latn" + kele_democratic_republic_of_congo = "khy" + kele_democratic_republic_of_congo__latn = "khy_Latn" + keapara = "khz" + keapara__latn = "khz_Latn" + kim = "kia" + kim__latn = "kia_Latn" + kilivila = "kij" + kilivila__latn = "kij_Latn" + kikuyu = "kik" + kikuyu__latn = "kik_Latn" + kinyarwanda = "kin" + kinyarwanda__latn = "kin_Latn" + kirghiz = "kir" + kirghiz__cyrl = "kir_Cyrl" + kirmanjki_individual_language = "kiu" + kirmanjki_individual_language__latn = "kiu_Latn" + khiamniungan_naga = "kix" + khiamniungan_naga__latn = 
"kix_Latn" + q_anjob_al = "kjb" + q_anjob_al__latn = "kjb_Latn" + kisar = "kje" + kisar__latn = "kje_Latn" + khakas = "kjh" + khakas__cyrl = "kjh_Cyrl" + east_kewa = "kjs" + east_kewa__latn = "kjs_Latn" + odoodee = "kkc" + odoodee__latn = "kkc_Latn" + kagulu = "kki" + kagulu__latn = "kki_Latn" + kako = "kkj" + kako__latn = "kkj_Latn" + kosarek_yale = "kkl" + kosarek_yale__latn = "kkl_Latn" + kulung_nepal = "kle" + kulung_nepal__deva = "kle_Deva" + kalenjin = "kln" + kalenjin__latn = "kln_Latn" + nukna = "klt" + nukna__latn = "klt_Latn" + maskelynes = "klv" + maskelynes__latn = "klv_Latn" + konni = "kma" + konni__latn = "kma_Latn" + kimbundu = "kmb" + kimbundu__latn = "kmb_Latn" + majukayang_kalinga = "kmd" + majukayang_kalinga__latn = "kmd_Latn" + kate = "kmg" + kate__latn = "kmg_Latn" + kalam = "kmh" + kalam__latn = "kmh_Latn" + limos_kalinga = "kmk" + limos_kalinga__latn = "kmk_Latn" + kom_india = "kmm" + kom_india__latn = "kmm_Latn" + kwoma = "kmo" + kwoma__latn = "kmo_Latn" + northern_kurdish = "kmr" + northern_kurdish__latn = "kmr_Latn" + northern_kurdish__cyrl = "kmr_Cyrl" + kamasau = "kms" + kamasau__latn = "kms_Latn" + kanite = "kmu" + kanite__latn = "kmu_Latn" + koma = "kmy" + koma__latn = "kmy_Latn" + central_kanuri = "knc" + central_kanuri__arab = "knc_Arab" + central_kanuri__latn = "knc_Latn" + kankanaey = "kne" + kankanaey__latn = "kne_Latn" + mankanya = "knf" + mankanya__latn = "knf_Latn" + koongo = "kng" + koongo__latn = "kng_Latn" + western_kanjobal = "knj" + western_kanjobal__latn = "knj_Latn" + kuranko = "knk" + kuranko__latn = "knk_Latn" + kono_sierra_leone = "kno" + kono_sierra_leone__latn = "kno_Latn" + tabo = "knv" + tabo__latn = "knv_Latn" + kendayan = "knx" + kendayan__latn = "knx_Latn" + kanyok = "kny" + kanyok__latn = "kny_Latn" + cogui = "kog" + cogui__latn = "kog_Latn" + komi_permyak = "koi" + komi_permyak__cyrl = "koi_Cyrl" + konkani_macrolanguage__latn = "kok_Latn" + konkani_macrolanguage = "kok" + konkani_macrolanguage__deva = 
"kok_Deva" + komi = "kom" + komi__cyrl = "kom_Cyrl" + kongo = "kon" + kongo__latn = "kon_Latn" + konzo = "koo" + konzo__latn = "koo_Latn" + korean = "kor" + korean__hang = "kor_Hang" + kosraean = "kos" + kosraean__latn = "kos_Latn" + kpelle = "kpe" + kpelle__latn = "kpe_Latn" + komba = "kpf" + komba__latn = "kpf_Latn" + kapingamarangi = "kpg" + kapingamarangi__latn = "kpg_Latn" + karaja = "kpj" + karaja__latn = "kpj_Latn" + korupun_sela = "kpq" + korupun_sela__latn = "kpq_Latn" + korafe_yegha = "kpr" + korafe_yegha__latn = "kpr_Latn" + komi_zyrian = "kpv" + komi_zyrian__cyrl = "kpv_Cyrl" + kobon = "kpw" + kobon__latn = "kpw_Latn" + mountain_koiali = "kpx" + mountain_koiali__latn = "kpx_Latn" + kupsabiny = "kpz" + kupsabiny__latn = "kpz_Latn" + mum = "kqa" + mum__latn = "kqa_Latn" + doromu_koki = "kqc" + doromu_koki__latn = "kqc_Latn" + kalagan = "kqe" + kalagan__latn = "kqe_Latn" + kakabai = "kqf" + kakabai__latn = "kqf_Latn" + kyenele = "kql" + kyenele__latn = "kql_Latn" + kaonde = "kqn" + kaonde__latn = "kqn_Latn" + eastern_krahn = "kqo" + eastern_krahn__latn = "kqo_Latn" + kimre = "kqp" + kimre__latn = "kqp_Latn" + northern_kissi = "kqs" + northern_kissi__latn = "kqs_Latn" + kandas = "kqw" + kandas__latn = "kqw_Latn" + koorete = "kqy" + koorete__ethi = "kqy_Ethi" + karachay_balkar = "krc" + karachay_balkar__cyrl = "krc_Cyrl" + krio = "kri" + krio__latn = "kri_Latn" + kinaray_a = "krj" + kinaray_a__latn = "krj_Latn" + karelian = "krl" + karelian__latn = "krl_Latn" + kurukh = "kru" + kurukh__deva = "kru_Deva" + karon = "krx" + karon__latn = "krx_Latn" + shambala = "ksb" + shambala__latn = "ksb_Latn" + southern_kalinga = "ksc" + southern_kalinga__latn = "ksc_Latn" + kuanua = "ksd" + kuanua__latn = "ksd_Latn" + bafia = "ksf" + bafia__latn = "ksf_Latn" + kolsch = "ksh" + kolsch__latn = "ksh_Latn" + uare = "ksj" + uare__latn = "ksj_Latn" + kaba = "ksp" + kaba__latn = "ksp_Latn" + borong = "ksr" + borong__latn = "ksr_Latn" + southern_kisi = "kss" + southern_kisi__latn 
= "kss_Latn" + s_gaw_karen = "ksw" + s_gaw_karen__mymr = "ksw_Mymr" + kambaata = "ktb" + kambaata__ethi = "ktb_Ethi" + plapo_krumen = "ktj" + plapo_krumen__latn = "ktj_Latn" + kurti = "ktm" + kurti__latn = "ktm_Latn" + kuot = "kto" + kuot__latn = "kto_Latn" + kituba_democratic_republic_of_congo = "ktu" + kituba_democratic_republic_of_congo__latn = "ktu_Latn" + juǀʼhoan = "ktz" + juǀʼhoan__latn = "ktz_Latn" + kuanyama = "kua" + kuanyama__latn = "kua_Latn" + kutep = "kub" + kutep__latn = "kub_Latn" + auhelawa = "kud" + auhelawa__latn = "kud_Latn" + kuman_papua_new_guinea = "kue" + kuman_papua_new_guinea__latn = "kue_Latn" + kuria = "kuj" + kuria__latn = "kuj_Latn" + kumyk = "kum" + kumyk__cyrl = "kum_Cyrl" + kunimaipa = "kup" + kunimaipa__latn = "kup_Latn" + kurdish = "kur" + kurdish__arab = "kur_Arab" + kurdish__latn = "kur_Latn" + kurdish__cyrl = "kur_Cyrl" + kusaal = "kus" + kusaal__latn = "kus_Latn" + kuni_boazi = "kvg" + kuni_boazi__latn = "kvg_Latn" + psikye = "kvj" + psikye__latn = "kvj_Latn" + border_kuna = "kvn" + border_kuna__latn = "kvn_Latn" + kwaio = "kwd" + kwaio__latn = "kwd_Latn" + kwara_ae = "kwf" + kwara_ae__latn = "kwf_Latn" + awa_cuaiquer = "kwi" + awa_cuaiquer__latn = "kwi_Latn" + kwanga = "kwj" + kwanga__latn = "kwj_Latn" + kwangali = "kwn" + kwangali__latn = "kwn_Latn" + san_salvador_kongo = "kwy" + san_salvador_kongo__latn = "kwy_Latn" + konso = "kxc" + konso__ethi = "kxc_Ethi" + northern_khmer = "kxm" + northern_khmer__thai = "kxm_Thai" + konai = "kxw" + konai__latn = "kxw_Latn" + kyaka = "kyc" + kyaka__latn = "kyc_Latn" + kouya = "kyf" + kouya__latn = "kyf_Latn" + keyagana = "kyg" + keyagana__latn = "kyg_Latn" + kenga = "kyq" + kenga__latn = "kyq_Latn" + western_kayah = "kyu" + western_kayah__latn = "kyu_Latn" + western_kayah__kali = "kyu_Kali" + western_kayah__mymr = "kyu_Mymr" + kayabi = "kyz" + kayabi__latn = "kyz_Latn" + kosena = "kze" + kosena__latn = "kze_Latn" + da_a_kaili = "kzf" + da_a_kaili__latn = "kzf_Latn" + kokola = "kzn" + 
kokola__latn = "kzn_Latn" + lacandon = "lac" + lacandon__latn = "lac_Latn" + ladino = "lad" + ladino__latn = "lad_Latn" + ladino__hebr = "lad_Hebr" + lahnda = "lah" + lahnda__arab = "lah_Arab" + lambya = "lai" + lambya__latn = "lai_Latn" + lango_uganda = "laj" + lango_uganda__latn = "laj_Latn" + lamba = "lam" + lamba__latn = "lam_Latn" + lao = "lao" + lao__laoo = "lao_Laoo" + laka_chad = "lap" + laka_chad__latn = "lap_Latn" + lama_togo = "las" + lama_togo__latn = "las_Latn" + latin = "lat" + latin__latn = "lat_Latn" + latvian = "lav" + latvian__latn = "lav_Latn" + lauje = "law" + lauje__latn = "law_Latn" + label = "lbb" + label__latn = "lbb_Latn" + lak = "lbe" + lak__cyrl = "lbe_Cyrl" + ladakhi = "lbj" + ladakhi__tibt = "lbj_Tibt" + central_bontok = "lbk" + central_bontok__latn = "lbk_Latn" + tungag = "lcm" + tungag__latn = "lcm_Latn" + western_lawa = "lcp" + western_lawa__thai = "lcp_Thai" + laari = "ldi" + laari__latn = "ldi_Latn" + laadan = "ldn" + laadan__latn = "ldn_Latn" + lega_shabunda = "lea" + lega_shabunda__latn = "lea_Latn" + lendu = "led" + lendu__latn = "led_Latn" + lyele = "lee" + lyele__latn = "lee_Latn" + lelemi = "lef" + lelemi__latn = "lef_Latn" + lenje = "leh" + lenje__latn = "leh_Latn" + nomaande = "lem" + nomaande__latn = "lem_Latn" + kara_papua_new_guinea = "leu" + kara_papua_new_guinea__latn = "leu_Latn" + ledo_kaili = "lew" + ledo_kaili__latn = "lew_Latn" + luang = "lex" + luang__latn = "lex_Latn" + lezghian = "lez" + lezghian__cyrl = "lez_Cyrl" + lingua_franca_nova = "lfn" + lingua_franca_nova__latn = "lfn_Latn" + lingua_franca_nova__cyrl = "lfn_Cyrl" + lugbara = "lgg" + lugbara__latn = "lgg_Latn" + wala = "lgl" + wala__latn = "lgl_Latn" + lega_mwenga = "lgm" + lega_mwenga__latn = "lgm_Latn" + lahu_shi = "lhi" + lahu_shi__latn = "lhi_Latn" + lahu = "lhu" + lahu__latn = "lhu_Latn" + west_central_limba = "lia" + west_central_limba__latn = "lia_Latn" + nyindrou = "lid" + nyindrou__latn = "lid_Latn" + limbu = "lif" + limbu__deva = "lif_Deva" + 
limbu__limb = "lif_Limb" + ligurian = "lij" + ligurian__latn = "lij_Latn" + limburgan = "lim" + limburgan__latn = "lim_Latn" + lingala = "lin" + lingala__latn = "lin_Latn" + sekpele = "lip" + sekpele__latn = "lip_Latn" + lisu = "lis" + lisu__lisu = "lis_Lisu" + lithuanian = "lit" + lithuanian__latn = "lit_Latn" + liv = "liv" + liv__latn = "liv_Latn" + lampung_api = "ljp" + lampung_api__latn = "ljp_Latn" + laki = "lki" + laki__arab = "lki_Arab" + lolo = "llb" + lolo__latn = "llb_Latn" + ladin = "lld" + ladin__latn = "lld_Latn" + lole = "llg" + lole__latn = "llg_Latn" + lele_chad = "lln" + lele_chad__latn = "lln_Latn" + lamkang = "lmk" + lamkang__latn = "lmk_Latn" + lombard = "lmo" + lombard__latn = "lmo_Latn" + limbum = "lmp" + limbum__latn = "lmp_Latn" + lundayeh = "lnd" + lundayeh__latn = "lnd_Latn" + lobi = "lob" + lobi__latn = "lob_Latn" + saluan = "loe" + saluan__latn = "loe_Latn" + logo = "log" + logo__latn = "log_Latn" + loko = "lok" + loko__latn = "lok_Latn" + mongo = "lol" + mongo__latn = "lol_Latn" + loma_liberia = "lom" + loma_liberia__latn = "lom_Latn" + lobala = "loq" + lobala__latn = "loq_Latn" + lozi = "loz" + lozi__latn = "loz_Latn" + northern_luri = "lrc" + northern_luri__arab = "lrc_Arab" + lashi = "lsi" + lashi__latn = "lsi_Latn" + saamia = "lsm" + saamia__latn = "lsm_Latn" + latgalian = "ltg" + latgalian__latn = "ltg_Latn" + luxembourgish = "ltz" + luxembourgish__latn = "ltz_Latn" + luba_lulua = "lua" + luba_lulua__latn = "lua_Latn" + luba_katanga = "lub" + luba_katanga__latn = "lub_Latn" + aringa = "luc" + aringa__latn = "luc_Latn" + ludian = "lud" + ludian__latn = "lud_Latn" + luvale = "lue" + luvale__latn = "lue_Latn" + ganda = "lug" + ganda__latn = "lug_Latn" + lunda = "lun" + lunda__latn = "lun_Latn" + luo_kenya_and_tanzania = "luo" + luo_kenya_and_tanzania__latn = "luo_Latn" + lushai = "lus" + lushai__latn = "lus_Latn" + luyia = "luy" + luyia__latn = "luy_Latn" + standard_latvian = "lvs" + standard_latvian__latn = "lvs_Latn" + wanga = "lwg" 
+ wanga__latn = "lwg_Latn" + luwo = "lwo" + luwo__latn = "lwo_Latn" + lewo = "lww" + lewo__latn = "lww_Latn" + literary_chinese = "lzh" + literary_chinese__hani = "lzh_Hani" + san_jeronimo_tecoatl_mazatec = "maa" + san_jeronimo_tecoatl_mazatec__latn = "maa_Latn" + madurese = "mad" + madurese__latn = "mad_Latn" + mafa = "maf" + mafa__latn = "maf_Latn" + magahi = "mag" + magahi__deva = "mag_Deva" + marshallese = "mah" + marshallese__latn = "mah_Latn" + maithili = "mai" + maithili__deva = "mai_Deva" + jalapa_de_diaz_mazatec = "maj" + jalapa_de_diaz_mazatec__latn = "maj_Latn" + makasar = "mak" + makasar__latn = "mak_Latn" + malayalam = "mal" + malayalam__mlym = "mal_Mlym" + malayalam__latn = "mal_Latn" + mam = "mam" + mam__latn = "mam_Latn" + mandingo = "man" + mandingo__latn = "man_Latn" + chiquihuitlan_mazatec = "maq" + chiquihuitlan_mazatec__latn = "maq_Latn" + marathi = "mar" + marathi__deva = "mar_Deva" + marathi__latn = "mar_Latn" + masai = "mas" + masai__latn = "mas_Latn" + huautla_mazatec = "mau" + huautla_mazatec__latn = "mau_Latn" + satere_mawe = "mav" + satere_mawe__latn = "mav_Latn" + mampruli = "maw" + mampruli__latn = "maw_Latn" + north_moluccan_malay = "max" + north_moluccan_malay__latn = "max_Latn" + central_mazahua = "maz" + central_mazahua__latn = "maz_Latn" + western_bukidnon_manobo = "mbb" + western_bukidnon_manobo__latn = "mbb_Latn" + macushi = "mbc" + macushi__latn = "mbc_Latn" + dibabawon_manobo = "mbd" + dibabawon_manobo__latn = "mbd_Latn" + baba_malay = "mbf" + baba_malay__latn = "mbf_Latn" + mangseng = "mbh" + mangseng__latn = "mbh_Latn" + ilianen_manobo = "mbi" + ilianen_manobo__latn = "mbi_Latn" + nadeb = "mbj" + nadeb__latn = "mbj_Latn" + maxakali = "mbl" + maxakali__latn = "mbl_Latn" + sarangani_manobo = "mbs" + sarangani_manobo__latn = "mbs_Latn" + matigsalug_manobo = "mbt" + matigsalug_manobo__latn = "mbt_Latn" + maca = "mca" + maca__latn = "mca_Latn" + machiguenga = "mcb" + machiguenga__latn = "mcb_Latn" + sharanahua = "mcd" + 
sharanahua__latn = "mcd_Latn" + matses = "mcf" + matses__latn = "mcf_Latn" + mbunda = "mck" + mbunda__latn = "mck_Latn" + masana = "mcn" + masana__latn = "mcn_Latn" + coatlan_mixe = "mco" + coatlan_mixe__latn = "mco_Latn" + makaa = "mcp" + makaa__latn = "mcp_Latn" + ese = "mcq" + ese__latn = "mcq_Latn" + cameroon_mambila = "mcu" + cameroon_mambila__latn = "mcu_Latn" + mada_nigeria = "mda" + mada_nigeria__latn = "mda_Latn" + moksha = "mdf" + moksha__cyrl = "mdf_Cyrl" + male_ethiopia = "mdy" + male_ethiopia__ethi = "mdy_Ethi" + melpa = "med" + melpa__latn = "med_Latn" + mengen = "mee" + mengen__latn = "mee_Latn" + meyah = "mej" + meyah__latn = "mej_Latn" + mekeo = "mek" + mekeo__latn = "mek_Latn" + mende_sierra_leone = "men" + mende_sierra_leone__latn = "men_Latn" + merey = "meq" + merey__latn = "meq_Latn" + meru = "mer" + meru__latn = "mer_Latn" + mato = "met" + mato__latn = "met_Latn" + motu = "meu" + motu__latn = "meu_Latn" + mano = "mev" + mano__latn = "mev_Latn" + morisyen = "mfe" + morisyen__latn = "mfe_Latn" + mogofin = "mfg" + mogofin__latn = "mfg_Latn" + matal = "mfh" + matal__latn = "mfh_Latn" + wandala = "mfi" + wandala__latn = "mfi_Latn" + north_mofu = "mfk" + north_mofu__latn = "mfk_Latn" + moba = "mfq" + moba__latn = "mfq_Latn" + mayo = "mfy" + mayo__latn = "mfy_Latn" + mabaan = "mfz" + mabaan__latn = "mfz_Latn" + morokodo = "mgc" + morokodo__latn = "mgc_Latn" + makhuwa_meetto = "mgh" + makhuwa_meetto__latn = "mgh_Latn" + mambae = "mgm" + mambae__latn = "mgm_Latn" + meta_ = "mgo" + meta___latn = "mgo_Latn" + mambwe_lungu = "mgr" + mambwe_lungu__latn = "mgr_Latn" + ma_di = "mhi" + ma_di__latn = "mhi_Latn" + mauwake = "mhl" + mauwake__latn = "mhl_Latn" + eastern_mari = "mhr" + eastern_mari__cyrl = "mhr_Cyrl" + mbukushu = "mhw" + mbukushu__latn = "mhw_Latn" + maru = "mhx" + maru__latn = "mhx_Latn" + ma_anyan = "mhy" + ma_anyan__latn = "mhy_Latn" + atatlahuca_mixtec = "mib" + atatlahuca_mixtec__latn = "mib_Latn" + mi_kmaq = "mic" + mi_kmaq__latn = 
"mic_Latn" + ocotepec_mixtec = "mie" + ocotepec_mixtec__latn = "mie_Latn" + mofu_gudur = "mif" + mofu_gudur__latn = "mif_Latn" + san_miguel_el_grande_mixtec = "mig" + san_miguel_el_grande_mixtec__latn = "mig_Latn" + chayuco_mixtec = "mih" + chayuco_mixtec__latn = "mih_Latn" + penoles_mixtec = "mil" + penoles_mixtec__latn = "mil_Latn" + alacatlatzala_mixtec = "mim" + alacatlatzala_mixtec__latn = "mim_Latn" + minangkabau = "min" + minangkabau__latn = "min_Latn" + minangkabau__arab = "min_Arab" + pinotepa_nacional_mixtec = "mio" + pinotepa_nacional_mixtec__latn = "mio_Latn" + apasco_apoala_mixtec = "mip" + apasco_apoala_mixtec__latn = "mip_Latn" + miskito = "miq" + miskito__latn = "miq_Latn" + isthmus_mixe = "mir" + isthmus_mixe__latn = "mir_Latn" + southern_puebla_mixtec = "mit" + southern_puebla_mixtec__latn = "mit_Latn" + ayutla_mixtec = "miy" + ayutla_mixtec__latn = "miy_Latn" + coatzospan_mixtec = "miz" + coatzospan_mixtec__latn = "miz_Latn" + san_juan_colorado_mixtec = "mjc" + san_juan_colorado_mixtec__latn = "mjc_Latn" + karbi = "mjw" + karbi__latn = "mjw_Latn" + macedonian = "mkd" + macedonian__cyrl = "mkd_Cyrl" + mokole = "mkl" + mokole__latn = "mkl_Latn" + kupang_malay = "mkn" + kupang_malay__latn = "mkn_Latn" + silacayoapan_mixtec = "mks" + silacayoapan_mixtec__latn = "mks_Latn" + makasae = "mkz" + makasae__latn = "mkz_Latn" + malagasy = "mlg" + malagasy__latn = "mlg_Latn" + mape = "mlh" + mape__latn = "mlh_Latn" + bargam = "mlp" + bargam__latn = "mlp_Latn" + maltese = "mlt" + maltese__latn = "mlt_Latn" + to_abaita = "mlu" + to_abaita__latn = "mlu_Latn" + mamanwa = "mmn" + mamanwa__latn = "mmn_Latn" + mangga_buang = "mmo" + mangga_buang__latn = "mmo_Latn" + madak = "mmx" + madak__latn = "mmx_Latn" + mbula = "mna" + mbula__latn = "mna_Latn" + muna = "mnb" + muna__latn = "mnb_Latn" + mundani = "mnf" + mundani__latn = "mnf_Latn" + manipuri = "mni" + manipuri__latn = "mni_Latn" + manipuri__beng = "mni_Beng" + manipuri__mtei = "mni_Mtei" + mandinka = "mnk" + 
mandinka__latn = "mnk_Latn" + mansi = "mns" + mansi__cyrl = "mns_Cyrl" + mon = "mnw" + mon__mymr = "mnw_Mymr" + manikion = "mnx" + manikion__latn = "mnx_Latn" + manyawa = "mny" + manyawa__latn = "mny_Latn" + mwan = "moa" + mwan__latn = "moa_Latn" + mocovi = "moc" + mocovi__latn = "moc_Latn" + mongondow = "mog" + mongondow__latn = "mog_Latn" + mohawk = "moh" + mohawk__latn = "moh_Latn" + mongolian = "mon" + mongolian__cyrl = "mon_Cyrl" + mopan_maya = "mop" + mopan_maya__latn = "mop_Latn" + moro = "mor" + moro__latn = "mor_Latn" + mossi = "mos" + mossi__latn = "mos_Latn" + molima = "mox" + molima__latn = "mox_Latn" + marba = "mpg" + marba__latn = "mpg_Latn" + maung = "mph" + maung__latn = "mph_Latn" + yosondua_mixtec = "mpm" + yosondua_mixtec__latn = "mpm_Latn" + migabac = "mpp" + migabac__latn = "mpp_Latn" + dadibi = "mps" + dadibi__latn = "mps_Latn" + mian = "mpt" + mian__latn = "mpt_Latn" + misima_panaeati = "mpx" + misima_panaeati__latn = "mpx_Latn" + mbuko = "mqb" + mbuko__latn = "mqb_Latn" + mamasa = "mqj" + mamasa__latn = "mqj_Latn" + manggarai = "mqy" + manggarai__latn = "mqy_Latn" + mising = "mrg" + mising__latn = "mrg_Latn" + maori = "mri" + maori__latn = "mri_Latn" + western_mari = "mrj" + western_mari__cyrl = "mrj_Cyrl" + north_marquesan = "mrq" + north_marquesan__latn = "mrq_Latn" + mangareva = "mrv" + mangareva__latn = "mrv_Latn" + maranao = "mrw" + maranao__latn = "mrw_Latn" + malay_macrolanguage = "msa" + malay_macrolanguage__latn = "msa_Latn" + malay_macrolanguage__arab = "msa_Arab" + malay_macrolanguage__thai = "msa_Thai" + masbatenyo = "msb" + masbatenyo__latn = "msb_Latn" + sankaran_maninka = "msc" + sankaran_maninka__latn = "msc_Latn" + musey = "mse" + musey__latn = "mse_Latn" + mansaka = "msk" + mansaka__latn = "msk_Latn" + agusan_manobo = "msm" + agusan_manobo__latn = "msm_Latn" + aruamu = "msy" + aruamu__latn = "msy_Latn" + cotabato_manobo = "mta" + cotabato_manobo__latn = "mta_Latn" + una = "mtg" + una__latn = "mtg_Latn" + 
maiwa_papua_new_guinea = "mti" + maiwa_papua_new_guinea__latn = "mti_Latn" + moskona = "mtj" + moskona__latn = "mtj_Latn" + totontepec_mixe = "mto" + totontepec_mixe__latn = "mto_Latn" + wichi_lhamtes_nocten = "mtp" + wichi_lhamtes_nocten__latn = "mtp_Latn" + mundang = "mua" + mundang__latn = "mua_Latn" + musgu = "mug" + musgu__latn = "mug_Latn" + mundu = "muh" + mundu__latn = "muh_Latn" + musi = "mui" + musi__latn = "mui_Latn" + malvi = "mup" + malvi__deva = "mup_Deva" + murle = "mur" + murle__latn = "mur_Latn" + creek = "mus" + creek__latn = "mus_Latn" + bo_ung = "mux" + bo_ung__latn = "mux_Latn" + muyang = "muy" + muyang__latn = "muy_Latn" + manam = "mva" + manam__latn = "mva_Latn" + minaveha = "mvn" + minaveha__latn = "mvn_Latn" + duri = "mvp" + duri__latn = "mvp_Latn" + are = "mwc" + are__latn = "mwc_Latn" + murrinh_patha = "mwf" + murrinh_patha__latn = "mwf_Latn" + mirandese = "mwl" + mirandese__latn = "mwl_Latn" + sar = "mwm" + sar__latn = "mwm_Latn" + nyamwanga = "mwn" + nyamwanga__latn = "mwn_Latn" + kala_lagaw_ya = "mwp" + kala_lagaw_ya__latn = "mwp_Latn" + mun_chin = "mwq" + mun_chin__latn = "mwq_Latn" + mentawai = "mwv" + mentawai__latn = "mwv_Latn" + hmong_daw = "mww" + hmong_daw__latn = "mww_Latn" + tezoatlan_mixtec = "mxb" + tezoatlan_mixtec__latn = "mxb_Latn" + tlahuitoltepec_mixe = "mxp" + tlahuitoltepec_mixe__latn = "mxp_Latn" + juquila_mixe = "mxq" + juquila_mixe__latn = "mxq_Latn" + jamiltepec_mixtec = "mxt" + jamiltepec_mixtec__latn = "mxt_Latn" + metlatonoc_mixtec = "mxv" + metlatonoc_mixtec__latn = "mxv_Latn" + burmese = "mya" + burmese__mymr = "mya_Mymr" + mbay = "myb" + mbay__latn = "myb_Latn" + mamara_senoufo = "myk" + mamara_senoufo__latn = "myk_Latn" + munduruku = "myu" + munduruku__latn = "myu_Latn" + erzya = "myv" + erzya__cyrl = "myv_Cyrl" + muyuw = "myw" + muyuw__latn = "myw_Latn" + masaaba = "myx" + masaaba__latn = "myx_Latn" + macuna = "myy" + macuna__latn = "myy_Latn" + santa_maria_zacatepec_mixtec = "mza" + 
santa_maria_zacatepec_mixtec__latn = "mza_Latn" + wichi_lhamtes_guisnay = "mzh" + wichi_lhamtes_guisnay__latn = "mzh_Latn" + nigeria_mambila = "mzk" + nigeria_mambila__latn = "mzk_Latn" + mazatlan_mixe = "mzl" + mazatlan_mixe__latn = "mzl_Latn" + mumuye = "mzm" + mumuye__latn = "mzm_Latn" + mazanderani = "mzn" + mazanderani__arab = "mzn_Arab" + deg = "mzw" + deg__latn = "mzw_Latn" + maiadomu = "mzz" + maiadomu__latn = "mzz_Latn" + southern_nambikuara = "nab" + southern_nambikuara__latn = "nab_Latn" + nabak = "naf" + nabak__latn = "naf_Latn" + nakanai = "nak" + nakanai__latn = "nak_Latn" + min_nan_chinese = "nan" + min_nan_chinese__hani = "nan_Hani" + min_nan_chinese__latn = "nan_Latn" + neapolitan = "nap" + neapolitan__latn = "nap_Latn" + khoekhoe = "naq" + khoekhoe__latn = "naq_Latn" + naasioi = "nas" + naasioi__latn = "nas_Latn" + navajo = "nav" + navajo__latn = "nav_Latn" + nawuri = "naw" + nawuri__latn = "naw_Latn" + nyemba = "nba" + nyemba__latn = "nba_Latn" + chang_naga = "nbc" + chang_naga__latn = "nbc_Latn" + konyak_naga = "nbe" + konyak_naga__latn = "nbe_Latn" + south_ndebele = "nbl" + south_ndebele__latn = "nbl_Latn" + nggem = "nbq" + nggem__latn = "nbq_Latn" + rongmei_naga = "nbu" + rongmei_naga__latn = "nbu_Latn" + iyo = "nca" + iyo__latn = "nca_Latn" + central_huasteca_nahuatl = "nch" + central_huasteca_nahuatl__latn = "nch_Latn" + northern_puebla_nahuatl = "ncj" + northern_puebla_nahuatl__latn = "ncj_Latn" + michoacan_nahuatl = "ncl" + michoacan_nahuatl__latn = "ncl_Latn" + northern_katang = "ncq" + northern_katang__laoo = "ncq_Laoo" + chothe_naga = "nct" + chothe_naga__latn = "nct_Latn" + chumburung = "ncu" + chumburung__latn = "ncu_Latn" + central_puebla_nahuatl = "ncx" + central_puebla_nahuatl__latn = "ncx_Latn" + ndau = "ndc" + ndau__latn = "ndc_Latn" + north_ndebele = "nde" + north_ndebele__latn = "nde_Latn" + ndali = "ndh" + ndali__latn = "ndh_Latn" + samba_leko = "ndi" + samba_leko__latn = "ndi_Latn" + ndamba = "ndj" + ndamba__latn = "ndj_Latn" 
+ ndonga = "ndo" + ndonga__latn = "ndo_Latn" + ndo = "ndp" + ndo__latn = "ndp_Latn" + low_german = "nds" + low_german__latn = "nds_Latn" + lutos = "ndy" + lutos__latn = "ndy_Latn" + ndogo = "ndz" + ndogo__latn = "ndz_Latn" + toura_cote_d_ivoire = "neb" + toura_cote_d_ivoire__latn = "neb_Latn" + nepali_macrolanguage = "nep" + nepali_macrolanguage__deva = "nep_Deva" + nepali_macrolanguage__latn = "nep_Latn" + newari = "new" + newari__deva = "new_Deva" + dhao = "nfa" + dhao__latn = "nfa_Latn" + nafaanra = "nfr" + nafaanra__latn = "nfr_Latn" + northern_ngbandi = "ngb" + northern_ngbandi__latn = "ngb_Latn" + ngombe_democratic_republic_of_congo = "ngc" + ngombe_democratic_republic_of_congo__latn = "ngc_Latn" + lomwe = "ngl" + lomwe__latn = "ngl_Latn" + ngulu = "ngp" + ngulu__latn = "ngp_Latn" + guerrero_nahuatl = "ngu" + guerrero_nahuatl__latn = "ngu_Latn" + chiripa = "nhd" + chiripa__latn = "nhd_Latn" + eastern_huasteca_nahuatl = "nhe" + eastern_huasteca_nahuatl__latn = "nhe_Latn" + tetelcingo_nahuatl = "nhg" + tetelcingo_nahuatl__latn = "nhg_Latn" + zacatlan_ahuacatlan_tepetzintla_nahuatl = "nhi" + zacatlan_ahuacatlan_tepetzintla_nahuatl__latn = "nhi_Latn" + isthmus_cosoleacaque_nahuatl = "nhk" + isthmus_cosoleacaque_nahuatl__latn = "nhk_Latn" + takuu = "nho" + takuu__latn = "nho_Latn" + naro = "nhr" + naro__latn = "nhr_Latn" + noone = "nhu" + noone__latn = "nhu_Latn" + western_huasteca_nahuatl = "nhw" + western_huasteca_nahuatl__latn = "nhw_Latn" + isthmus_mecayapan_nahuatl = "nhx" + isthmus_mecayapan_nahuatl__latn = "nhx_Latn" + northern_oaxaca_nahuatl = "nhy" + northern_oaxaca_nahuatl__latn = "nhy_Latn" + nias = "nia" + nias__latn = "nia_Latn" + nek = "nif" + nek__latn = "nif_Latn" + nii = "nii" + nii__latn = "nii_Latn" + ngaju = "nij" + ngaju__latn = "nij_Latn" + nilamba = "nim" + nilamba__latn = "nim_Latn" + ninzo = "nin" + ninzo__latn = "nin_Latn" + nganasan = "nio" + nganasan__cyrl = "nio_Cyrl" + nandi = "niq" + nandi__latn = "niq_Latn" + niuean = "niu" + 
niuean__latn = "niu_Latn" + ngiti = "niy" + ngiti__latn = "niy_Latn" + nocte_naga = "njb" + nocte_naga__latn = "njb_Latn" + angami_naga = "njm" + angami_naga__latn = "njm_Latn" + liangmai_naga = "njn" + liangmai_naga__latn = "njn_Latn" + ao_naga = "njo" + ao_naga__latn = "njo_Latn" + nyishi = "njz" + nyishi__latn = "njz_Latn" + inpui_naga = "nkf" + inpui_naga__latn = "nkf_Latn" + thangal_naga = "nki" + thangal_naga__latn = "nki_Latn" + nkonya = "nko" + nkonya__latn = "nko_Latn" + ngombale = "nla" + ngombale__latn = "nla_Latn" + nalca = "nlc" + nalca__latn = "nlc_Latn" + dutch = "nld" + dutch__latn = "nld_Latn" + gela = "nlg" + gela__latn = "nlg_Latn" + maram_naga = "nma" + maram_naga__latn = "nma_Latn" + tangkhul_naga_india = "nmf" + tangkhul_naga_india__latn = "nmf_Latn" + monsang_naga = "nmh" + monsang_naga__latn = "nmh_Latn" + moyon_naga = "nmo" + moyon_naga__latn = "nmo_Latn" + nimoa = "nmw" + nimoa__latn = "nmw_Latn" + nawdm = "nmz" + nawdm__latn = "nmz_Latn" + nande = "nnb" + nande__latn = "nnb_Latn" + maring_naga = "nng" + maring_naga__latn = "nng_Latn" + ngiemboon = "nnh" + ngiemboon__latn = "nnh_Latn" + northern_rengma_naga = "nnl" + northern_rengma_naga__latn = "nnl_Latn" + norwegian_nynorsk = "nno" + norwegian_nynorsk__latn = "nno_Latn" + wancho_naga = "nnp" + wancho_naga__latn = "nnp_Latn" + ngindo = "nnq" + ngindo__latn = "nnq_Latn" + southern_nuni = "nnw" + southern_nuni__latn = "nnw_Latn" + woun_meu = "noa" + woun_meu__latn = "noa_Latn" + norwegian_bokmal = "nob" + norwegian_bokmal__latn = "nob_Latn" + northern_thai = "nod" + northern_thai__thai = "nod_Thai" + nogai = "nog" + nogai__cyrl = "nog_Cyrl" + old_norse = "non" + old_norse__latn = "non_Latn" + numanggang = "nop" + numanggang__latn = "nop_Latn" + norwegian = "nor" + norwegian__latn = "nor_Latn" + nomatsiguenga = "not" + nomatsiguenga__latn = "not_Latn" + ewage_notu = "nou" + ewage_notu__latn = "nou_Latn" + novial = "nov" + novial__latn = "nov_Latn" + phom_naga = "nph" + phom_naga__latn = 
"nph_Latn" + nepali_individual_language = "npi" + nepali_individual_language__deva = "npi_Deva" + nepali_individual_language__latn = "npi_Latn" + southeastern_puebla_nahuatl = "npl" + southeastern_puebla_nahuatl__latn = "npl_Latn" + pochuri_naga = "npo" + pochuri_naga__latn = "npo_Latn" + napu = "npy" + napu__latn = "npy_Latn" + n_ko = "nqo" + n_ko__nkoo = "nqo_Nkoo" + southern_rengma_naga = "nre" + southern_rengma_naga__latn = "nre_Latn" + jerriais = "nrf" + jerriais__latn = "nrf_Latn" + chokri_naga = "nri" + chokri_naga__latn = "nri_Latn" + narom = "nrm" + narom__latn = "nrm_Latn" + sangtam_naga = "nsa" + sangtam_naga__latn = "nsa_Latn" + nsenga = "nse" + nsenga__latn = "nse_Latn" + sumi_naga = "nsm" + sumi_naga__latn = "nsm_Latn" + nehan = "nsn" + nehan__latn = "nsn_Latn" + pedi = "nso" + pedi__latn = "nso_Latn" + nali = "nss" + nali__latn = "nss_Latn" + tase_naga = "nst" + tase_naga__latn = "nst_Latn" + sierra_negra_nahuatl = "nsu" + sierra_negra_nahuatl__latn = "nsu_Latn" + northern_tepehuan = "ntp" + northern_tepehuan__latn = "ntp_Latn" + delo = "ntr" + delo__latn = "ntr_Latn" + natugu = "ntu" + natugu__latn = "ntu_Latn" + nyole = "nuj" + nyole__latn = "nuj_Latn" + nuer = "nus" + nuer__latn = "nus_Latn" + nunggubuyu = "nuy" + nunggubuyu__latn = "nuy_Latn" + tlamacazapa_nahuatl = "nuz" + tlamacazapa_nahuatl__latn = "nuz_Latn" + namiae = "nvm" + namiae__latn = "nvm_Latn" + nyabwa = "nwb" + nyabwa__latn = "nwb_Latn" + southwest_tanna = "nwi" + southwest_tanna__latn = "nwi_Latn" + middle_newar = "nwx" + middle_newar__deva = "nwx_Deva" + ngando_democratic_republic_of_congo = "nxd" + ngando_democratic_republic_of_congo__latn = "nxd_Latn" + nyanja = "nya" + nyanja__latn = "nya_Latn" + giryama = "nyf" + giryama__latn = "nyf_Latn" + nyaneka = "nyk" + nyaneka__latn = "nyk_Latn" + nyankole = "nyn" + nyankole__latn = "nyn_Latn" + nyoro = "nyo" + nyoro__latn = "nyo_Latn" + nyungwe = "nyu" + nyungwe__latn = "nyu_Latn" + nyakyusa_ngonde = "nyy" + nyakyusa_ngonde__latn = 
"nyy_Latn" + tigon_mbembe = "nza" + tigon_mbembe__latn = "nza_Latn" + nzima = "nzi" + nzima__latn = "nzi_Latn" + zeme_naga = "nzm" + zeme_naga__latn = "nzm_Latn" + obo_manobo = "obo" + obo_manobo__latn = "obo_Latn" + occitan_post_1500 = "oci" + occitan_post_1500__latn = "oci_Latn" + khana = "ogo" + khana__latn = "ogo_Latn" + northwestern_ojibwa = "ojb" + northwestern_ojibwa__latn = "ojb_Latn" + northwestern_ojibwa__cans = "ojb_Cans" + ojibwa = "oji" + ojibwa__latn = "oji_Latn" + ojibwa__cans = "oji_Cans" + okpe_southwestern_edo = "oke" + okpe_southwestern_edo__latn = "oke_Latn" + oku = "oku" + oku__latn = "oku_Latn" + orokaiva = "okv" + orokaiva__latn = "okv_Latn" + mochi = "old" + mochi__latn = "old_Latn" + livvi = "olo" + livvi__latn = "olo_Latn" + east_ambae = "omb" + east_ambae__latn = "omb_Latn" + south_tairora = "omw" + south_tairora__latn = "omw_Latn" + olo = "ong" + olo__latn = "ong_Latn" + ono = "ons" + ono__latn = "ons_Latn" + tohono_o_odham = "ood" + tohono_o_odham__latn = "ood_Latn" + oksapmin = "opm" + oksapmin__latn = "opm_Latn" + oriya_macrolanguage = "ori" + oriya_macrolanguage__orya = "ori_Orya" + oriya_macrolanguage__latn = "ori_Latn" + oromo = "orm" + oromo__latn = "orm_Latn" + old_russian = "orv" + old_russian__cyrl = "orv_Cyrl" + odia = "ory" + odia__orya = "ory_Orya" + odia__latn = "ory_Latn" + ossetian = "oss" + ossetian__cyrl = "oss_Cyrl" + ottoman_turkish_1500_1928 = "ota" + ottoman_turkish_1500_1928__arab = "ota_Arab" + ot_danum = "otd" + ot_danum__latn = "otd_Latn" + mezquital_otomi = "ote" + mezquital_otomi__latn = "ote_Latn" + eastern_highland_otomi = "otm" + eastern_highland_otomi__latn = "otm_Latn" + tenango_otomi = "otn" + tenango_otomi__latn = "otn_Latn" + queretaro_otomi = "otq" + queretaro_otomi__latn = "otq_Latn" + estado_de_mexico_otomi = "ots" + estado_de_mexico_otomi__latn = "ots_Latn" + ottawa = "otw" + ottawa__latn = "otw_Latn" + wayampi = "oym" + wayampi__latn = "oym_Latn" + koonzime = "ozm" + koonzime__latn = "ozm_Latn" + 
parecis = "pab" + parecis__latn = "pab_Latn" + paumari = "pad" + paumari__latn = "pad_Latn" + pangasinan = "pag" + pangasinan__latn = "pag_Latn" + tenharim = "pah" + tenharim__latn = "pah_Latn" + pampanga = "pam" + pampanga__latn = "pam_Latn" + panjabi = "pan" + panjabi__guru = "pan_Guru" + panjabi__latn = "pan_Latn" + northern_paiute = "pao" + northern_paiute__latn = "pao_Latn" + papiamento = "pap" + papiamento__latn = "pap_Latn" + palauan = "pau" + palauan__latn = "pau_Latn" + paez = "pbb" + paez__latn = "pbb_Latn" + patamona = "pbc" + patamona__latn = "pbc_Latn" + parkwa = "pbi" + parkwa__latn = "pbi_Latn" + southern_pashto = "pbt" + southern_pashto__arab = "pbt_Arab" + picard = "pcd" + picard__latn = "pcd_Latn" + paite_chin = "pck" + paite_chin__latn = "pck_Latn" + nigerian_pidgin = "pcm" + nigerian_pidgin__latn = "pcm_Latn" + pennsylvania_german = "pdc" + pennsylvania_german__latn = "pdc_Latn" + plautdietsch = "pdt" + plautdietsch__latn = "pdt_Latn" + phende = "pem" + phende__latn = "pem_Latn" + pere = "pfe" + pere__latn = "pfe_Latn" + pfaelzisch = "pfl" + pfaelzisch__latn = "pfl_Latn" + phimbi = "phm" + phimbi__latn = "phm_Latn" + yine = "pib" + yine__latn = "pib_Latn" + piapoco = "pio" + piapoco__latn = "pio_Latn" + piratapuyo = "pir" + piratapuyo__latn = "pir_Latn" + pijin = "pis" + pijin__latn = "pis_Latn" + pitjantjatjara = "pjt" + pitjantjatjara__latn = "pjt_Latn" + pokomo = "pkb" + pokomo__latn = "pkb_Latn" + pilaga = "plg" + pilaga__latn = "plg_Latn" + san_marcos_tlacoyalco_popoloca = "pls" + san_marcos_tlacoyalco_popoloca__latn = "pls_Latn" + plateau_malagasy = "plt" + plateau_malagasy__latn = "plt_Latn" + palikur = "plu" + palikur__latn = "plu_Latn" + brooke_s_point_palawano = "plw" + brooke_s_point_palawano__latn = "plw_Latn" + paama = "pma" + paama__latn = "pma_Latn" + pamona = "pmf" + pamona__latn = "pmf_Latn" + northern_pame = "pmq" + northern_pame__latn = "pmq_Latn" + piemontese = "pms" + piemontese__latn = "pms_Latn" + poumei_naga = "pmx" + 
poumei_naga__latn = "pmx_Latn" + western_panjabi = "pnb" + western_panjabi__arab = "pnb_Arab" + western_penan = "pne" + western_penan__latn = "pne_Latn" + pontic = "pnt" + pontic__grek = "pnt_Grek" + pinyin = "pny" + pinyin__latn = "pny_Latn" + san_juan_atzingo_popoloca = "poe" + san_juan_atzingo_popoloca__latn = "poe_Latn" + poqomchi_ = "poh" + poqomchi___latn = "poh_Latn" + highland_popoluca = "poi" + highland_popoluca__latn = "poi_Latn" + polish = "pol" + polish__latn = "pol_Latn" + pohnpeian = "pon" + pohnpeian__latn = "pon_Latn" + portuguese = "por" + portuguese__latn = "por_Latn" + sayula_popoluca = "pos" + sayula_popoluca__latn = "pos_Latn" + potawatomi = "pot" + potawatomi__latn = "pot_Latn" + upper_guinea_crioulo = "pov" + upper_guinea_crioulo__latn = "pov_Latn" + pogolo = "poy" + pogolo__latn = "poy_Latn" + uma = "ppk" + uma__latn = "ppk_Latn" + folopa = "ppo" + folopa__latn = "ppo_Latn" + san_luis_temalacayuca_popoloca = "pps" + san_luis_temalacayuca_popoloca__latn = "pps_Latn" + paranan = "prf" + paranan__latn = "prf_Latn" + prussian = "prg" + prussian__latn = "prg_Latn" + paici = "pri" + paici__latn = "pri_Latn" + asheninka_perene = "prq" + asheninka_perene__latn = "prq_Latn" + central_malay = "pse" + central_malay__latn = "pse_Latn" + kaulong = "pss" + kaulong__latn = "pss_Latn" + patep = "ptp" + patep__latn = "ptp_Latn" + bambam = "ptu" + bambam__latn = "ptu_Latn" + western_highland_purepecha = "pua" + western_highland_purepecha__latn = "pua_Latn" + puinave = "pui" + puinave__latn = "pui_Latn" + pushto = "pus" + pushto__arab = "pus_Arab" + gapapaiwa = "pwg" + gapapaiwa__latn = "pwg_Latn" + paiwan = "pwn" + paiwan__latn = "pwn_Latn" + pwo_northern_karen = "pww" + pwo_northern_karen__thai = "pww_Thai" + quetzaltepec_mixe = "pxm" + quetzaltepec_mixe__latn = "pxm_Latn" + huallaga_huanuco_quechua = "qub" + huallaga_huanuco_quechua__latn = "qub_Latn" + k_iche_ = "quc" + k_iche___latn = "quc_Latn" + quechua = "que" + quechua__latn = "que_Latn" + 
lambayeque_quechua = "quf" + lambayeque_quechua__latn = "quf_Latn" + chimborazo_highland_quichua = "qug" + chimborazo_highland_quichua__latn = "qug_Latn" + south_bolivian_quechua = "quh" + south_bolivian_quechua__latn = "quh_Latn" + north_bolivian_quechua = "qul" + north_bolivian_quechua__latn = "qul_Latn" + southern_pastaza_quechua = "qup" + southern_pastaza_quechua__latn = "qup_Latn" + santiago_del_estero_quichua = "qus" + santiago_del_estero_quichua__latn = "qus_Latn" + tena_lowland_quichua = "quw" + tena_lowland_quichua__latn = "quw_Latn" + ayacucho_quechua = "quy" + ayacucho_quechua__latn = "quy_Latn" + cusco_quechua = "quz" + cusco_quechua__latn = "quz_Latn" + ambo_pasco_quechua = "qva" + ambo_pasco_quechua__latn = "qva_Latn" + cajamarca_quechua = "qvc" + cajamarca_quechua__latn = "qvc_Latn" + eastern_apurimac_quechua = "qve" + eastern_apurimac_quechua__latn = "qve_Latn" + huamalies_dos_de_mayo_huanuco_quechua = "qvh" + huamalies_dos_de_mayo_huanuco_quechua__latn = "qvh_Latn" + imbabura_highland_quichua = "qvi" + imbabura_highland_quichua__latn = "qvi_Latn" + margos_yarowilca_lauricocha_quechua = "qvm" + margos_yarowilca_lauricocha_quechua__latn = "qvm_Latn" + north_junin_quechua = "qvn" + north_junin_quechua__latn = "qvn_Latn" + napo_lowland_quechua = "qvo" + napo_lowland_quechua__latn = "qvo_Latn" + san_martin_quechua = "qvs" + san_martin_quechua__latn = "qvs_Latn" + huaylla_wanca_quechua = "qvw" + huaylla_wanca_quechua__latn = "qvw_Latn" + northern_pastaza_quichua = "qvz" + northern_pastaza_quichua__latn = "qvz_Latn" + huaylas_ancash_quechua = "qwh" + huaylas_ancash_quechua__latn = "qwh_Latn" + panao_huanuco_quechua = "qxh" + panao_huanuco_quechua__latn = "qxh_Latn" + salasaca_highland_quichua = "qxl" + salasaca_highland_quichua__latn = "qxl_Latn" + northern_conchucos_ancash_quechua = "qxn" + northern_conchucos_ancash_quechua__latn = "qxn_Latn" + southern_conchucos_ancash_quechua = "qxo" + southern_conchucos_ancash_quechua__latn = "qxo_Latn" + 
canar_highland_quichua = "qxr" + canar_highland_quichua__latn = "qxr_Latn" + rade = "rad" + rade__latn = "rad_Latn" + ramoaaina = "rai" + ramoaaina__latn = "rai_Latn" + rajasthani = "raj" + rajasthani__deva = "raj_Deva" + rapanui = "rap" + rapanui__latn = "rap_Latn" + rarotongan = "rar" + rarotongan__latn = "rar_Latn" + sampang = "rav" + sampang__deva = "rav_Deva" + rawang = "raw" + rawang__latn = "raw_Latn" + reunion_creole_french = "rcf" + reunion_creole_french__latn = "rcf_Latn" + rejang = "rej" + rejang__latn = "rej_Latn" + rendille = "rel" + rendille__latn = "rel_Latn" + ringgou = "rgu" + ringgou__latn = "rgu_Latn" + rohingya = "rhg" + rohingya__latn = "rhg_Latn" + riang_india = "ria" + riang_india__latn = "ria_Latn" + nyaturu = "rim" + nyaturu__latn = "rim_Latn" + rajbanshi = "rjs" + rajbanshi__deva = "rjs_Deva" + rikbaktsa = "rkb" + rikbaktsa__latn = "rkb_Latn" + carpathian_romani = "rmc" + carpathian_romani__latn = "rmc_Latn" + angloromani = "rme" + angloromani__latn = "rme_Latn" + baltic_romani = "rml" + baltic_romani__latn = "rml_Latn" + balkan_romani = "rmn" + balkan_romani__latn = "rmn_Latn" + balkan_romani__grek = "rmn_Grek" + balkan_romani__cyrl = "rmn_Cyrl" + sinte_romani = "rmo" + sinte_romani__latn = "rmo_Latn" + calo = "rmq" + calo__latn = "rmq_Latn" + vlax_romani = "rmy" + vlax_romani__latn = "rmy_Latn" + vlax_romani__cyrl = "rmy_Cyrl" + ruund = "rnd" + ruund__latn = "rnd_Latn" + ronga = "rng" + ronga__latn = "rng_Latn" + ranglong = "rnl" + ranglong__latn = "rnl_Latn" + romansh = "roh" + romansh__latn = "roh_Latn" + romany = "rom" + romany__latn = "rom_Latn" + romany__cyrl = "rom_Cyrl" + romany__grek = "rom_Grek" + romanian = "ron" + romanian__latn = "ron_Latn" + romanian__cyrl = "ron_Cyrl" + rotokas = "roo" + rotokas__latn = "roo_Latn" + kriol = "rop" + kriol__latn = "rop_Latn" + dela_oenale = "row" + dela_oenale__latn = "row_Latn" + waima = "rro" + waima__latn = "rro_Latn" + rotuman = "rtm" + rotuman__latn = "rtm_Latn" + gungu = "rub" + 
gungu__latn = "rub_Latn" + rusyn = "rue" + rusyn__cyrl = "rue_Cyrl" + luguru = "ruf" + luguru__latn = "ruf_Latn" + roviana = "rug" + roviana__latn = "rug_Latn" + rundi = "run" + rundi__latn = "run_Latn" + macedo_romanian = "rup" + macedo_romanian__latn = "rup_Latn" + russian = "rus" + russian__cyrl = "rus_Cyrl" + rawa = "rwo" + rawa__latn = "rwo_Latn" + buglere = "sab" + buglere__latn = "sab_Latn" + sango = "sag" + sango__latn = "sag_Latn" + yakut = "sah" + yakut__cyrl = "sah_Cyrl" + sahu = "saj" + sahu__latn = "saj_Latn" + sanskrit__latn = "san_Latn" + sanskrit = "san" + sanskrit__deva = "san_Deva" + sasak = "sas" + sasak__latn = "sas_Latn" + santali = "sat" + santali__olck = "sat_Olck" + santali__latn = "sat_Latn" + saya = "say" + saya__latn = "say_Latn" + ngambay = "sba" + ngambay__latn = "sba_Latn" + southern_samo = "sbd" + southern_samo__latn = "sbd_Latn" + saliba = "sbe" + saliba__latn = "sbe_Latn" + botolan_sambal = "sbl" + botolan_sambal__latn = "sbl_Latn" + subiya = "sbs" + subiya__latn = "sbs_Latn" + soli = "sby" + soli__latn = "sby_Latn" + sadri = "sck" + sadri__deva = "sck_Deva" + sicilian = "scn" + sicilian__latn = "scn_Latn" + scots = "sco" + scots__latn = "sco_Latn" + toraja_sa_dan = "sda" + toraja_sa_dan__latn = "sda_Latn" + sassarese_sardinian = "sdc" + sassarese_sardinian__latn = "sdc_Latn" + southern_kurdish = "sdh" + southern_kurdish__arab = "sdh_Arab" + bukar_sadung_bidayuh = "sdo" + bukar_sadung_bidayuh__latn = "sdo_Latn" + semandang = "sdq" + semandang__latn = "sdq_Latn" + sena = "seh" + sena__latn = "seh_Latn" + selkup = "sel" + selkup__cyrl = "sel_Cyrl" + koyraboro_senni_songhai = "ses" + koyraboro_senni_songhai__latn = "ses_Latn" + secoya = "sey" + secoya__latn = "sey_Latn" + sehwi = "sfw" + sehwi__latn = "sfw_Latn" + mag_antsi_ayta = "sgb" + mag_antsi_ayta__latn = "sgb_Latn" + kipsigis = "sgc" + kipsigis__latn = "sgc_Latn" + shughni = "sgh" + shughni__cyrl = "sgh_Cyrl" + samogitian = "sgs" + samogitian__latn = "sgs_Latn" + 
sebat_bet_gurage = "sgw" + sebat_bet_gurage__ethi = "sgw_Ethi" + sursurunga = "sgz" + sursurunga__latn = "sgz_Latn" + tachelhit = "shi" + tachelhit__latn = "shi_Latn" + shilluk = "shk" + shilluk__latn = "shk_Latn" + shan = "shn" + shan__mymr = "shn_Mymr" + shipibo_conibo = "shp" + shipibo_conibo__latn = "shp_Latn" + shi = "shr" + shi__latn = "shr_Latn" + chadian_arabic = "shu" + chadian_arabic__arab = "shu_Arab" + sidamo = "sid" + sidamo__latn = "sid_Latn" + paasaal = "sig" + paasaal__latn = "sig_Latn" + tumulung_sisaala = "sil" + tumulung_sisaala__latn = "sil_Latn" + mende_papua_new_guinea = "sim" + mende_papua_new_guinea__latn = "sim_Latn" + sinhala = "sin" + sinhala__sinh = "sin_Sinh" + epena = "sja" + epena__latn = "sja_Latn" + xibe = "sjo" + xibe__mong = "sjo_Mong" + ume_sami = "sju" + ume_sami__latn = "sju_Latn" + sakalava_malagasy = "skg" + sakalava_malagasy__latn = "skg_Latn" + saraiki = "skr" + saraiki__arab = "skr_Arab" + sissala = "sld" + sissala__latn = "sld_Latn" + slovak = "slk" + slovak__latn = "slk_Latn" + salt_yui = "sll" + salt_yui__latn = "sll_Latn" + slovenian = "slv" + slovenian__latn = "slv_Latn" + southern_sami = "sma" + southern_sami__latn = "sma_Latn" + northern_sami = "sme" + northern_sami__latn = "sme_Latn" + lule_sami = "smj" + lule_sami__latn = "smj_Latn" + bolinao = "smk" + bolinao__latn = "smk_Latn" + central_sama = "sml" + central_sama__latn = "sml_Latn" + inari_sami = "smn" + inari_sami__latn = "smn_Latn" + samoan = "smo" + samoan__latn = "smo_Latn" + skolt_sami = "sms" + skolt_sami__latn = "sms_Latn" + simte = "smt" + simte__latn = "smt_Latn" + shona = "sna" + shona__latn = "sna_Latn" + sinaugoro = "snc" + sinaugoro__latn = "snc_Latn" + sindhi = "snd" + sindhi__arab = "snd_Arab" + sindhi__latn = "snd_Latn" + sindhi__deva = "snd_Deva" + noon = "snf" + noon__latn = "snf_Latn" + siona = "snn" + siona__latn = "snn_Latn" + siane = "snp" + siane__latn = "snp_Latn" + selee = "snw" + selee__latn = "snw_Latn" + saniyo_hiyewe = "sny" + 
saniyo_hiyewe__latn = "sny_Latn" + songomeno = "soe" + songomeno__latn = "soe_Latn" + somali = "som" + somali__latn = "som_Latn" + songe = "sop" + songe__latn = "sop_Latn" + kanasi = "soq" + kanasi__latn = "soq_Latn" + southern_sotho = "sot" + southern_sotho__latn = "sot_Latn" + miyobe = "soy" + miyobe__latn = "soy_Latn" + spanish = "spa" + spanish__latn = "spa_Latn" + selepet = "spl" + selepet__latn = "spl_Latn" + akukem = "spm" + akukem__latn = "spm_Latn" + supyire_senoufo = "spp" + supyire_senoufo__latn = "spp_Latn" + saposa = "sps" + saposa__latn = "sps_Latn" + sabaot = "spy" + sabaot__latn = "spy_Latn" + albanian = "sqi" + albanian__latn = "sqi_Latn" + sardinian = "srd" + sardinian__latn = "srd_Latn" + siriano = "sri" + siriano__latn = "sri_Latn" + saramaccan = "srm" + saramaccan__latn = "srm_Latn" + sranan_tongo = "srn" + sranan_tongo__latn = "srn_Latn" + serbian = "srp" + serbian__cyrl = "srp_Cyrl" + serbian__latn = "srp_Latn" + siriono = "srq" + siriono__latn = "srq_Latn" + serer = "srr" + serer__latn = "srr_Latn" + siroi = "ssd" + siroi__latn = "ssd_Latn" + seimat = "ssg" + seimat__latn = "ssg_Latn" + swati = "ssw" + swati__latn = "ssw_Latn" + samberigi = "ssx" + samberigi__latn = "ssx_Latn" + owa = "stn" + owa__latn = "stn_Latn" + southeastern_tepehuan = "stp" + southeastern_tepehuan__latn = "stp_Latn" + saterfriesisch = "stq" + saterfriesisch__latn = "stq_Latn" + sulka = "sua" + sulka__latn = "sua_Latn" + western_subanon = "suc" + western_subanon__latn = "suc_Latn" + suena = "sue" + suena__latn = "sue_Latn" + sukuma = "suk" + sukuma__latn = "suk_Latn" + sundanese = "sun" + sundanese__latn = "sun_Latn" + mwaghavul = "sur" + mwaghavul__latn = "sur_Latn" + susu = "sus" + susu__latn = "sus_Latn" + susu__arab = "sus_Arab" + sunwar = "suz" + sunwar__deva = "suz_Deva" + swahili_macrolanguage = "swa" + swahili_macrolanguage__latn = "swa_Latn" + maore_comorian = "swb" + maore_comorian__latn = "swb_Latn" + congo_swahili = "swc" + congo_swahili__latn = "swc_Latn" + 
swedish = "swe" + swedish__latn = "swe_Latn" + swabian = "swg" + swabian__latn = "swg_Latn" + swahili_individual_language = "swh" + swahili_individual_language__latn = "swh_Latn" + malawi_sena = "swk" + malawi_sena__latn = "swk_Latn" + suau = "swp" + suau__latn = "swp_Latn" + suba = "sxb" + suba__latn = "sxb_Latn" + sangir = "sxn" + sangir__latn = "sxn_Latn" + central_subanen = "syb" + central_subanen__latn = "syb_Latn" + classical_syriac = "syc" + classical_syriac__syrc = "syc_Syrc" + sylheti = "syl" + sylheti__latn = "syl_Latn" + sylheti__beng = "syl_Beng" + syriac = "syr" + syriac__syrc = "syr_Syrc" + ngalum = "szb" + ngalum__latn = "szb_Latn" + silesian = "szl" + silesian__latn = "szl_Latn" + sakizaya = "szy" + sakizaya__latn = "szy_Latn" + tabassaran = "tab" + tabassaran__cyrl = "tab_Cyrl" + lowland_tarahumara = "tac" + lowland_tarahumara__latn = "tac_Latn" + tahitian = "tah" + tahitian__latn = "tah_Latn" + eastern_tamang = "taj" + eastern_tamang__deva = "taj_Deva" + tamil = "tam" + tamil__taml = "tam_Taml" + tamil__latn = "tam_Latn" + taabwa = "tap" + taabwa__latn = "tap_Latn" + tamasheq = "taq" + tamasheq__latn = "taq_Latn" + tamasheq__tfng = "taq_Tfng" + central_tarahumara = "tar" + central_tarahumara__latn = "tar_Latn" + tatar = "tat" + tatar__cyrl = "tat_Cyrl" + tatar__latn = "tat_Latn" + tatuyo = "tav" + tatuyo__latn = "tav_Latn" + tai = "taw" + tai__latn = "taw_Latn" + atayal = "tay" + atayal__latn = "tay_Latn" + takia = "tbc" + takia__latn = "tbc_Latn" + north_tairora = "tbg" + north_tairora__latn = "tbg_Latn" + calamian_tagbanwa = "tbk" + calamian_tagbanwa__latn = "tbk_Latn" + tboli = "tbl" + tboli__latn = "tbl_Latn" + tawala = "tbo" + tawala__latn = "tbo_Latn" + tagbanwa = "tbw" + tagbanwa__latn = "tbw_Latn" + tabaru = "tby" + tabaru__latn = "tby_Latn" + ditammari = "tbz" + ditammari__latn = "tbz_Latn" + ticuna = "tca" + ticuna__latn = "tca_Latn" + datooga = "tcc" + datooga__latn = "tcc_Latn" + malinaltepec_me_phaa = "tcf" + 
malinaltepec_me_phaa__latn = "tcf_Latn" + torres_strait_creole = "tcs" + torres_strait_creole__latn = "tcs_Latn" + tulu = "tcy" + tulu__knda = "tcy_Knda" + thado_chin = "tcz" + thado_chin__latn = "tcz_Latn" + tetun_dili = "tdt" + tetun_dili__latn = "tdt_Latn" + tandroy_mahafaly_malagasy = "tdx" + tandroy_mahafaly_malagasy__latn = "tdx_Latn" + tepo_krumen = "ted" + tepo_krumen__latn = "ted_Latn" + huehuetla_tepehua = "tee" + huehuetla_tepehua__latn = "tee_Latn" + telugu = "tel" + telugu__telu = "tel_Telu" + telugu__latn = "tel_Latn" + timne = "tem" + timne__latn = "tem_Latn" + teso = "teo" + teso__latn = "teo_Latn" + tereno = "ter" + tereno__latn = "ter_Latn" + tetum = "tet" + tetum__latn = "tet_Latn" + tewa_usa = "tew" + tewa_usa__latn = "tew_Latn" + teribe = "tfr" + teribe__latn = "tfr_Latn" + tajik = "tgk" + tajik__cyrl = "tgk_Cyrl" + sudest = "tgo" + sudest__latn = "tgo_Latn" + tangoa = "tgp" + tangoa__latn = "tgp_Latn" + thai = "tha" + thai__thai = "tha_Thai" + tharaka = "thk" + tharaka__latn = "thk_Latn" + dangaura_tharu = "thl" + dangaura_tharu__deva = "thl_Deva" + tahaggart_tamahaq = "thv" + tahaggart_tamahaq__latn = "thv_Latn" + tifal = "tif" + tifal__latn = "tif_Latn" + tigre = "tig" + tigre__ethi = "tig_Ethi" + timugon_murut = "tih" + timugon_murut__latn = "tih_Latn" + tikar = "tik" + tikar__latn = "tik_Latn" + timbe = "tim" + timbe__latn = "tim_Latn" + tigrinya = "tir" + tigrinya__ethi = "tir_Ethi" + tiv = "tiv" + tiv__latn = "tiv_Latn" + tiruray = "tiy" + tiruray__latn = "tiy_Latn" + takwane = "tke" + takwane__latn = "tke_Latn" + tokelau = "tkl" + tokelau__latn = "tkl_Latn" + tsakhur = "tkr" + tsakhur__cyrl = "tkr_Cyrl" + upper_necaxa_totonac = "tku" + upper_necaxa_totonac__latn = "tku_Latn" + tobelo = "tlb" + tobelo__latn = "tlb_Latn" + telefol = "tlf" + telefol__latn = "tlf_Latn" + klingon = "tlh" + klingon__latn = "tlh_Latn" + talinga_bwisi = "tlj" + talinga_bwisi__latn = "tlj_Latn" + tetela = "tll" + tetela__latn = "tll_Latn" + talysh = "tly" + 
talysh__latn = "tly_Latn" + tumak = "tmc" + tumak__latn = "tmc_Latn" + haruai = "tmd" + haruai__latn = "tmd_Latn" + tamashek = "tmh" + tamashek__latn = "tmh_Latn" + tamashek__tfng = "tmh_Tfng" + tacana = "tna" + tacana__latn = "tna_Latn" + tanimuca_retuara = "tnc" + tanimuca_retuara__latn = "tnc_Latn" + kwamera = "tnk" + kwamera__latn = "tnk_Latn" + north_tanna = "tnn" + north_tanna__latn = "tnn_Latn" + whitesands = "tnp" + whitesands__latn = "tnp_Latn" + menik = "tnr" + menik__latn = "tnr_Latn" + toba = "tob" + toba__latn = "tob_Latn" + coyutla_totonac = "toc" + coyutla_totonac__latn = "toc_Latn" + toma = "tod" + toma__latn = "tod_Latn" + tonga_nyasa = "tog" + tonga_nyasa__latn = "tog_Latn" + gitonga = "toh" + gitonga__latn = "toh_Latn" + tonga_zambia = "toi" + tonga_zambia__latn = "toi_Latn" + tojolabal = "toj" + tojolabal__latn = "toj_Latn" + toki_pona = "tok" + toki_pona__latn = "tok_Latn" + tonga_tonga_islands = "ton" + tonga_tonga_islands__latn = "ton_Latn" + xicotepec_de_juarez_totonac = "too" + xicotepec_de_juarez_totonac__latn = "too_Latn" + papantla_totonac = "top" + papantla_totonac__latn = "top_Latn" + highland_totonac = "tos" + highland_totonac__latn = "tos_Latn" + taupota = "tpa" + taupota__latn = "tpa_Latn" + tok_pisin = "tpi" + tok_pisin__latn = "tpi_Latn" + tampulma = "tpm" + tampulma__latn = "tpm_Latn" + tupinamba = "tpn" + tupinamba__latn = "tpn_Latn" + pisaflores_tepehua = "tpp" + pisaflores_tepehua__latn = "tpp_Latn" + tlachichilco_tepehua = "tpt" + tlachichilco_tepehua__latn = "tpt_Latn" + tinputz = "tpz" + tinputz__latn = "tpz_Latn" + toaripi = "tqo" + toaripi__latn = "tqo_Latn" + copala_triqui = "trc" + copala_triqui__latn = "trc_Latn" + trinitario = "trn" + trinitario__latn = "trn_Latn" + tarao_naga = "tro" + tarao_naga__latn = "tro_Latn" + kok_borok = "trp" + kok_borok__latn = "trp_Latn" + san_martin_itunyoso_triqui = "trq" + san_martin_itunyoso_triqui__latn = "trq_Latn" + chicahuaxtla_triqui = "trs" + chicahuaxtla_triqui__latn = 
"trs_Latn" + sediq = "trv" + sediq__latn = "trv_Latn" + tswa = "tsc" + tswa__latn = "tsc_Latn" + tausug = "tsg" + tausug__latn = "tsg_Latn" + tswana = "tsn" + tswana__latn = "tsn_Latn" + tsonga = "tso" + tsonga__latn = "tso_Latn" + tsishingini = "tsw" + tsishingini__latn = "tsw_Latn" + purepecha = "tsz" + purepecha__latn = "tsz_Latn" + tektiteko = "ttc" + tektiteko__latn = "ttc_Latn" + bwanabwana = "tte" + bwanabwana__latn = "tte_Latn" + tooro = "ttj" + tooro__latn = "ttj_Latn" + tawallammat_tamajaq = "ttq" + tawallammat_tamajaq__latn = "ttq_Latn" + tawallammat_tamajaq__tfng = "ttq_Tfng" + mutu = "tuc" + mutu__latn = "tuc_Latn" + tuyuca = "tue" + tuyuca__latn = "tue_Latn" + central_tunebo = "tuf" + central_tunebo__latn = "tuf_Latn" + tupuri = "tui" + tupuri__latn = "tui_Latn" + turkmen = "tuk" + turkmen__latn = "tuk_Latn" + turkmen__cyrl = "tuk_Cyrl" + turkmen__arab = "tuk_Arab" + tula = "tul" + tula__latn = "tul_Latn" + tumbuka = "tum" + tumbuka__latn = "tum_Latn" + tucano = "tuo" + tucano__latn = "tuo_Latn" + turkish = "tur" + turkish__latn = "tur_Latn" + turkana = "tuv" + turkana__latn = "tuv_Latn" + southeast_ambrym = "tvk" + southeast_ambrym__latn = "tvk_Latn" + tuvalu = "tvl" + tuvalu__latn = "tvl_Latn" + western_tawbuid = "twb" + western_tawbuid__latn = "twb_Latn" + twi = "twi" + twi__latn = "twi_Latn" + termanu = "twu" + termanu__latn = "twu_Latn" + tewe = "twx" + tewe__latn = "twx_Latn" + tii = "txq" + tii__latn = "txq_Latn" + kayapo = "txu" + kayapo__latn = "txu_Latn" + tuvinian = "tyv" + tuvinian__cyrl = "tyv_Cyrl" + tzeltal = "tzh" + tzeltal__latn = "tzh_Latn" + tz_utujil = "tzj" + tz_utujil__latn = "tzj_Latn" + talossan = "tzl" + talossan__latn = "tzl_Latn" + central_atlas_tamazight = "tzm" + central_atlas_tamazight__tfng = "tzm_Tfng" + tzotzil = "tzo" + tzotzil__latn = "tzo_Latn" + ubir = "ubr" + ubir__latn = "ubr_Latn" + umbu_ungu = "ubu" + umbu_ungu__latn = "ubu_Latn" + udmurt = "udm" + udmurt__cyrl = "udm_Cyrl" + uduk = "udu" + uduk__latn = 
"udu_Latn" + uighur = "uig" + uighur__arab = "uig_Arab" + uighur__latn = "uig_Latn" + uighur__cyrl = "uig_Cyrl" + ukrainian = "ukr" + ukrainian__cyrl = "ukr_Cyrl" + umbundu = "umb" + umbundu__latn = "umb_Latn" + undetermined = "und" + undetermined__mult = "und_Mult" + undetermined__bamu = "und_Bamu" + undetermined__kana = "und_Kana" + undetermined__tang = "und_Tang" + undetermined__cyrl = "und_Cyrl" + undetermined__xsux = "und_Xsux" + undetermined__kits = "und_Kits" + undetermined__yiii = "und_Yiii" + undetermined__grek = "und_Grek" + undetermined__hira = "und_Hira" + undetermined__syrc = "und_Syrc" + undetermined__lina = "und_Lina" + undetermined__samr = "und_Samr" + undetermined__copt = "und_Copt" + undetermined__shrd = "und_Shrd" + undetermined__cans = "und_Cans" + undetermined__egyp = "und_Egyp" + undetermined__hluw = "und_Hluw" + undetermined__mroo = "und_Mroo" + undetermined__laoo = "und_Laoo" + undetermined__linb = "und_Linb" + undetermined__brai = "und_Brai" + undetermined__runr = "und_Runr" + undetermined__hung = "und_Hung" + undetermined__modi = "und_Modi" + undetermined__nkoo = "und_Nkoo" + undetermined__mend = "und_Mend" + undetermined__sgnw = "und_Sgnw" + undetermined__lana = "und_Lana" + undetermined__bali = "und_Bali" + undetermined__bopo = "und_Bopo" + undetermined__ethi = "und_Ethi" + undetermined__mong = "und_Mong" + undetermined__nshu = "und_Nshu" + undetermined__adlm = "und_Adlm" + undetermined__mtei = "und_Mtei" + undetermined__dupl = "und_Dupl" + undetermined__cpmn = "und_Cpmn" + undetermined__vaii = "und_Vaii" + undetermined__orkh = "und_Orkh" + undetermined__glag = "und_Glag" + undetermined__geor = "und_Geor" + undetermined__thai = "und_Thai" + undetermined__hebr = "und_Hebr" + undetermined__dsrt = "und_Dsrt" + undetermined__tibt = "und_Tibt" + undetermined__telu = "und_Telu" + undetermined__lepc = "und_Lepc" + undetermined__sinh = "und_Sinh" + undetermined__armn = "und_Armn" + undetermined__saur = "und_Saur" + undetermined__khmr = 
"und_Khmr" + undetermined__deva = "und_Deva" + undetermined__hmnp = "und_Hmnp" + undetermined__thaa = "und_Thaa" + undetermined__orya = "und_Orya" + undetermined__lisu = "und_Lisu" + undetermined__cher = "und_Cher" + undetermined__mymr = "und_Mymr" + undetermined__merc = "und_Merc" + undetermined__bhks = "und_Bhks" + undetermined__plrd = "und_Plrd" + undetermined__brah = "und_Brah" + undetermined__gran = "und_Gran" + undetermined__hmng = "und_Hmng" + undetermined__khar = "und_Khar" + undetermined__cprt = "und_Cprt" + undetermined__mlym = "und_Mlym" + undetermined__taml = "und_Taml" + undetermined__tnsa = "und_Tnsa" + undetermined__aghb = "und_Aghb" + undetermined__mani = "und_Mani" + undetermined__gonm = "und_Gonm" + undetermined__vith = "und_Vith" + undetermined__ahom = "und_Ahom" + undetermined__newa = "und_Newa" + undetermined__kali = "und_Kali" + undetermined__beng = "und_Beng" + undetermined__limb = "und_Limb" + undetermined__phnx = "und_Phnx" + undetermined__medf = "und_Medf" + undetermined__phag = "und_Phag" + undetermined__cari = "und_Cari" + undetermined__gujr = "und_Gujr" + undetermined__java = "und_Java" + undetermined__osge = "und_Osge" + undetermined__diak = "und_Diak" + undetermined__talu = "und_Talu" + undetermined__nagm = "und_Nagm" + undetermined__sidd = "und_Sidd" + undetermined__marc = "und_Marc" + undetermined__wcho = "und_Wcho" + undetermined__avst = "und_Avst" + undetermined__sora = "und_Sora" + undetermined__armi = "und_Armi" + undetermined__wara = "und_Wara" + undetermined__rohg = "und_Rohg" + undetermined__sund = "und_Sund" + undetermined__xpeo = "und_Xpeo" + undetermined__tirh = "und_Tirh" + undetermined__khoj = "und_Khoj" + undetermined__knda = "und_Knda" + undetermined__palm = "und_Palm" + undetermined__kthi = "und_Kthi" + undetermined__ital = "und_Ital" + undetermined__ougr = "und_Ougr" + undetermined__takr = "und_Takr" + undetermined__soyo = "und_Soyo" + undetermined__lyci = "und_Lyci" + undetermined__zanb = "und_Zanb" + 
undetermined__dogr = "und_Dogr" + undetermined__tavt = "und_Tavt" + undetermined__gong = "und_Gong" + undetermined__cham = "und_Cham" + undetermined__elba = "und_Elba" + undetermined__bass = "und_Bass" + undetermined__yezi = "und_Yezi" + undetermined__toto = "und_Toto" + undetermined__ogam = "und_Ogam" + undetermined__pauc = "und_Pauc" + undetermined__tfng = "und_Tfng" + undetermined__cakm = "und_Cakm" + undetermined__kawi = "und_Kawi" + undetermined__guru = "und_Guru" + undetermined__tale = "und_Tale" + undetermined__osma = "und_Osma" + undetermined__sylo = "und_Sylo" + undetermined__sind = "und_Sind" + undetermined__nand = "und_Nand" + undetermined__sogo = "und_Sogo" + undetermined__nbat = "und_Nbat" + undetermined__batk = "und_Batk" + undetermined__phli = "und_Phli" + undetermined__mahj = "und_Mahj" + undetermined__sogd = "und_Sogd" + undetermined__shaw = "und_Shaw" + undetermined__rjng = "und_Rjng" + undetermined__tglg = "und_Tglg" + undetermined__lydi = "und_Lydi" + undetermined__mand = "und_Mand" + undetermined__goth = "und_Goth" + undetermined__hatr = "und_Hatr" + undetermined__mero = "und_Mero" + undetermined__phlp = "und_Phlp" + undetermined__chrs = "und_Chrs" + undetermined__sarb = "und_Sarb" + undetermined__prti = "und_Prti" + undetermined__olck = "und_Olck" + undetermined__bugi = "und_Bugi" + undetermined__narb = "und_Narb" + undetermined__tagb = "und_Tagb" + undetermined__hano = "und_Hano" + undetermined__ugar = "und_Ugar" + undetermined__maka = "und_Maka" + undetermined__perm = "und_Perm" + undetermined__buhd = "und_Buhd" + undetermined__elym = "und_Elym" + uripiv_wala_rano_atchin = "upv" + uripiv_wala_rano_atchin__latn = "upv_Latn" + urarina = "ura" + urarina__latn = "ura_Latn" + urubu_kaapor = "urb" + urubu_kaapor__latn = "urb_Latn" + urdu = "urd" + urdu__arab = "urd_Arab" + urdu__latn = "urd_Latn" + urhobo = "urh" + urhobo__latn = "urh_Latn" + urim = "uri" + urim__latn = "uri_Latn" + urak_lawoi_ = "urk" + urak_lawoi___thai = "urk_Thai" + urat = 
"urt" + urat__latn = "urt_Latn" + sop = "urw" + sop__latn = "urw_Latn" + orya = "ury" + orya__latn = "ury_Latn" + usarufa = "usa" + usarufa__latn = "usa_Latn" + uspanteco = "usp" + uspanteco__latn = "usp_Latn" + ut_hun = "uth" + ut_hun__latn = "uth_Latn" + uri = "uvh" + uri__latn = "uvh_Latn" + lote = "uvl" + lote__latn = "uvl_Latn" + uzbek = "uzb" + uzbek__latn = "uzb_Latn" + uzbek__cyrl = "uzb_Cyrl" + uzbek__arab = "uzb_Arab" + northern_uzbek = "uzn" + northern_uzbek__latn = "uzn_Latn" + northern_uzbek__cyrl = "uzn_Cyrl" + southern_uzbek = "uzs" + southern_uzbek__arab = "uzs_Arab" + vagla = "vag" + vagla__latn = "vag_Latn" + vaiphei = "vap" + vaiphei__latn = "vap_Latn" + huarijio = "var" + huarijio__latn = "var_Latn" + venetian = "vec" + venetian__latn = "vec_Latn" + venda = "ven" + venda__latn = "ven_Latn" + veps = "vep" + veps__latn = "vep_Latn" + vidunda = "vid" + vidunda__latn = "vid_Latn" + vietnamese = "vie" + vietnamese__latn = "vie_Latn" + iduna = "viv" + iduna__latn = "viv_Latn" + vlaams = "vls" + vlaams__latn = "vls_Latn" + makhuwa_shirima = "vmk" + makhuwa_shirima__latn = "vmk_Latn" + makhuwa = "vmw" + makhuwa__latn = "vmw_Latn" + ayautla_mazatec = "vmy" + ayautla_mazatec__latn = "vmy_Latn" + volapuk = "vol" + volapuk__latn = "vol_Latn" + votic = "vot" + votic__latn = "vot_Latn" + voro = "vro" + voro__latn = "vro_Latn" + vunjo = "vun" + vunjo__latn = "vun_Latn" + vute = "vut" + vute__latn = "vut_Latn" + waffa = "waj" + waffa__latn = "waj_Latn" + wolaytta = "wal" + wolaytta__latn = "wal_Latn" + wolaytta__ethi = "wal_Ethi" + wapishana = "wap" + wapishana__latn = "wap_Latn" + waray_philippines = "war" + waray_philippines__latn = "war_Latn" + kaninuwa = "wat" + kaninuwa__latn = "wat_Latn" + wayana = "way" + wayana__latn = "way_Latn" + warao = "wba" + warao__latn = "wba_Latn" + wa = "wbm" + wa__latn = "wbm_Latn" + warlpiri = "wbp" + warlpiri__latn = "wbp_Latn" + wedau = "wed" + wedau__latn = "wed_Latn" + weri = "wer" + weri__latn = "wer_Latn" + 
cameroon_pidgin = "wes" + cameroon_pidgin__latn = "wes_Latn" + wejewa = "wew" + wejewa__latn = "wew_Latn" + north_wahgi = "whg" + north_wahgi__latn = "whg_Latn" + wahau_kenyah = "whk" + wahau_kenyah__latn = "whk_Latn" + southern_toussian = "wib" + southern_toussian__latn = "wib_Latn" + wik_mungkan = "wim" + wik_mungkan__latn = "wim_Latn" + wiru = "wiu" + wiru__latn = "wiu_Latn" + walloon = "wln" + walloon__latn = "wln_Latn" + wallisian = "wls" + wallisian__latn = "wls_Latn" + wichi_lhamtes_vejoz = "wlv" + wichi_lhamtes_vejoz__latn = "wlv_Latn" + wali_ghana = "wlx" + wali_ghana__latn = "wlx_Latn" + walmajarri = "wmt" + walmajarri__latn = "wmt_Latn" + mwani = "wmw" + mwani__latn = "wmw_Latn" + wantoat = "wnc" + wantoat__latn = "wnc_Latn" + usan = "wnu" + usan__latn = "wnu_Latn" + we_northern = "wob" + we_northern__latn = "wob_Latn" + wolof = "wol" + wolof__latn = "wol_Latn" + hanga_hundi = "wos" + hanga_hundi__latn = "wos_Latn" + garrwa = "wrk" + garrwa__latn = "wrk_Latn" + waris = "wrs" + waris__latn = "wrs_Latn" + adilabad_gondi = "wsg" + adilabad_gondi__telu = "wsg_Telu" + waskia = "wsk" + waskia__latn = "wsk_Latn" + wu_chinese = "wuu" + wu_chinese__hani = "wuu_Hani" + wuvulu_aua = "wuv" + wuvulu_aua__latn = "wuv_Latn" + waama = "wwa" + waama__latn = "wwa_Latn" + kalmyk = "xal" + kalmyk__cyrl = "xal_Cyrl" + xavante = "xav" + xavante__latn = "xav_Latn" + kombio = "xbi" + kombio__latn = "xbi_Latn" + kambera = "xbr" + kambera__latn = "xbr_Latn" + hdi = "xed" + hdi__latn = "xed_Latn" + xhosa = "xho" + xhosa__latn = "xho_Latn" + kamula = "xla" + kamula__latn = "xla_Latn" + mingrelian = "xmf" + mingrelian__geor = "xmf_Geor" + manado_malay = "xmm" + manado_malay__latn = "xmm_Latn" + antankarana_malagasy = "xmv" + antankarana_malagasy__latn = "xmv_Latn" + northern_kankanay = "xnn" + northern_kankanay__latn = "xnn_Latn" + soga = "xog" + soga__latn = "xog_Latn" + konkomba = "xon" + konkomba__latn = "xon_Latn" + eastern_karaboro = "xrb" + eastern_karaboro__latn = "xrb_Latn" 
+ sambal = "xsb" + sambal__latn = "xsb_Latn" + sio = "xsi" + sio__latn = "xsi_Latn" + kasem = "xsm" + kasem__latn = "xsm_Latn" + sherpa = "xsr" + sherpa__deva = "xsr_Deva" + sanuma = "xsu" + sanuma__latn = "xsu_Latn" + diuxi_tilantongo_mixtec = "xtd" + diuxi_tilantongo_mixtec__latn = "xtd_Latn" + magdalena_penasco_mixtec = "xtm" + magdalena_penasco_mixtec__latn = "xtm_Latn" + northern_tlaxiaco_mixtec = "xtn" + northern_tlaxiaco_mixtec__latn = "xtn_Latn" + umbrian = "xum" + umbrian__latn = "xum_Latn" + kuo = "xuo" + kuo__latn = "xuo_Latn" + yaminahua = "yaa" + yaminahua__latn = "yaa_Latn" + yagua = "yad" + yagua__latn = "yad_Latn" + yalunka = "yal" + yalunka__latn = "yal_Latn" + yamba = "yam" + yamba__latn = "yam_Latn" + mayangna = "yan" + mayangna__latn = "yan_Latn" + yao = "yao" + yao__latn = "yao_Latn" + yapese = "yap" + yapese__latn = "yap_Latn" + yaqui = "yaq" + yaqui__latn = "yaq_Latn" + nugunu_cameroon = "yas" + nugunu_cameroon__latn = "yas_Latn" + yambeta = "yat" + yambeta__latn = "yat_Latn" + lokaa = "yaz" + lokaa__latn = "yaz_Latn" + yemba = "ybb" + yemba__latn = "ybb_Latn" + yaweyuha = "yby" + yaweyuha__latn = "yby_Latn" + yucuna = "ycn" + yucuna__latn = "ycn_Latn" + eastern_yiddish = "ydd" + eastern_yiddish__hebr = "ydd_Hebr" + yiddish = "yid" + yiddish__hebr = "yid_Hebr" + yimchungru_naga = "yim" + yimchungru_naga__latn = "yim_Latn" + yakan = "yka" + yakan__latn = "yka_Latn" + yele = "yle" + yele__latn = "yle_Latn" + angguruk_yali = "yli" + angguruk_yali__latn = "yli_Latn" + iamalele = "yml" + iamalele__latn = "yml_Latn" + yombe = "yom" + yombe__latn = "yom_Latn" + yongkom = "yon" + yongkom__latn = "yon_Latn" + yoruba = "yor" + yoruba__latn = "yor_Latn" + yareba = "yrb" + yareba__latn = "yrb_Latn" + yaoure = "yre" + yaoure__latn = "yre_Latn" + nenets = "yrk" + nenets__cyrl = "yrk_Cyrl" + nhengatu = "yrl" + nhengatu__latn = "yrl_Latn" + yessan_mayo = "yss" + yessan_mayo__latn = "yss_Latn" + yucateco = "yua" + yucateco__latn = "yua_Latn" + yue_chinese = 
"yue" + yue_chinese__hani = "yue_Hani" + karkar_yuri = "yuj" + karkar_yuri__latn = "yuj_Latn" + yukpa = "yup" + yukpa__latn = "yup_Latn" + yopno = "yut" + yopno__latn = "yut_Latn" + yau_morobe_province = "yuw" + yau_morobe_province__latn = "yuw_Latn" + yuracare = "yuz" + yuracare__latn = "yuz_Latn" + yawa = "yva" + yawa__latn = "yva_Latn" + sierra_de_juarez_zapotec = "zaa" + sierra_de_juarez_zapotec__latn = "zaa_Latn" + western_tlacolula_valley_zapotec = "zab" + western_tlacolula_valley_zapotec__latn = "zab_Latn" + ocotlan_zapotec = "zac" + ocotlan_zapotec__latn = "zac_Latn" + cajonos_zapotec = "zad" + cajonos_zapotec__latn = "zad_Latn" + yareni_zapotec = "zae" + yareni_zapotec__latn = "zae_Latn" + isthmus_zapotec = "zai" + isthmus_zapotec__latn = "zai_Latn" + miahuatlan_zapotec = "zam" + miahuatlan_zapotec__latn = "zam_Latn" + ozolotepec_zapotec = "zao" + ozolotepec_zapotec__latn = "zao_Latn" + zapotec = "zap" + zapotec__latn = "zap_Latn" + rincon_zapotec = "zar" + rincon_zapotec__latn = "zar_Latn" + santo_domingo_albarradas_zapotec = "zas" + santo_domingo_albarradas_zapotec__latn = "zas_Latn" + tabaa_zapotec = "zat" + tabaa_zapotec__latn = "zat_Latn" + yatzachi_zapotec = "zav" + yatzachi_zapotec__latn = "zav_Latn" + mitla_zapotec = "zaw" + mitla_zapotec__latn = "zaw_Latn" + coatecas_altas_zapotec = "zca" + coatecas_altas_zapotec__latn = "zca_Latn" + ngazidja_comorian = "zdj" + ngazidja_comorian__latn = "zdj_Latn" + zeeuws = "zea" + zeeuws__latn = "zea_Latn" + standard_moroccan_tamazight = "zgh" + standard_moroccan_tamazight__tfng = "zgh_Tfng" + zhuang = "zha" + zhuang__latn = "zha_Latn" + chinese = "zho" + chinese__hani = "zho_Hani" + chinese__latn = "zho_Latn" + zia = "zia" + zia__latn = "zia_Latn" + zigula = "ziw" + zigula__latn = "ziw_Latn" + zande_individual_language = "zne" + zande_individual_language__latn = "zne_Latn" + copainala_zoque = "zoc" + copainala_zoque__latn = "zoc_Latn" + zou = "zom" + zou__latn = "zom_Latn" + francisco_leon_zoque = "zos" + 
francisco_leon_zoque__latn = "zos_Latn" + lachiguiri_zapotec = "zpa" + lachiguiri_zapotec__latn = "zpa_Latn" + choapan_zapotec = "zpc" + choapan_zapotec__latn = "zpc_Latn" + guevea_de_humboldt_zapotec = "zpg" + guevea_de_humboldt_zapotec__latn = "zpg_Latn" + santa_maria_quiegolani_zapotec = "zpi" + santa_maria_quiegolani_zapotec__latn = "zpi_Latn" + quiavicuzas_zapotec = "zpj" + quiavicuzas_zapotec__latn = "zpj_Latn" + lachixio_zapotec = "zpl" + lachixio_zapotec__latn = "zpl_Latn" + mixtepec_zapotec = "zpm" + mixtepec_zapotec__latn = "zpm_Latn" + amatlan_zapotec = "zpo" + amatlan_zapotec__latn = "zpo_Latn" + zoogocho_zapotec = "zpq" + zoogocho_zapotec__latn = "zpq_Latn" + san_vicente_coatlan_zapotec = "zpt" + san_vicente_coatlan_zapotec__latn = "zpt_Latn" + yalalag_zapotec = "zpu" + yalalag_zapotec__latn = "zpu_Latn" + chichicapan_zapotec = "zpv" + chichicapan_zapotec__latn = "zpv_Latn" + texmelucan_zapotec = "zpz" + texmelucan_zapotec__latn = "zpz_Latn" + standard_malay = "zsm" + standard_malay__latn = "zsm_Latn" + standard_malay__arab = "zsm_Arab" + southern_rincon_zapotec = "zsr" + southern_rincon_zapotec__latn = "zsr_Latn" + quioquitani_quieri_zapotec = "ztq" + quioquitani_quieri_zapotec__latn = "ztq_Latn" + yatee_zapotec = "zty" + yatee_zapotec__latn = "zty_Latn" + zulu = "zul" + zulu__latn = "zul_Latn" + no_linguistic_content = "zxx" + no_linguistic_content__latn = "zxx_Latn" + no_linguistic_content__zzzz = "zxx_Zzzz" + no_linguistic_content__arab = "zxx_Arab" + yongbei_zhuang = "zyb" + yongbei_zhuang__latn = "zyb_Latn" + zyphe_chin = "zyp" + zyphe_chin__latn = "zyp_Latn" + zaza = "zza" + zaza__latn = "zza_Latn" class StatHints: diff --git a/src/datatrove/utils/word_tokenizers.py b/src/datatrove/utils/word_tokenizers.py index 8eb205aa..1d8356fa 100644 --- a/src/datatrove/utils/word_tokenizers.py +++ b/src/datatrove/utils/word_tokenizers.py @@ -1,8 +1,15 @@ +import csv +import os +import re from abc import ABC, abstractmethod +from functools import lru_cache, 
partial from typing import Callable, Iterator -from datatrove.utils._import_utils import check_required_dependencies -from datatrove.utils.typeshelper import Languages +import regex +from loguru import logger + +from datatrove.utils._import_utils import ASSETS_PATH, check_required_dependencies +from datatrove.utils.text import TERMINAL_PUNCTUATION def strip_strings(els: list[str]) -> list[str]: @@ -10,6 +17,9 @@ def strip_strings(els: list[str]) -> list[str]: def simple_span_tokenize(text: str, sents: list[str]) -> Iterator[tuple[int, int]]: + if len(sents) == 1: + yield 0, len(text) + return start_index = 0 for sent in sents: start_char = text.index(sent, start_index) @@ -18,7 +28,38 @@ def simple_span_tokenize(text: str, sents: list[str]) -> Iterator[tuple[int, int yield start_char, end_char +# https://github.com/explosion/spaCy/issues/13207 +def chunk_text_on_bytes(text: str, max_chunk_size: int = 1_000_000): + def __utf8len(s: str): + return len(s.encode("utf-8")) + + factor = len(text) / __utf8len(text) if __utf8len(text) > 0 else 1 + increase_by = int(max(min(max_chunk_size * 0.1, 10), 1)) + initial_size_guess = int(max(max_chunk_size * factor - 10, 1)) + final_list = [] + remaining = text + while len(remaining): + part = remaining[:initial_size_guess] + if __utf8len(part) > max_chunk_size: + initial_size_guess = max(initial_size_guess - min(max_chunk_size * 0.001, 10), 1) + continue + cut_after = initial_size_guess + while __utf8len(part) < max_chunk_size and part != remaining: + cut_after = min(len(remaining), cut_after + increase_by) + part = remaining[:cut_after] + + if __utf8len(part) > max_chunk_size: + cut_after -= increase_by + final_list.append(remaining[:cut_after]) + remaining = remaining[cut_after:] + + return final_list + + class WordTokenizer(ABC): + def __init__(self, language: str | None = None): + self.language = language + @abstractmethod def word_tokenize(self, text: str) -> list[str]: pass @@ -33,10 +74,9 @@ def span_tokenize(self, text: 
str) -> list[tuple[int, int]]: class NLTKTokenizer(WordTokenizer): - def __init__(self, punkt_language: str): - super().__init__() - check_required_dependencies(f"{punkt_language} word tokenizer", ["nltk"]) - self.punkt_language = punkt_language + def __init__(self, language: str): + super().__init__(language) + check_required_dependencies(f"{language} word tokenizer", ["nltk"]) self._tokenizer = None @property @@ -44,19 +84,19 @@ def tokenizer(self): if self._tokenizer is None: from nltk import load - self._tokenizer = load(f"tokenizers/punkt/{self.punkt_language}.pickle") + self._tokenizer = load(f"tokenizers/punkt/{self.language}.pickle") return self._tokenizer def word_tokenize(self, text) -> list[str]: from nltk.tokenize import word_tokenize - tokens = word_tokenize(text, language=self.punkt_language) + tokens = word_tokenize(text, language=self.language) return strip_strings(tokens) def sent_tokenize(self, text: str) -> list[str]: from nltk.tokenize import sent_tokenize - sents = sent_tokenize(text, language=self.punkt_language) + sents = sent_tokenize(text, language=self.language) return strip_strings(sents) def span_tokenize(self, text: str) -> list[tuple[int, int]]: @@ -64,14 +104,14 @@ def span_tokenize(self, text: str) -> list[tuple[int, int]]: class SpaCyTokenizer(WordTokenizer): - def __init__(self, spacy_language: str, config=None): - super().__init__() - check_required_dependencies(f"{spacy_language} word tokenizer", ["spacy"]) - if spacy_language == "vi": - check_required_dependencies(f"{spacy_language} word tokenizer", ["pyvi"]) - elif spacy_language == "zh": - check_required_dependencies(f"{spacy_language} word tokenizer", ["jieba"]) - self.spacy_language = spacy_language + def __init__(self, language: str, config=None): + super().__init__(language) + check_required_dependencies(f"{language} word tokenizer", ["spacy"]) + if language == "vi": + check_required_dependencies(f"{language} word tokenizer", ["pyvi"]) + elif language == "zh": + config = 
{"nlp": {"tokenizer": {"segmenter": "jieba"}}} + check_required_dependencies(f"{language} word tokenizer", ["jieba"]) self.config = config self._tokenizer = None @@ -80,35 +120,50 @@ def tokenizer(self): if self._tokenizer is None: import spacy + # Important to hot-fix the memory leak in Japanese Tokenizer + from datatrove.utils.japanese_tokenizer import JapaneseTokenizer # noqa: F401 + if self.config is None: - self._tokenizer = spacy.blank(self.spacy_language) + self._tokenizer = spacy.blank(self.language) else: - self._tokenizer = spacy.blank(self.spacy_language, config=self.config) + self._tokenizer = spacy.blank(self.language, config=self.config) self._tokenizer.add_pipe("sentencizer") return self._tokenizer + def _do_tokenize(self, text: str): + # japanese has a max byte length + texts = [text] if self.language != "ja" else chunk_text_on_bytes(text, 40000) + self.tokenizer.max_length = len(text) + return [self.tokenizer(t, disable=["parser", "tagger", "ner"]) for t in texts] + def word_tokenize(self, text: str) -> list[str]: - self.tokenizer.max_length = len(text) + 10 - tokens = [token.text for token in self.tokenizer(text, disable=["parser", "tagger", "ner"])] - return strip_strings(tokens) + # Make sure to do all the token processing inside the memory zone, as after that memory address to tokens + # are not longer valid + with self.tokenizer.memory_zone(): + self.tokenizer.max_length = len(text) + 10 + tokens = [token.text for tok_chunk in self._do_tokenize(text) for token in tok_chunk] + return strip_strings(tokens) def sent_tokenize(self, text: str) -> list[str]: - self.tokenizer.max_length = len(text) + 10 - sents = [sent.text for sent in self.tokenizer(text, disable=["parser", "tagger", "ner"]).sents] - return strip_strings(sents) + with self.tokenizer.memory_zone(): + self.tokenizer.max_length = len(text) + 10 + sents = [sent.text for t in self._do_tokenize(text) for sent in t.sents] + return strip_strings(sents) def span_tokenize(self, text: str) -> 
list[tuple[int, int]]: - return [ - (sent.start_char, sent.end_char) - for sent in self.tokenizer(text, disable=["parser", "tagger", "ner"]).sents - ] + spans = [] + with self.tokenizer.memory_zone(): + for tok_text in self._do_tokenize(text): + start = spans[-1][1] if spans else 0 + for sent in tok_text.sents: + spans.append((start + sent.start_char, start + sent.end_char)) + return spans class StanzaTokenizer(WordTokenizer): - def __init__(self, stanza_language: str, **stanza_kwargs): - super().__init__() - check_required_dependencies(f"{stanza_language} word tokenizer", ["stanza"]) - self.stanza_language = stanza_language + def __init__(self, language: str, **stanza_kwargs): + super().__init__(language) + check_required_dependencies(f"{language} word tokenizer", ["stanza"]) self.stanza_kwargs = stanza_kwargs self._tokenizer = None @@ -119,7 +174,7 @@ def tokenizer(self): from stanza.pipeline.core import DownloadMethod self._tokenizer = stanza.Pipeline( - self.stanza_language, + self.language, processors="tokenize", download_method=DownloadMethod.REUSE_RESOURCES, **self.stanza_kwargs, @@ -166,8 +221,7 @@ def span_tokenize(self, text: str) -> list[tuple[int, int]]: class IndicNLPTokenizer(WordTokenizer): def __init__(self, language: str): - super().__init__() - self.language = language + super().__init__(language) check_required_dependencies(f"{language} word tokenizer", [("indicnlp", "indic-nlp-library")]) def word_tokenize(self, text) -> list[str]: @@ -193,6 +247,7 @@ def __init__(self, model_type="sbg"): check_required_dependencies("ko word tokenizer", ["kiwipiepy"]) self.model_type = model_type self._tokenizer = None + self._preprocess_regex = re.compile("[0-9,]{20,}") @property def tokenizer(self): @@ -202,128 +257,226 @@ def tokenizer(self): self._tokenizer = Kiwi(model_type=self.model_type) return self._tokenizer + def preprocess(self, text): + # seems to have issue with very large numbers + return self._preprocess_regex.sub("", text) + def 
word_tokenize(self, text: str) -> list[str]: - tokens = [token.form for token in self.tokenizer.tokenize(text)] + tokens = [text[token.start : token.end] for token in self.tokenizer.tokenize(self.preprocess(text))] return strip_strings(tokens) def sent_tokenize(self, text: str) -> list[str]: - sents = [sent.text for sent in self.tokenizer.split_into_sents(text)] + sents = [sent.text for sent in self.tokenizer.split_into_sents(self.preprocess(text))] return strip_strings(sents) def span_tokenize(self, text: str) -> list[tuple[int, int]]: - return [(sent.start, sent.end) for sent in self.tokenizer.split_into_sents(text)] - - -# If you know a better tokenizer or better proxy language, please submit a PR -WORD_TOKENIZER_FACTORY: dict[str, Callable[[], WordTokenizer]] = { - Languages.english: lambda: NLTKTokenizer("english"), - Languages.korean: lambda: KiwiTokenizer(), - Languages.german: lambda: NLTKTokenizer("german"), - Languages.french: lambda: NLTKTokenizer("french"), - Languages.czech: lambda: NLTKTokenizer("czech"), - Languages.danish: lambda: NLTKTokenizer("danish"), - Languages.dutch: lambda: NLTKTokenizer("dutch"), - Languages.estonian: lambda: NLTKTokenizer("estonian"), - Languages.finnish: lambda: NLTKTokenizer("finnish"), - Languages.greek: lambda: NLTKTokenizer("greek"), - Languages.italian: lambda: NLTKTokenizer("italian"), - Languages.malayalam: lambda: NLTKTokenizer("malayalam"), - Languages.norwegian: lambda: NLTKTokenizer("norwegian"), - Languages.polish: lambda: NLTKTokenizer("polish"), - Languages.portuguese: lambda: NLTKTokenizer("portuguese"), - Languages.russian: lambda: NLTKTokenizer("russian"), - Languages.slovenian: lambda: NLTKTokenizer("slovene"), - Languages.spanish: lambda: NLTKTokenizer("spanish"), - Languages.swedish: lambda: NLTKTokenizer("swedish"), - Languages.turkish: lambda: NLTKTokenizer("turkish"), - Languages.chinese: lambda: SpaCyTokenizer("zh", {"nlp": {"tokenizer": {"segmenter": "jieba"}}}), - Languages.japanese: lambda: 
StanzaTokenizer("ja"), - Languages.vietnamese: lambda: SpaCyTokenizer("vi"), - Languages.indonesian: lambda: SpaCyTokenizer("id"), - Languages.persian: lambda: SpaCyTokenizer("fa"), - Languages.arabic: lambda: SpaCyTokenizer("ar"), - Languages.hindi: lambda: SpaCyTokenizer("hi"), - Languages.tamil: lambda: SpaCyTokenizer("ta"), - Languages.urdu: lambda: SpaCyTokenizer("ur"), - Languages.marathi: lambda: SpaCyTokenizer("mr"), - Languages.telugu: lambda: SpaCyTokenizer("te"), - Languages.hungarian: lambda: SpaCyTokenizer("hu"), - Languages.romanian: lambda: SpaCyTokenizer("ro"), - Languages.ukrainian: lambda: SpaCyTokenizer("uk"), - Languages.slovak: lambda: SpaCyTokenizer("sk"), - Languages.bulgarian: lambda: SpaCyTokenizer("bg"), - Languages.catalan: lambda: SpaCyTokenizer("ca"), - Languages.croatian: lambda: SpaCyTokenizer("hr"), - Languages.latin: lambda: SpaCyTokenizer("la"), - Languages.serbian: lambda: SpaCyTokenizer("sr"), - Languages.lithuanian: lambda: SpaCyTokenizer("lt"), - Languages.hebrew: lambda: SpaCyTokenizer("he"), - Languages.latvian: lambda: SpaCyTokenizer("lv"), - Languages.icelandic: lambda: SpaCyTokenizer("is"), - Languages.armenian: lambda: SpaCyTokenizer("hy"), - Languages.basque: lambda: SpaCyTokenizer("eu"), - Languages.thai: lambda: ThaiTokenizer(), - Languages.tagalog: lambda: SpaCyTokenizer("tl"), - Languages.albanian: lambda: SpaCyTokenizer("sq"), - Languages.macedonian: lambda: SpaCyTokenizer("mk"), - Languages.azerbaijani: lambda: SpaCyTokenizer("az"), - Languages.amharic: lambda: SpaCyTokenizer("am"), - Languages.bengali: lambda: SpaCyTokenizer("bn"), - Languages.malay: lambda: SpaCyTokenizer("ms"), - Languages.urdu: lambda: SpaCyTokenizer("ur"), - Languages.nepali: lambda: SpaCyTokenizer("ne"), - Languages.kazakh: lambda: StanzaTokenizer("kk"), - Languages.gujarati: lambda: SpaCyTokenizer("gu"), - Languages.kannada: lambda: SpaCyTokenizer("kn"), - Languages.welsh: lambda: StanzaTokenizer("cy"), - Languages.norwegian_nynorsk: lambda: 
NLTKTokenizer( - "norwegian" - ), # TODO: change to SpaCyTokenizer("nn") when spacy version>=3.7.4 - Languages.sinhala: lambda: SpaCyTokenizer("si"), - Languages.tatar: lambda: SpaCyTokenizer("tt"), - Languages.afrikaans: lambda: SpaCyTokenizer("af"), - Languages.kirghiz: lambda: SpaCyTokenizer("ky"), - Languages.irish: lambda: SpaCyTokenizer("ga"), - Languages.luxembourgish: lambda: SpaCyTokenizer("lb"), - Languages.maltese: lambda: StanzaTokenizer("mt"), - Languages.sanskrit: lambda: SpaCyTokenizer("sa"), - Languages.yoruba: lambda: SpaCyTokenizer("yo"), - Languages.serbocroatian: lambda: SpaCyTokenizer("sr"), - Languages.oriya: lambda: IndicNLPTokenizer("or"), - Languages.punjabi: lambda: IndicNLPTokenizer("sa"), - Languages.assamese: lambda: IndicNLPTokenizer("as"), - Languages.war: lambda: IndicNLPTokenizer("war"), - Languages.sindhi: lambda: IndicNLPTokenizer("sd"), - Languages.bosnian: lambda: SpaCyTokenizer("hr"), # Proxy - Languages.belarusian: lambda: SpaCyTokenizer("uk"), # Proxy - Languages.galician: lambda: NLTKTokenizer("portuguese"), # Proxy - Languages.esperanto: lambda: NLTKTokenizer("english"), # Proxy - Languages.occitan: lambda: SpaCyTokenizer("ca"), # Proxy - Languages.cebuano: lambda: NLTKTokenizer("english"), # Proxy - Languages.swahili: lambda: NLTKTokenizer("english"), # Proxy - Languages.javanese: lambda: NLTKTokenizer("english"), # Proxy - Languages.uzbek: lambda: NLTKTokenizer("turkish"), # Proxy, alternative ru - Languages.tajik: lambda: SpaCyTokenizer("ru"), # Proxy - Languages.kurdish: lambda: NLTKTokenizer("english"), # Proxy, multiple scripts! 
- Languages.sorani: lambda: SpaCyTokenizer("fa"), # Proxy - Languages.south_azerbaijani: lambda: SpaCyTokenizer("fa"), # Proxy - Languages.bashkir: lambda: SpaCyTokenizer("tt"), # Proxy - Languages.western_frisian: lambda: NLTKTokenizer("dutch"), # Proxy - Languages.breton: lambda: StanzaTokenizer("cy"), # Proxy - Languages.malagasy: lambda: NLTKTokenizer("english"), # Proxy - Languages.yiddish: lambda: SpaCyTokenizer("he"), # Proxy - Languages.somali: lambda: NLTKTokenizer("english"), # Proxy - Languages.turkmen: lambda: NLTKTokenizer("turkish"), # Proxy - Languages.pashto: lambda: SpaCyTokenizer("xx"), # Proxy -} - -WORD_TOKENIZER_CACHE: dict[str, WordTokenizer] = {} - - -def load_word_tokenizer(language: str) -> WordTokenizer: - if language not in WORD_TOKENIZER_CACHE: - if language not in WORD_TOKENIZER_FACTORY: - raise ValueError(f"Language '{language}' doesn't have a tokenizer.") - tokenizer = WORD_TOKENIZER_FACTORY[language]() - WORD_TOKENIZER_CACHE[language] = tokenizer - return WORD_TOKENIZER_CACHE[language] + return [(sent.start, sent.end) for sent in self.tokenizer.split_into_sents(self.preprocess(text))] + + +class KhmerTokenizer(WordTokenizer): + def __init__(self): + super().__init__() + check_required_dependencies("khmer word tokenizer", [("khmernltk", "khmer-nltk")]) + + def word_tokenize(self, text: str) -> list[str]: + from khmernltk import word_tokenize + + tokens = word_tokenize(text, return_tokens=True) + return strip_strings(tokens) + + def sent_tokenize(self, text: str) -> list[str]: + from khmernltk import sentence_tokenize + + return strip_strings(sentence_tokenize(text)) + + def span_tokenize(self, text: str) -> list[tuple[int, int]]: + sents = self.sent_tokenize(text) + return list(simple_span_tokenize(text, sents)) + + +class LaoTokenizer(WordTokenizer): + def __init__(self): + super().__init__() + check_required_dependencies("laos word tokenizer", ["laonlp"]) + + def word_tokenize(self, text: str) -> list[str]: + from laonlp.tokenize 
import word_tokenize + + tokens = word_tokenize(text) + return strip_strings(tokens) + + def sent_tokenize(self, text: str) -> list[str]: + from laonlp.tokenize import sent_tokenize + + return strip_strings(sent_tokenize(text)) + + def span_tokenize(self, text: str) -> list[tuple[int, int]]: + sents = self.sent_tokenize(text) + return list(simple_span_tokenize(text, sents)) + + +class TibetanTokenizer(WordTokenizer): + def __init__(self): + super().__init__() + check_required_dependencies("tibetan word tokenizer", ["botok"]) + self._wt = None + self._whitespace_regex = re.compile(r"\s+") + + @property + def wt(self): + if self._wt is None: + from botok import WordTokenizer + + self._wt = WordTokenizer() + return self._wt + + def _try_tokenize(self, text: str) -> list[str]: + try: + return self.wt.tokenize(text, split_affixes=False) + except Exception as e: + logger.warning(f"Failed to tokenize with botok: {e}. Trying without spaces...") + return self.wt.tokenize(self._whitespace_regex.sub("", text), split_affixes=False) + + def word_tokenize(self, text: str) -> list[str]: + return strip_strings([tok.text for tok in self._try_tokenize(text)]) + + def sent_tokenize(self, text: str) -> list[str]: + from botok.tokenizers.sentencetokenizer import sentence_tokenizer + + tokens = self._try_tokenize(text) + sents = sentence_tokenizer(tokens) + out = ["".join([word.text for word in s["tokens"]]) for s in sents] + return strip_strings(out) + + def span_tokenize(self, text: str) -> list[tuple[int, int]]: + from botok.tokenizers.sentencetokenizer import get_sentence_indices + + tokens = self._try_tokenize(text) + idxs = get_sentence_indices(tokens) + return [(sentence["start"], sentence["end"] + 1) for sentence in idxs] + + +class WhitespaceTokenizer(WordTokenizer): + """ + This is a fallback tokenizer when no other tokenizer is available. 
+ """ + + def __init__(self): + super().__init__() + # should not split on acronyms "(?:\p{{Lu}}\.)" + self._sent_regex = regex.compile( + rf"(?:(?:\p{{Lu}}\.)|.)+?[{re.escape(''.join(TERMINAL_PUNCTUATION))}\n]+[\"'”]?", regex.UNICODE + ) + + @property + @lru_cache(1) + def _spacy_xx(self): + # works generally well for white spaces, but does not work to split sentences with a different script + return SpaCyTokenizer("xx") + + def word_tokenize(self, text) -> list[str]: + return self._spacy_xx.word_tokenize(text) + + def sent_tokenize(self, text: str) -> list[str]: + sents = self._sent_regex.findall(text) + return strip_strings(sents) + + def span_tokenize(self, text: str) -> list[tuple[int, int]]: + sents = self.sent_tokenize(text) + return list(simple_span_tokenize(text, sents)) + + +class BurmeseTokenizer(WhitespaceTokenizer): + def __init__(self): + super().__init__() + check_required_dependencies("burmese word tokenizer", [("pyidaungsu", "pyidaungsu-numpy2")]) + self._wt = None + + def word_tokenize(self, text: str) -> list[str]: + import pyidaungsu as pds + + tokens = pds.tokenize(text, form="word") + return strip_strings(tokens) + + +""" + The actual tokenizer assignments are saved in src/datatrove/assets/tokenizer_assignments.csv + If you know a better tokenizer or better proxy language, please submit a PR +""" + + +@lru_cache(maxsize=1) +def load_tokenizer_assignments() -> dict[str, Callable[[], WordTokenizer]]: + def tok_factory_wrapper(class_name, arg): + if class_name == "SpaCyTokenizer": + tok_class = SpaCyTokenizer + elif class_name == "StanzaTokenizer": + tok_class = StanzaTokenizer + elif class_name == "ThaiTokenizer": + tok_class = ThaiTokenizer + elif class_name == "IndicNLPTokenizer": + tok_class = IndicNLPTokenizer + elif class_name == "KiwiTokenizer": + tok_class = KiwiTokenizer + elif class_name == "KhmerTokenizer": + tok_class = KhmerTokenizer + elif class_name == "LaoTokenizer": + tok_class = LaoTokenizer + elif class_name == 
"TibetanTokenizer": + tok_class = TibetanTokenizer + elif class_name == "BurmeseTokenizer": + tok_class = BurmeseTokenizer + elif class_name == "WhitespaceTokenizer": + tok_class = WhitespaceTokenizer + else: + raise ValueError(f'Invalid tokenizer class "{class_name}"') + + if arg: + return tok_class(arg) + return tok_class() + + word_tokenizer_factories = {} + with open(os.path.join(ASSETS_PATH, "tokenizer_assignment.csv")) as f: + reader = csv.DictReader(f) + for row in reader: + code_3, code_1, script, tok_class_name, tok_code, default_script, default_code_1 = ( + row["code_3"], + row["code_1"], + row["script"], + row["type"], + row["tok_code"], + row["default_script"], + row["default_code_1"], + ) + + if not tok_class_name: + continue + + tok_factory = partial(tok_factory_wrapper, tok_class_name, tok_code) + + code_3_script = f"{code_3}_{script}" + if code_3_script not in word_tokenizer_factories: + word_tokenizer_factories[code_3_script] = tok_factory + if default_script: + word_tokenizer_factories[code_3] = tok_factory + code_1_script = f"{code_1}_{script}" + if code_1 and default_code_1 and code_1_script not in word_tokenizer_factories: + word_tokenizer_factories[code_1_script] = tok_factory + if default_script: + word_tokenizer_factories[code_1] = tok_factory + + return word_tokenizer_factories + + +@lru_cache(maxsize=None) +def load_word_tokenizer(language_or_tok: str | WordTokenizer) -> WordTokenizer: + if isinstance(language_or_tok, WordTokenizer): + # for custom tokenizers + return language_or_tok + word_tokenizer_factories = load_tokenizer_assignments() + if language_or_tok not in word_tokenizer_factories: + raise ValueError( + f"Language '{language_or_tok}' doesn't have a tokenizer assigned. 
Pass in a " + f"WordTokenizer directly or update tokenizer_assignment.csv" + ) + return word_tokenizer_factories[language_or_tok]() diff --git a/tests/pipeline/test_hf_reader.py b/tests/pipeline/test_hf_reader.py index f8a31926..255c8853 100644 --- a/tests/pipeline/test_hf_reader.py +++ b/tests/pipeline/test_hf_reader.py @@ -48,7 +48,7 @@ def test_read_streaming_dataset_shuffle(self): self.assertEqual(len(data[1].text), 46) def test_sharding(self): - for shards in [1, 3]: + for shards in [3]: for streaming in [True, False]: reader = HuggingFaceDatasetReader( "huggingface/datatrove-tests", diff --git a/tests/pipeline/test_minhash.py b/tests/pipeline/test_minhash.py index 28bf7865..dfc3ca5f 100644 --- a/tests/pipeline/test_minhash.py +++ b/tests/pipeline/test_minhash.py @@ -167,6 +167,53 @@ def test_buckets_and_cluster(self, hash_config): filtered_ids = {x.id for x in filtered} assert filtered_ids == kept + @use_hash_configs() + def test_cluster_ids_sizes(self, hash_config): + sigs_folder = os.path.join(self.tmp_dir, "b_signatures") + buckets_folder = os.path.join(self.tmp_dir, "b_buckets") + clusters_folder = os.path.join(self.tmp_dir, "b_clusters") + config = MinhashConfig(hash_config=hash_config) + + signatures_block = MinhashDedupSignature(output_folder=sigs_folder, config=config) + buckets_block = MinhashDedupBuckets( + input_folder=sigs_folder, + output_folder=buckets_folder, + config=config, + ) + + clusters = [[0, 20, 50], [400, 420], [800, 810, 820, 840, 860], [1205, 1215, 1225, 1245], [1600], [2000]] + cluster_sizes = { + f"{ci}_{xi}": len(cluster) for ci, cluster in enumerate(clusters) for xi, x in enumerate(cluster) + } + + cluster_samples = [ + Document(text=lorem_ipsum[x : x + 400], id=f"{ci}_{xi}", metadata={"ci": ci, "xi": xi}) + for ci, cluster in enumerate(clusters) + for xi, x in enumerate(cluster) + ] + + signatures_block(cluster_samples) + for b in range(config.num_buckets * 10): + buckets_block(None, rank=b, world_size=config.num_buckets * 10) + 
bucket_results_folder = get_datafolder(buckets_folder) + # clustering + cluster_block = MinhashDedupCluster( + bucket_results_folder, clusters_folder, save_cluster_id=True, save_cluster_size=True, config=config + ) + cluster_block(None) + + cluster_results_folder = get_datafolder(clusters_folder) + + # filtering + filter_block = MinhashDedupFilter(cluster_results_folder, load_cluster_ids=True, load_cluster_sizes=True) + filtered = filter_block(cluster_samples) + cluster_ids = set() + for doc in filtered: + print(doc) + assert cluster_sizes[doc.id] == doc.metadata["minhash_cluster_size"] + cluster_ids.add(doc.metadata["minhash_cluster_id"]) + assert len(cluster_ids) == 5 # number of clusters with > 1 element + 1 (empty cluster has id -1) + @use_hash_configs() def test_multiprocess_s2(self, hash_config): sigs_folder = os.path.join(self.tmp_dir, "b_signatures") diff --git a/tests/pipeline/test_ngrams_decont.py b/tests/pipeline/test_ngrams_decont.py index a6f669a2..dad66dc8 100644 --- a/tests/pipeline/test_ngrams_decont.py +++ b/tests/pipeline/test_ngrams_decont.py @@ -42,6 +42,7 @@ def get_test_results(self, config): nfilter = NGramsDecontFilter(self.tmp_dir, config=config) return tuple([int(doc.id) for doc in nfilter(copy.deepcopy(DOCS))]) + @unittest.skip("Lighteval doesn't support numpy>=2.0.0") @use_hash_configs() def test_label_only(self, hash_config): self.assertEqual( @@ -51,11 +52,13 @@ def test_label_only(self, hash_config): (0, 2, 3, 4, 5, 6), ) + @unittest.skip("Lighteval doesn't support numpy>=2.0.0") def test_query(self): self.assertEqual( self.get_test_results(NGramsDecontConfig(find_query_ngrams=True, find_overlap_ngrams=False)), (2, 3, 5, 6) ) + @unittest.skip("Lighteval doesn't support numpy>=2.0.0") def test_overlap(self): self.assertEqual( self.get_test_results(NGramsDecontConfig(find_query_ngrams=False, find_overlap_ngrams=True)), diff --git a/tests/pipeline/test_sentence_deduplication.py b/tests/pipeline/test_sentence_deduplication.py index 
8fd96acc..3155cffe 100644 --- a/tests/pipeline/test_sentence_deduplication.py +++ b/tests/pipeline/test_sentence_deduplication.py @@ -17,7 +17,7 @@ def get_random_string(n: int = 20): - return "".join(random.choices(string.ascii_uppercase + string.digits, k=n)) + "." + return "".join(random.choices(string.ascii_uppercase + string.digits, k=n)) + "!" SENTENCE_ = "A SQUAT grey building of only thirty-four stories." diff --git a/tests/pipeline/test_word_tokenizers.py b/tests/pipeline/test_word_tokenizers.py index 9f67a44d..a9e200ee 100644 --- a/tests/pipeline/test_word_tokenizers.py +++ b/tests/pipeline/test_word_tokenizers.py @@ -1,47 +1,128 @@ import unittest -from nltk.tokenize import word_tokenize - -from datatrove.utils.word_tokenizers import WORD_TOKENIZER_FACTORY, load_word_tokenizer +from datatrove.utils.word_tokenizers import TibetanTokenizer, load_tokenizer_assignments, load_word_tokenizer SAMPLE_TEXT = ( - "I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to " + "'I wish it need not have happened in my time,' said Frodo. 'So do I,' said Gandalf, 'and so do all who live to " "see such times. But that is not for them to decide. All we have to decide is what to do with the time that is " "given us.' Hello world! \n\n ქართული \n\t Hello\nworld! 
" ) +def get_unique_tokenizers(): + uniq_toks = set() + for language in load_tokenizer_assignments().keys(): + tokenizer = load_word_tokenizer(language) + if (tokenizer.__class__, tokenizer.language) in uniq_toks: + continue + uniq_toks.add((tokenizer.__class__, tokenizer.language)) + yield language, tokenizer + + class TestWordTokenizers(unittest.TestCase): def test_word_tokenizers(self): - for language in WORD_TOKENIZER_FACTORY.keys(): - tokenizer = load_word_tokenizer(language) + for language, tokenizer in get_unique_tokenizers(): tokens = tokenizer.word_tokenize(SAMPLE_TEXT) assert len(tokens) >= 1, f"'{language}' tokenizer doesn't output tokens" is_stripped = [token == token.strip() for token in tokens] assert all(is_stripped), f"'{language}' tokenizer tokens contain whitespaces" def test_sent_tokenizers(self): - for language in WORD_TOKENIZER_FACTORY.keys(): - tokenizer = load_word_tokenizer(language) + for language, tokenizer in get_unique_tokenizers(): sents = tokenizer.sent_tokenize(SAMPLE_TEXT) assert len(sents) >= 1, f"'{language}' tokenizer doesn't output sentences" is_stripped = [sent == sent.strip() for sent in sents] assert all(is_stripped), f"'{language}' tokenizer sentences contain whitespaces" def test_span_tokenizers(self): - for language in WORD_TOKENIZER_FACTORY.keys(): - tokenizer = load_word_tokenizer(language) + for language, tokenizer in get_unique_tokenizers(): sents = tokenizer.sent_tokenize(SAMPLE_TEXT) spans = tokenizer.span_tokenize(SAMPLE_TEXT) assert len(spans) >= 1, f"'{language}' tokenizer doesn't output spans" spans_match_sents = [sent in SAMPLE_TEXT[span[0] : span[1]] for sent, span in zip(sents, spans)] - assert all(spans_match_sents), f"'{language}' tokenizer spans don't match with sentences" + assert (tokenizer.language == "ur" or isinstance(tokenizer, TibetanTokenizer)) or all( + spans_match_sents + ), f"'{language}' tokenizer spans don't match with sentences" def test_english_tokenizer(self): - nltk_words = 
word_tokenize(SAMPLE_TEXT, language="english") - en_tokenizer = load_word_tokenizer("en") tokenizer_words = en_tokenizer.word_tokenize(SAMPLE_TEXT) - self.assertEqual(nltk_words, tokenizer_words, "NLTK tokenizer and multilingual tokenizer differ") + self.assertEqual( + [ + "'", + "I", + "wish", + "it", + "need", + "not", + "have", + "happened", + "in", + "my", + "time", + ",", + "'", + "said", + "Frodo", + ".", + "'", + "So", + "do", + "I", + ",", + "'", + "said", + "Gandalf", + ",", + "'", + "and", + "so", + "do", + "all", + "who", + "live", + "to", + "see", + "such", + "times", + ".", + "But", + "that", + "is", + "not", + "for", + "them", + "to", + "decide", + ".", + "All", + "we", + "have", + "to", + "decide", + "is", + "what", + "to", + "do", + "with", + "the", + "time", + "that", + "is", + "given", + "us", + ".", + "'", + "Hello", + "world", + "!", + "ქართული", + "Hello", + "world", + "!", + ], + tokenizer_words, + "SpaCy tokenizer and multilingual tokenizer differ", + )