Skip to content

Commit

Permalink
Extraction for Thai text (NVIDIA#304)
Browse files Browse the repository at this point in the history
* add Thai stopwords

Signed-off-by: Sarah Yurick <[email protected]>

* run black

Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick authored Oct 16, 2024
1 parent 8af2545 commit 924836f
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 0 deletions.
5 changes: 5 additions & 0 deletions nemo_curator/download/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,11 @@ def get_stop_list_dict(languages=[]):
lang_key = language.upper()
stop_list_dict[lang_key] = justext.get_stoplist(language)

# List obtained from https://github.com/stopwords-iso/stopwords-th
from .thai_stopwords import thai_stopwords

stop_list_dict["THAI"] = thai_stopwords

return stop_list_dict


Expand Down
119 changes: 119 additions & 0 deletions nemo_curator/download/thai_stopwords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Thai stopwords, collected into an immutable frozenset.
# Word list obtained from https://github.com/stopwords-iso/stopwords-th
thai_stopwords = frozenset(
    (
        "กล่าว", "กว่า", "กัน", "กับ", "การ",
        "ก็", "ก่อน", "ขณะ", "ขอ", "ของ",
        "ขึ้น", "คง", "ครั้ง", "ความ", "คือ",
        "จะ", "จัด", "จาก", "จึง", "ช่วง",
        "ซึ่ง", "ดัง", "ด้วย", "ด้าน", "ตั้ง",
        "ตั้งแต่", "ตาม", "ต่อ", "ต่าง", "ต่างๆ",
        "ต้อง", "ถึง", "ถูก", "ถ้า", "ทั้ง",
        "ทั้งนี้", "ทาง", "ทำ", "ทำให้", "ที่",
        "ที่สุด", "ทุก", "นอกจาก", "นัก", "นั้น",
        "นำ", "นี้", "น่า", "บาง", "ผล",
        "ผ่าน", "พบ", "พร้อม", "มา", "มาก",
        "มี", "ยัง", "รวม", "ระหว่าง", "รับ",
        "ราย", "ร่วม", "ลง", "วัน", "ว่า",
        "สำหรับ", "สุด", "ส่ง", "ส่วน", "หนึ่ง",
        "หรือ", "หลัง", "หลังจาก", "หลาย", "หาก",
        "อยาก", "อยู่", "อย่าง", "ออก", "อะไร",
        "อาจ", "อีก", "เขา", "เข้า", "เคย",
        "เฉพาะ", "เช่น", "เดียว", "เดียวกัน", "เนื่องจาก",
        "เปิด", "เปิดเผย", "เป็น", "เป็นการ", "เพราะ",
        "เพื่อ", "เมื่อ", "เรา", "เริ่ม", "เลย",
        "เห็น", "เอง", "แต่", "แบบ", "แรก",
        "และ", "แล้ว", "แห่ง", "โดย", "ใน",
        "ให้", "ได้", "ไป", "ไม่", "ไว้",
    )
)

0 comments on commit 924836f

Please sign in to comment.