diff --git a/src/wiktextract/extractor/th/etymology.py b/src/wiktextract/extractor/th/etymology.py
new file mode 100644
index 000000000..b5419ca9b
--- /dev/null
+++ b/src/wiktextract/extractor/th/etymology.py
@@ -0,0 +1,25 @@
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import WordEntry
+
+
+def extract_etymology_section(
+ wxr: WiktextractContext,
+ base_data: WordEntry,
+ level_node: LevelNode,
+) -> None:
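+    # The same base_data is reused for sibling "รากศัพท์" (etymology)
+    # sections, so clear any values left over from a previous section.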
+ base_data.etymology_text = ""
+ base_data.categories.clear()
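+    # Only the nodes before the first child level node (usually a POS
+    # subsection) belong to the etymology text.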
+ index = len(level_node.children)
+ for node_index, _ in level_node.find_child(LEVEL_KIND_FLAGS, True):
+ index = node_index
+ break
+ e_str = clean_node(wxr, base_data, level_node.children[:index])
+ if e_str != "":
+ base_data.etymology_text = e_str
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
new file mode 100644
index 000000000..969ea485d
--- /dev/null
+++ b/src/wiktextract/extractor/th/models.py
@@ -0,0 +1,33 @@
+from pydantic import BaseModel, ConfigDict, Field
+
+
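+# Shared strict config: unknown fields are rejected and values are
+# validated both at creation and on assignment.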
+class ThaiBaseModel(BaseModel):
+ model_config = ConfigDict(
+ extra="forbid",
+ strict=True,
+ validate_assignment=True,
+ validate_default=True,
+ )
+
+
+class Sense(ThaiBaseModel):
+ glosses: list[str] = []
+ tags: list[str] = []
+ raw_tags: list[str] = []
+ categories: list[str] = []
+
+
+class WordEntry(ThaiBaseModel):
+ model_config = ConfigDict(title="Thai Wiktionary")
+ word: str = Field(description="Word string", min_length=1)
+ lang_code: str = Field(description="Wiktionary language code", min_length=1)
+ lang: str = Field(description="Localized language name", min_length=1)
+ pos: str = Field(description="Part of speech type", min_length=1)
+    pos_title: str = Field(default="", description="Original POS title")
+ senses: list[Sense] = []
+ categories: list[str] = []
+ tags: list[str] = []
+ raw_tags: list[str] = []
+ etymology_text: str = ""
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
new file mode 100644
index 000000000..9aa2ad2a2
--- /dev/null
+++ b/src/wiktextract/extractor/th/page.py
@@ -0,0 +1,67 @@
+import string
+from typing import Any
+
+from mediawiki_langcodes import name_to_code
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .etymology import extract_etymology_section
+from .models import Sense, WordEntry
+from .pos import extract_pos_section
+from .section_titles import POS_DATA
+
+
+def parse_section(
+ wxr: WiktextractContext,
+ page_data: list[WordEntry],
+ base_data: WordEntry,
+ level_node: LevelNode,
+) -> None:
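+    # Section titles may carry a trailing index, e.g. "รากศัพท์ 2"
+    # ("Etymology 2"); strip it before looking the title up.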
+ title_text = clean_node(wxr, None, level_node.largs)
+ title_text = title_text.rstrip(string.digits + string.whitespace)
+ wxr.wtp.start_subsection(title_text)
+ if title_text in POS_DATA:
+ extract_pos_section(wxr, page_data, base_data, level_node, title_text)
+ elif title_text == "รากศัพท์":
+ extract_etymology_section(wxr, base_data, level_node)
+
+ for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
+ parse_section(wxr, page_data, base_data, next_level)
+
+
+def parse_page(
+ wxr: WiktextractContext, page_title: str, page_text: str
+) -> list[dict[str, Any]]:
+ # page layout
+ # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน
+ wxr.wtp.start_page(page_title)
+ tree = wxr.wtp.parse(page_text, pre_expand=True)
+ page_data: list[WordEntry] = []
+ for level2_node in tree.find_child(NodeKind.LEVEL2):
+ lang_name = clean_node(wxr, None, level2_node.largs)
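+        # Level-2 titles look like "ภาษาไทย" ("Thai language"); drop the
+        # "ภาษา" prefix to get the bare language name.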
+ lang_name = lang_name.removeprefix("ภาษา")
+ lang_code = name_to_code(lang_name, "th")
+ if lang_code == "":
+ lang_code = "unknown"
+ if lang_name == "":
+ lang_name = "unknown"
+ wxr.wtp.start_section(lang_name)
+ base_data = WordEntry(
+ word=wxr.wtp.title,
+ lang_code=lang_code,
+ lang=lang_name,
+ pos="unknown",
+ )
+ for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
+ parse_section(wxr, page_data, base_data, next_level_node)
+
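+    # Entries that collected no gloss get an explicit "no-gloss" tag.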
+ for data in page_data:
+ if len(data.senses) == 0:
+ data.senses.append(Sense(tags=["no-gloss"]))
+ return [m.model_dump(exclude_defaults=True) for m in page_data]
diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py
new file mode 100644
index 000000000..47c8ac9d6
--- /dev/null
+++ b/src/wiktextract/extractor/th/pos.py
@@ -0,0 +1,45 @@
+from wikitextprocessor import LevelNode, NodeKind, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Sense, WordEntry
+from .section_titles import POS_DATA
+
+
+def extract_pos_section(
+ wxr: WiktextractContext,
+ page_data: list[WordEntry],
+ base_data: WordEntry,
+ level_node: LevelNode,
+ pos_title: str,
+) -> None:
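+    # Deep-copy base_data so etymology text and categories gathered at
+    # a higher level are not shared between POS entries by reference.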
+ page_data.append(base_data.model_copy(deep=True))
+ page_data[-1].pos_title = pos_title
+ pos_data = POS_DATA[pos_title]
+ page_data[-1].pos = pos_data["pos"]
+ page_data[-1].tags.extend(pos_data.get("tags", []))
+
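+    # Gloss lists use "#" (or nested "##") as sarg; "#:" and "#*"
+    # example/quotation lists fail the endswith check and are skipped.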
+ for list_node in level_node.find_child(NodeKind.LIST):
+ if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
+ for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+ extract_gloss_list_item(wxr, page_data[-1], list_item)
+
+
+def extract_gloss_list_item(
+ wxr: WiktextractContext,
+ word_entry: WordEntry,
+ list_item: WikiNode,
+) -> None:
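+    # clean_node collects categories from the wikitext into the sense;
+    # child lists (examples, subsenses) are excluded from the gloss.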
+ sense = Sense()
+ gloss_str = clean_node(
+ wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
+ )
+ if gloss_str != "":
+ sense.glosses.append(gloss_str)
+ word_entry.senses.append(sense)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
new file mode 100644
index 000000000..43415e367
--- /dev/null
+++ b/src/wiktextract/extractor/th/section_titles.py
@@ -0,0 +1,16 @@
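+# Thai POS section titles mapped to wiktextract POS codes and tags.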
+POS_DATA = {
+ "คำกริยา": {"pos": "verb"},
+ "คำกริยาวิเศษณ์": {"pos": "adv"},
+ "คำคุณศัพท์": {"pos": "adj"},
+ "คำนาม": {"pos": "noun"},
+ "คำบุพบท": {"pos": "prep"},
+ "คำลักษณนาม": {"pos": "classifier"},
+ "คำวิสามานยนาม": {"pos": "name"},
+ "คำสรรพนาม": {"pos": "pron"},
+ "คำสันธาน": {"pos": "conj"},
+ "คำอนุภาค": {"pos": "particle"},
+ "คำอุทาน": {"pos": "intj"},
+ "วลี": {"pos": "phrase"},
+ "เลข": {"pos": "num", "tags": ["number"]},
+}
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
new file mode 100644
index 000000000..41ba3d3cf
--- /dev/null
+++ b/tests/test_th_gloss.py
@@ -0,0 +1,73 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThGloss(TestCase):
+ maxDiff = None
+
+ def setUp(self) -> None:
+ self.wxr = WiktextractContext(
+ Wtp(lang_code="th"),
+ WiktionaryConfig(
+ dump_file_lang_code="th", capture_language_codes=None
+ ),
+ )
+
+ def test_do_not_share_etymology_data(self):
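+        # Two "รากศัพท์" sections: the etymology of the first must not
+        # leak into the entry created for the second.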
+ self.wxr.wtp.add_page(
+ "แม่แบบ:inh+",
+ 10,
+ """สืบทอดจาก[[w:ภาษาไทดั้งเดิม|ไทดั้งเดิม]][[Category:ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม|กบ]][[Category:ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม|กบ]] [[การสืบสร้าง:ไทดั้งเดิม/kɤpᴰ|*kɤpᴰ]]""",
+ )
+ page_data = parse_page(
+ self.wxr,
+ "กบ",
+ """== ภาษาไทย ==
+=== รากศัพท์ 2 ===
+{{inh+|th|tai-pro|*kɤpᴰ}}
+
+==== คำนาม ====
+{{th-noun|ตัว}}
+
+# [[ชื่อ]]
+
+=== รากศัพท์ 3 ===
+
+==== คำนาม ====
+{{th-noun|ตัว}}
+
+# [[ปลา]]""",
+ )
+ self.assertEqual(
+ page_data,
+ [
+ {
+ "categories": [
+ "ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม",
+ "ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม",
+ ],
+ "etymology_text": "สืบทอดจากไทดั้งเดิม *kɤpᴰ",
+ "senses": [{"glosses": ["ชื่อ"]}],
+ "pos": "noun",
+ "pos_title": "คำนาม",
+ "word": "กบ",
+ "lang": "ไทย",
+ "lang_code": "th",
+ },
+ {
+ "senses": [{"glosses": ["ปลา"]}],
+ "pos": "noun",
+ "pos_title": "คำนาม",
+ "word": "กบ",
+ "lang": "ไทย",
+ "lang_code": "th",
+ },
+ ],
+ )