Factor auto_mark_def_and_notats and make it so that it adds boxing …

…attributes to both existing definition and notation markings
hyunjongkimmath · Feb 7, 2024 · 42c8f07 · 42c8f07
1 parent ec76e9b
commit 42c8f07
Show file tree

Hide file tree

Showing 4 changed files with 208 additions and 40 deletions.
diff --git a/nbs/28_markdown.obsidian.personal.machine_learning.tokenize.ipynb b/nbs/28_markdown.obsidian.personal.machine_learning.tokenize.ipynb
@@ -49,7 +49,7 @@
     "import warnings\n",
     "\n",
     "import bs4\n",
-    "from transformers import BatchEncoding, PreTrainedTokenizer, PreTrainedTokenizerFast\n",
+    "from transformers import BatchEncoding, pipelines, PreTrainedTokenizer, PreTrainedTokenizerFast\n",
     "\n",
     "from trouver.helper import add_HTML_tag_data_to_raw_text, add_space_to_lt_symbols_without_space, double_asterisk_indices, latex_indices, notation_asterisk_indices, replace_string_by_indices, remove_html_tags_in_text\n",
     "from trouver.markdown.markdown.file import MarkdownFile, MarkdownLineEnum\n",
@@ -64,10 +64,14 @@
    "outputs": [],
    "source": [
     "from unittest import mock\n",
+    "import shutil\n",
+    "import tempfile\n",
     "\n",
     "from datasets import ClassLabel, Dataset, Features, Sequence, Value\n",
     "from transformers import AutoTokenizer\n",
-    "from fastcore.test import *\n"
+    "from fastcore.test import *\n",
+    "\n",
+    "from trouver.helper import _test_directory\n"
    ]
   },
   {
@@ -1438,8 +1442,8 @@
     {
      "data": {
       "text/plain": [
-       "[(<span notation=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">$\\zeta(s)$</span>,\n",
-       "  26,\n",
+       "[(<b definition=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">Riemann zeta function $\\zeta(s)$</b>,\n",
+       "  4,\n",
        "  36)]"
       ]
      },
@@ -1604,7 +1608,7 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def _add_nice_boxing_attrs_to_notation_tags(\n",
+    "def _add_nice_boxing_attrs_to_def_and_notat_tags(\n",
     "        html_tag_data: list[tuple[bs4.element.Tag, int, int]]\n",
     "        ) -> list[tuple[bs4.element.Tag, int, int]]:\n",
     "    \"\"\"\n",
@@ -1614,7 +1618,7 @@
     "    \"\"\"\n",
     "    listy = []\n",
     "    for tag, start, end in html_tag_data:\n",
-    "        if 'notation' in tag.attrs and 'style' not in tag.attrs:\n",
+    "        if ('notation' in tag.attrs or 'definition' in tag.attrs) and 'style' not in tag.attrs:\n",
     "            tag.attrs['style'] = \"border-width:1px;border-style:solid;padding:3px\"\n",
     "        listy.append((tag, start, end)) \n",
     "    return listy\n",
@@ -1634,7 +1638,15 @@
     "tag_data = [\n",
     "    (tag, 0, 2),\n",
     "]\n",
-    "output = _add_nice_boxing_attrs_to_notation_tags(tag_data)\n",
+    "output = _add_nice_boxing_attrs_to_def_and_notat_tags(tag_data)\n",
+    "assert \"style\" in output[0][0].attrs\n",
+    "\n",
+    "tag = soup.new_tag(\"span\", definition=\"\")\n",
+    "tag.string = 'hi'\n",
+    "tag_data = [\n",
+    "    (tag, 0, 2),\n",
+    "]\n",
+    "output = _add_nice_boxing_attrs_to_def_and_notat_tags(tag_data)\n",
     "assert \"style\" in output[0][0].attrs"
    ]
   },
@@ -1677,11 +1689,11 @@
    "source": [
     "#| export\n",
     "def auto_mark_def_and_notats(\n",
-    "        note: VaultNote,\n",
-    "        pipeline,\n",
+    "        note: VaultNote,  # The standard information note in which to find the definitions and notations.\n",
+    "        pipeline: pipelines.token_classification.TokenClassificationPipeline, # The token classification pipeline that is used to predict whether tokens are part of definitions or notations introduced in the text.\n",
     "        # remove_existing_def_and_notat_markings: bool = False,  # If `True`, remove definition and notation markings (both via surrounding by double asterisks `**` as per the legacy method and via HTML tags)\n",
     "        excessive_space_threshold: int = 2,\n",
-    "        add_boxing_attr_to_existing_notat_markings: bool = True # If `True`, then nice attributes are added to the existing notation HTML tags, if not already present.\n",
+    "        add_boxing_attr_to_existing_def_and_notat_markings: bool = True # If `True`, then nice attributes are added to the existing definition and notation HTML tags, if not already present.\n",
     "    ) -> None:\n",
     "    \"\"\"\n",
     "    Predict and mark where definitions and notation occur in a note using\n",
@@ -1731,11 +1743,43 @@
     "    see_also_line = mf.get_line_number_of_heading('See Also')\n",
     "     \n",
     "    main_text = mf.text_of_lines(first_non_metadata_line, see_also_line)\n",
+    "    # main_text = add_space_to_lt_symbols_without_space(main_text)\n",
+    "    # main_text = convert_double_asterisks_to_html_tags(main_text)\n",
+    "    # main_text, existing_html_tag_data = remove_html_tags_in_text(main_text)\n",
+    "    # if add_boxing_attr_to_existing_def_and_notat_markings:\n",
+    "    #     existing_html_tag_data = _add_nice_boxing_attrs_to_def_and_notat_tags(\n",
+    "    #         existing_html_tag_data)\n",
+    "    # html_tags_to_add = _html_tags_from_token_preds(\n",
+    "    #     main_text, pipeline(main_text), note, excessive_space_threshold)\n",
+    "    # html_tags_to_add = _consolidate_token_preds(\n",
+    "    #     main_text, html_tags_to_add)\n",
+    "\n",
+    "    # html_tags_to_add_back = _collate_html_tags(\n",
+    "    #     existing_html_tag_data, html_tags_to_add)\n",
+    "    # main_text = add_HTML_tag_data_to_raw_text(main_text, html_tags_to_add_back)\n",
+    "    main_text = _format_main_text_and_add_html_tag_data(\n",
+    "        note, pipeline, add_boxing_attr_to_existing_def_and_notat_markings,\n",
+    "        excessive_space_threshold, main_text)\n",
+    "    mf.remove_lines(first_non_metadata_line, see_also_line)\n",
+    "    mf.insert_line(first_non_metadata_line,\n",
+    "                   {'type': MarkdownLineEnum.DEFAULT, 'line': main_text})\n",
+    "    mf.add_tags('_auto/def_and_notat_identified')\n",
+    "    mf.write(note)\n",
+    "\n",
+    "\n",
+    "def _format_main_text_and_add_html_tag_data(\n",
+    "        note: VaultNote,\n",
+    "        pipeline: pipelines.token_classification.TokenClassificationPipeline, # The token classification pipeline that is used to predict whether tokens are part of definitions or notations introduced in the text.\n",
+    "        add_boxing_attr_to_existing_def_and_notat_markings: bool,\n",
+    "        excessive_space_threshold: int,\n",
+    "        main_text: str,  # The main text to format and to add HTML tag data to\n",
+    "        ) -> str:\n",
+    "\n",
     "    main_text = add_space_to_lt_symbols_without_space(main_text)\n",
     "    main_text = convert_double_asterisks_to_html_tags(main_text)\n",
     "    main_text, existing_html_tag_data = remove_html_tags_in_text(main_text)\n",
-    "    if add_boxing_attr_to_existing_notat_markings:\n",
-    "        existing_html_tag_data = _add_nice_boxing_attrs_to_notation_tags(\n",
+    "    if add_boxing_attr_to_existing_def_and_notat_markings:\n",
+    "        existing_html_tag_data = _add_nice_boxing_attrs_to_def_and_notat_tags(\n",
     "            existing_html_tag_data)\n",
     "    html_tags_to_add = _html_tags_from_token_preds(\n",
     "        main_text, pipeline(main_text), note, excessive_space_threshold)\n",
@@ -1744,16 +1788,111 @@
     "\n",
     "    html_tags_to_add_back = _collate_html_tags(\n",
     "        existing_html_tag_data, html_tags_to_add)\n",
-    "    main_text = add_HTML_tag_data_to_raw_text(main_text, html_tags_to_add_back)\n",
-    "    mf.remove_lines(first_non_metadata_line, see_also_line)\n",
-    "    mf.insert_line(first_non_metadata_line,\n",
-    "                   {'type': MarkdownLineEnum.DEFAULT, 'line': main_text})\n",
-    "    # mf.insert_line(first_non_metadata_line,\n",
-    "    #                {'type': MarkdownLineEnum.HEADING, 'line': '# Topic[^1]'})\n",
-    "    mf.add_tags('_auto/def_and_notat_identified')\n",
-    "    mf.write(note)\n",
+    "    return add_HTML_tag_data_to_raw_text(main_text, html_tags_to_add_back)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the following examples, we mock pipeline objects instead of using actual ones.\n",
+    "\n",
+    "In the below example, we run the `auto_mark_def_and_notats` function on a note that has double asterisks `**` surrounding parts of the text that introduce definitions or notations. In these cases, appropriate HTML tags replace the double asterisks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Text before:\n",
+      "\n",
+      "\n",
+      "---\n",
+      "cssclass: clean-embeds\n",
+      "aliases: []\n",
+      "tags: [_meta/literature_note, _meta/definition, _meta/notation]\n",
+      "---\n",
+      "# Ring of integers modulo $n$[^1]\n",
+      "\n",
+      "Let $n \\geq 1$ be an integer. The **ring of integers modulo $n$**, denoted by **$\\mathbb{Z}/n\\mathbb{Z}$**, is, informally, the ring whose elements are represented by the integers with the understanding that $0$ and $n$ are equal.\n",
+      "\n",
+      "More precisely, $\\mathbb{Z}/n\\mathbb{Z}$ has the elements $0,1,\\ldots,n-1$.\n",
+      "\n",
+      "...\n",
+      "\n",
+      "\n",
+      "# See Also\n",
+      "- [[reference_with_tag_labels_Exercise 1|reference_with_tag_labels_Z_nZ_is_a_ring]]\n",
+      "# Meta\n",
+      "## References\n",
+      "\n",
+      "## Citations and Footnotes\n",
+      "[^1]: Kim, Definition 2\n",
+      "\n",
+      "\n",
+      "\n",
+      "Text after:\n",
+      "\n",
+      "---\n",
+      "cssclass: clean-embeds\n",
+      "aliases: []\n",
+      "tags: [_meta/notation, _auto/def_and_notat_identified, _meta/literature_note, _meta/definition]\n",
+      "---\n",
+      "# Ring of integers modulo $n$[^1]\n",
+      "\n",
+      "Let $n \\geq 1$ be an integer. The <b definition=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">ring of integers modulo $n$</b>, denoted by <span notation=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">$\\mathbb{Z}/n\\mathbb{Z}$</span>, is, informally, the ring whose elements are represented by the integers with the understanding that $0$ and $n$ are equal.\n",
+      "\n",
+      "More precisely, $\\mathbb{Z}/n\\mathbb{Z}$ has the elements $0,1,\\ldots,n-1$.\n",
+      "\n",
+      "...\n",
+      "\n",
+      "\n",
+      "# See Also\n",
+      "- [[reference_with_tag_labels_Exercise 1|reference_with_tag_labels_Z_nZ_is_a_ring]]\n",
+      "# Meta\n",
+      "## References\n",
+      "\n",
+      "## Citations and Footnotes\n",
+      "[^1]: Kim, Definition 2\n"
+     ]
+    }
+   ],
+   "source": [
+    "with (tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir,\n",
+    "      mock.patch('__main__.pipelines.token_classification.TokenClassificationPipeline') as mock_pipeline):\n",
+    "    temp_vault = Path(temp_dir) / 'test_vault_6'\n",
+    "    shutil.copytree(_test_directory() / 'test_vault_6', temp_vault)\n",
+    "\n",
+    "    vn = VaultNote(temp_vault, name='reference_with_tag_labels_Definition 2')\n",
+    "    print(\"Text before:\\n\\n\")\n",
+    "    print(vn.text())\n",
+    "    print(\"\\n\\n\\nText after:\\n\")\n",
+    "    auto_mark_def_and_notats(vn, mock_pipeline)\n",
+    "    print(vn.text())\n",
+    "    mf = MarkdownFile.from_vault_note(vn)\n",
+    "    assert mf.has_tag('_auto/def_and_notat_identified')\n",
+    "\n",
     "\n"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: more examples with pipeline mocking actual outputs"
+   ]
   }
  ],
  "metadata": {

diff --git a/nbs/_tests/test_vault_6/README.md b/nbs/_tests/test_vault_6/README.md
@@ -1 +1,4 @@
-This is a test vault for `23_markdown.obsidian.personal.machine_learning.information_note_types.ipynb` and `20_markdown.obsidian.personal.notation.ipynb`.
+This is a test vault for the following:
+- `23_markdown.obsidian.personal.machine_learning.information_note_types.ipynb`
+- `20_markdown.obsidian.personal.notation.ipynb`
+- `28_markdown.obsidian.personal.machine_learning.tokenize.ipynb`
diff --git a/trouver/_modidx.py b/trouver/_modidx.py
@@ -512,8 +512,8 @@
                                                                                             'trouver.markdown.obsidian.personal.machine_learning.notation_summarization.summarize_notation': ( 'markdown.obsidian.personal.machine_learning.notation_summarization.html#summarize_notation',
                                                                                                                                                                                                'trouver/markdown/obsidian/personal/machine_learning/notation_summarization.py')},
             'trouver.markdown.obsidian.personal.machine_learning.notations': {},
-            'trouver.markdown.obsidian.personal.machine_learning.tokenize': { 'trouver.markdown.obsidian.personal.machine_learning.tokenize._add_nice_boxing_attrs_to_notation_tags': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_add_nice_boxing_attrs_to_notation_tags',
-                                                                                                                                                                                        'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
+            'trouver.markdown.obsidian.personal.machine_learning.tokenize': { 'trouver.markdown.obsidian.personal.machine_learning.tokenize._add_nice_boxing_attrs_to_def_and_notat_tags': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_add_nice_boxing_attrs_to_def_and_notat_tags',
+                                                                                                                                                                                             'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
                                                                               'trouver.markdown.obsidian.personal.machine_learning.tokenize._char_is_in_seq': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_char_is_in_seq',
                                                                                                                                                                 'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
                                                                               'trouver.markdown.obsidian.personal.machine_learning.tokenize._collate_html_tags': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_collate_html_tags',
@@ -528,6 +528,8 @@
                                                                                                                                                                                'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
                                                                               'trouver.markdown.obsidian.personal.machine_learning.tokenize._extend_tag_data_ranges': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_extend_tag_data_ranges',
                                                                                                                                                                         'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
+                                                                              'trouver.markdown.obsidian.personal.machine_learning.tokenize._format_main_text_and_add_html_tag_data': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_format_main_text_and_add_html_tag_data',
+                                                                                                                                                                                        'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
                                                                               'trouver.markdown.obsidian.personal.machine_learning.tokenize._html_tag_data_from_part': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_html_tag_data_from_part',
                                                                                                                                                                          'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
                                                                               'trouver.markdown.obsidian.personal.machine_learning.tokenize._html_tag_from_double_ast': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_html_tag_from_double_ast',