Skip to content

Commit

Permalink
Factor auto_mark_def_and_notats and make it so that it adds boxing …
Browse files Browse the repository at this point in the history
…attributes to both existing definition and notation markings
  • Loading branch information
hyunjongkimmath committed Feb 7, 2024
1 parent ec76e9b commit 42c8f07
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 40 deletions.
179 changes: 159 additions & 20 deletions nbs/28_markdown.obsidian.personal.machine_learning.tokenize.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"import warnings\n",
"\n",
"import bs4\n",
"from transformers import BatchEncoding, PreTrainedTokenizer, PreTrainedTokenizerFast\n",
"from transformers import BatchEncoding, pipelines, PreTrainedTokenizer, PreTrainedTokenizerFast\n",
"\n",
"from trouver.helper import add_HTML_tag_data_to_raw_text, add_space_to_lt_symbols_without_space, double_asterisk_indices, latex_indices, notation_asterisk_indices, replace_string_by_indices, remove_html_tags_in_text\n",
"from trouver.markdown.markdown.file import MarkdownFile, MarkdownLineEnum\n",
Expand All @@ -64,10 +64,14 @@
"outputs": [],
"source": [
"from unittest import mock\n",
"import shutil\n",
"import tempfile\n",
"\n",
"from datasets import ClassLabel, Dataset, Features, Sequence, Value\n",
"from transformers import AutoTokenizer\n",
"from fastcore.test import *\n"
"from fastcore.test import *\n",
"\n",
"from trouver.helper import _test_directory\n"
]
},
{
Expand Down Expand Up @@ -1438,8 +1442,8 @@
{
"data": {
"text/plain": [
"[(<span notation=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">$\\zeta(s)$</span>,\n",
" 26,\n",
"[(<b definition=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">Riemann zeta function $\\zeta(s)$</b>,\n",
" 4,\n",
" 36)]"
]
},
Expand Down Expand Up @@ -1604,7 +1608,7 @@
"outputs": [],
"source": [
"#| export\n",
"def _add_nice_boxing_attrs_to_notation_tags(\n",
"def _add_nice_boxing_attrs_to_def_and_notat_tags(\n",
" html_tag_data: list[tuple[bs4.element.Tag, int, int]]\n",
" ) -> list[tuple[bs4.element.Tag, int, int]]:\n",
" \"\"\"\n",
Expand All @@ -1614,7 +1618,7 @@
" \"\"\"\n",
" listy = []\n",
" for tag, start, end in html_tag_data:\n",
" if 'notation' in tag.attrs and 'style' not in tag.attrs:\n",
" if ('notation' in tag.attrs or 'definition' in tag.attrs) and 'style' not in tag.attrs:\n",
" tag.attrs['style'] = \"border-width:1px;border-style:solid;padding:3px\"\n",
" listy.append((tag, start, end)) \n",
" return listy\n",
Expand All @@ -1634,7 +1638,15 @@
"tag_data = [\n",
" (tag, 0, 2),\n",
"]\n",
"output = _add_nice_boxing_attrs_to_notation_tags(tag_data)\n",
"output = _add_nice_boxing_attrs_to_def_and_notat_tags(tag_data)\n",
"assert \"style\" in output[0][0].attrs\n",
"\n",
"tag = soup.new_tag(\"span\", definition=\"\")\n",
"tag.string = 'hi'\n",
"tag_data = [\n",
" (tag, 0, 2),\n",
"]\n",
"output = _add_nice_boxing_attrs_to_def_and_notat_tags(tag_data)\n",
"assert \"style\" in output[0][0].attrs"
]
},
Expand Down Expand Up @@ -1677,11 +1689,11 @@
"source": [
"#| export\n",
"def auto_mark_def_and_notats(\n",
" note: VaultNote,\n",
" pipeline,\n",
" note: VaultNote, # The standard information note in which to find the definitions and notations.\n",
" pipeline: pipelines.token_classification.TokenClassificationPipeline, # The token classification pipeline that is used to predict whether tokens are part of definitions or notations introduced in the text.\n",
" # remove_existing_def_and_notat_markings: bool = False, # If `True`, remove definition and notation markings (both via surrounding by double asterisks `**` as per the legacy method and via HTML tags)\n",
" excessive_space_threshold: int = 2,\n",
" add_boxing_attr_to_existing_notat_markings: bool = True # If `True`, then nice attributes are added to the existing notation HTML tags, if not already present.\n",
"    add_boxing_attr_to_existing_def_and_notat_markings: bool = True # If `True`, then nice boxing attributes are added to the existing definition and notation HTML tags, if not already present.\n",
" ) -> None:\n",
" \"\"\"\n",
" Predict and mark where definitions and notation occur in a note using\n",
Expand Down Expand Up @@ -1731,11 +1743,43 @@
" see_also_line = mf.get_line_number_of_heading('See Also')\n",
" \n",
" main_text = mf.text_of_lines(first_non_metadata_line, see_also_line)\n",
" # main_text = add_space_to_lt_symbols_without_space(main_text)\n",
" # main_text = convert_double_asterisks_to_html_tags(main_text)\n",
" # main_text, existing_html_tag_data = remove_html_tags_in_text(main_text)\n",
" # if add_boxing_attr_to_existing_def_and_notat_markings:\n",
" # existing_html_tag_data = _add_nice_boxing_attrs_to_def_and_notat_tags(\n",
" # existing_html_tag_data)\n",
" # html_tags_to_add = _html_tags_from_token_preds(\n",
" # main_text, pipeline(main_text), note, excessive_space_threshold)\n",
" # html_tags_to_add = _consolidate_token_preds(\n",
" # main_text, html_tags_to_add)\n",
"\n",
" # html_tags_to_add_back = _collate_html_tags(\n",
" # existing_html_tag_data, html_tags_to_add)\n",
" # main_text = add_HTML_tag_data_to_raw_text(main_text, html_tags_to_add_back)\n",
" main_text = _format_main_text_and_add_html_tag_data(\n",
" note, pipeline, add_boxing_attr_to_existing_def_and_notat_markings,\n",
" excessive_space_threshold, main_text)\n",
" mf.remove_lines(first_non_metadata_line, see_also_line)\n",
" mf.insert_line(first_non_metadata_line,\n",
" {'type': MarkdownLineEnum.DEFAULT, 'line': main_text})\n",
" mf.add_tags('_auto/def_and_notat_identified')\n",
" mf.write(note)\n",
"\n",
"\n",
"def _format_main_text_and_add_html_tag_data(\n",
" note: VaultNote,\n",
" pipeline: pipelines.token_classification.TokenClassificationPipeline, # The token classification pipeline that is used to predict whether tokens are part of definitions or notations introduced in the text.\n",
" add_boxing_attr_to_existing_def_and_notat_markings: bool,\n",
" excessive_space_threshold: int,\n",
" main_text: str, # The main text to format and to add HTML tag data to\n",
" ) -> str:\n",
"\n",
" main_text = add_space_to_lt_symbols_without_space(main_text)\n",
" main_text = convert_double_asterisks_to_html_tags(main_text)\n",
" main_text, existing_html_tag_data = remove_html_tags_in_text(main_text)\n",
" if add_boxing_attr_to_existing_notat_markings:\n",
" existing_html_tag_data = _add_nice_boxing_attrs_to_notation_tags(\n",
" if add_boxing_attr_to_existing_def_and_notat_markings:\n",
" existing_html_tag_data = _add_nice_boxing_attrs_to_def_and_notat_tags(\n",
" existing_html_tag_data)\n",
" html_tags_to_add = _html_tags_from_token_preds(\n",
" main_text, pipeline(main_text), note, excessive_space_threshold)\n",
Expand All @@ -1744,16 +1788,111 @@
"\n",
" html_tags_to_add_back = _collate_html_tags(\n",
" existing_html_tag_data, html_tags_to_add)\n",
" main_text = add_HTML_tag_data_to_raw_text(main_text, html_tags_to_add_back)\n",
" mf.remove_lines(first_non_metadata_line, see_also_line)\n",
" mf.insert_line(first_non_metadata_line,\n",
" {'type': MarkdownLineEnum.DEFAULT, 'line': main_text})\n",
" # mf.insert_line(first_non_metadata_line,\n",
" # {'type': MarkdownLineEnum.HEADING, 'line': '# Topic[^1]'})\n",
" mf.add_tags('_auto/def_and_notat_identified')\n",
" mf.write(note)\n",
" return add_HTML_tag_data_to_raw_text(main_text, html_tags_to_add_back)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the following examples, we mock pipeline objects instead of using actual ones.\n",
"\n",
"In the example below, we run the `auto_mark_def_and_notats` function on a note that has double asterisks `**` surrounding the parts of the text that introduce definitions or notations. In such cases, the double asterisks are replaced by the appropriate HTML tags."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Text before:\n",
"\n",
"\n",
"---\n",
"cssclass: clean-embeds\n",
"aliases: []\n",
"tags: [_meta/literature_note, _meta/definition, _meta/notation]\n",
"---\n",
"# Ring of integers modulo $n$[^1]\n",
"\n",
"Let $n \\geq 1$ be an integer. The **ring of integers modulo $n$**, denoted by **$\\mathbb{Z}/n\\mathbb{Z}$**, is, informally, the ring whose elements are represented by the integers with the understanding that $0$ and $n$ are equal.\n",
"\n",
"More precisely, $\\mathbb{Z}/n\\mathbb{Z}$ has the elements $0,1,\\ldots,n-1$.\n",
"\n",
"...\n",
"\n",
"\n",
"# See Also\n",
"- [[reference_with_tag_labels_Exercise 1|reference_with_tag_labels_Z_nZ_is_a_ring]]\n",
"# Meta\n",
"## References\n",
"\n",
"## Citations and Footnotes\n",
"[^1]: Kim, Definition 2\n",
"\n",
"\n",
"\n",
"Text after:\n",
"\n",
"---\n",
"cssclass: clean-embeds\n",
"aliases: []\n",
"tags: [_meta/notation, _auto/def_and_notat_identified, _meta/literature_note, _meta/definition]\n",
"---\n",
"# Ring of integers modulo $n$[^1]\n",
"\n",
"Let $n \\geq 1$ be an integer. The <b definition=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">ring of integers modulo $n$</b>, denoted by <span notation=\"\" style=\"border-width:1px;border-style:solid;padding:3px\">$\\mathbb{Z}/n\\mathbb{Z}$</span>, is, informally, the ring whose elements are represented by the integers with the understanding that $0$ and $n$ are equal.\n",
"\n",
"More precisely, $\\mathbb{Z}/n\\mathbb{Z}$ has the elements $0,1,\\ldots,n-1$.\n",
"\n",
"...\n",
"\n",
"\n",
"# See Also\n",
"- [[reference_with_tag_labels_Exercise 1|reference_with_tag_labels_Z_nZ_is_a_ring]]\n",
"# Meta\n",
"## References\n",
"\n",
"## Citations and Footnotes\n",
"[^1]: Kim, Definition 2\n"
]
}
],
"source": [
"with (tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir,\n",
" mock.patch('__main__.pipelines.token_classification.TokenClassificationPipeline') as mock_pipeline):\n",
" temp_vault = Path(temp_dir) / 'test_vault_6'\n",
" shutil.copytree(_test_directory() / 'test_vault_6', temp_vault)\n",
"\n",
" vn = VaultNote(temp_vault, name='reference_with_tag_labels_Definition 2')\n",
" print(\"Text before:\\n\\n\")\n",
" print(vn.text())\n",
" print(\"\\n\\n\\nText after:\\n\")\n",
" auto_mark_def_and_notats(vn, mock_pipeline)\n",
" print(vn.text())\n",
" mf = MarkdownFile.from_vault_note(vn)\n",
" assert mf.has_tag('_auto/def_and_notat_identified')\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TODO: more examples with pipeline mocking actual outputs"
]
}
],
"metadata": {
Expand Down
5 changes: 4 additions & 1 deletion nbs/_tests/test_vault_6/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
This is a test vault for `23_markdown.obsidian.personal.machine_learning.information_note_types.ipynb` and `20_markdown.obsidian.personal.notation.ipynb`.
This is a test vault for the following:
- `23_markdown.obsidian.personal.machine_learning.information_note_types.ipynb`
- `20_markdown.obsidian.personal.notation.ipynb`
- `28_markdown.obsidian.personal.machine_learning.tokenize.ipynb`
6 changes: 4 additions & 2 deletions trouver/_modidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,8 +512,8 @@
'trouver.markdown.obsidian.personal.machine_learning.notation_summarization.summarize_notation': ( 'markdown.obsidian.personal.machine_learning.notation_summarization.html#summarize_notation',
'trouver/markdown/obsidian/personal/machine_learning/notation_summarization.py')},
'trouver.markdown.obsidian.personal.machine_learning.notations': {},
'trouver.markdown.obsidian.personal.machine_learning.tokenize': { 'trouver.markdown.obsidian.personal.machine_learning.tokenize._add_nice_boxing_attrs_to_notation_tags': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_add_nice_boxing_attrs_to_notation_tags',
'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
'trouver.markdown.obsidian.personal.machine_learning.tokenize': { 'trouver.markdown.obsidian.personal.machine_learning.tokenize._add_nice_boxing_attrs_to_def_and_notat_tags': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_add_nice_boxing_attrs_to_def_and_notat_tags',
'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
'trouver.markdown.obsidian.personal.machine_learning.tokenize._char_is_in_seq': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_char_is_in_seq',
'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
'trouver.markdown.obsidian.personal.machine_learning.tokenize._collate_html_tags': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_collate_html_tags',
Expand All @@ -528,6 +528,8 @@
'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
'trouver.markdown.obsidian.personal.machine_learning.tokenize._extend_tag_data_ranges': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_extend_tag_data_ranges',
'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
'trouver.markdown.obsidian.personal.machine_learning.tokenize._format_main_text_and_add_html_tag_data': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_format_main_text_and_add_html_tag_data',
'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
'trouver.markdown.obsidian.personal.machine_learning.tokenize._html_tag_data_from_part': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_html_tag_data_from_part',
'trouver/markdown/obsidian/personal/machine_learning/tokenize.py'),
'trouver.markdown.obsidian.personal.machine_learning.tokenize._html_tag_from_double_ast': ( 'markdown.obsidian.personal.machine_learning.tokenize.html#_html_tag_from_double_ast',
Expand Down
Loading

0 comments on commit 42c8f07

Please sign in to comment.