Preliminarily write functions of `34_markdown.obsidian.machine_learning.notation_linking.ipynb` and modify `parse_notation_note` to return a list instead of a MarkdownFile for the bulleted list of notation note links
hyunjongkimmath · May 31, 2024 · 5db0911 · 5db0911
1 parent 395f28c
commit 5db0911
Show file tree

Hide file tree

Showing 8 changed files with 1,067 additions and 93 deletions.
diff --git a/nbs/00_helper.ipynb b/nbs/00_helper.ipynb
@@ -353,7 +353,15 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['', 'hello', ' asdf', ' ', 'asdf']\n"
+     ]
+    }
+   ],
    "source": [
     "text = 'hello asdf asdf'\n",
     "sample_output = separate_indices_from_str(text, [(0,5), (10,11)])\n",
@@ -993,6 +1001,39 @@
     "#### Removing HTML tags in a text and obtaining the data of the tags."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "markup = '<b>Hello</b>'\n",
+    "soup = BeautifulSoup(markup, 'html.parser')\n",
+    "tag = soup.b\n",
+    "new_str = soup.new_string(' World')\n",
+    "tag.append(new_str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "' World'"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "new_str"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1064,8 +1105,13 @@
     "        return position + len(content)\n",
     "    replacement_text = _select_replacement_text(\n",
     "        content, replace_with_attributes, definitely_replace)\n",
-    "    replaced_content = content.replace_with(\n",
-    "        parsed_soup.new_string(replacement_text))\n",
+    "    \n",
+    "    try:\n",
+    "        replaced_content = content.replace_with(\n",
+    "            parsed_soup.new_string(replacement_text))\n",
+    "    except TypeError as e:\n",
+    "        raise e\n",
+    "\n",
     "    replaced_contents.append((\n",
     "        replaced_content,\n",
     "        position,\n",
@@ -2266,7 +2312,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2024-03-24T14:35\n"
+      "2024-05-27T21:24\n"
      ]
     }
    ],

diff --git a/nbs/20_markdown.obsidian.personal.notation.ipynb b/nbs/20_markdown.obsidian.personal.notation.ipynb
@@ -45,7 +45,7 @@
     "from trouver.helper import latex_indices, latex_to_path_accepted_string, notation_asterisk_indices, remove_html_tags_in_text\n",
     "from trouver.markdown.markdown.file import MarkdownFile, MarkdownLineEnum\n",
     "from trouver.markdown.obsidian.links import (\n",
-    "    find_links_in_markdown_text, LinkType, ObsidianLink, MARKDOWNLINK_PATTERN, WIKILINK_PATTERN\n",
+    "    find_links_in_markdown_text, LinkType, ObsidianLink, MARKDOWNLINK_PATTERN, MARKDOWNLINK_CAPTURE_PATTERN, WIKILINK_PATTERN\n",
     ")\n",
     "from trouver.markdown.obsidian.personal.information_notes import bulleted_links_of_type_in_section\n",
     "from trouver.markdown.obsidian.personal.note_type import (\n",
@@ -194,9 +194,10 @@
     "#| export\n",
     "def _divide_bulleted_list_mf_at_end(\n",
     "        mf: MarkdownFile\n",
-    "        ) -> tuple[MarkdownFile, Union[MarkdownFile, None]]: # The first MarkdownFile contains the main content. The second MarkdonwFile contains the bulleted list at the end; if no such bulleted list exists, then this is None.\n",
-    "    \"\"\"Divide a `MarkdownFile` for a notation note into two MarkdownFiles, one\n",
-    "    of the main content and the other for the trailing bulleted list of links\n",
+    "        ) -> tuple[MarkdownFile, list[tuple[str, str]]]: # The `MarkdownFile` contains the main content. The list contains one `(notation_str, notation_note_name)` tuple for each entry of the bulleted list at the end; if no such bulleted list exists, then this list is empty.\n",
+    "    \"\"\"Return a `MarkdownFile` consisting of just the main content\n",
+    "    of a notation note along with the list of tuples capturing the\n",
+    "    information of the trailing bulleted list of links\n",
     "    for notations used in the notation note.\n",
     "\n",
     "    Assumes that the bulleted list is formatted correctly\n",
@@ -215,11 +216,16 @@
     "        last_part = main_parts.pop() # Should be the same as `part`\n",
     "        trailing_parts.insert(0, last_part)\n",
     "    \n",
-    "    if trailing_parts:\n",
-    "        bulleted_list_mf = MarkdownFile(trailing_parts)\n",
-    "    else:\n",
-    "        bulleted_list_mf = None\n",
-    "    return MarkdownFile(main_parts), bulleted_list_mf\n",
+    "    bulleted_list = [\n",
+    "        _notat_str_and_linked_notat_note_name_from_bullet_point_part(part)\n",
+    "        for part in trailing_parts]\n",
+    "    return MarkdownFile(main_parts), bulleted_list\n",
+    "    \n",
+    "    # if trailing_parts:\n",
+    "    #     bulleted_list_mf = MarkdownFile(trailing_parts)\n",
+    "    # else:\n",
+    "    #     bulleted_list_mf = None\n",
+    "    # return MarkdownFile(main_parts), bulleted_list_mf\n",
     "\n",
     "def _part_is_unordered_list_and_is_of_markdownstyle_link(\n",
     "        part: dict[str, Union[str, MarkdownLineEnum]]\n",
@@ -235,7 +241,22 @@
     "    if not re.match(MARKDOWNLINK_PATTERN, part['line'][2:]):\n",
     "        return False\n",
     "    return True\n",
-    "    "
+    "    \n",
+    "\n",
+    "def _notat_str_and_linked_notat_note_name_from_bullet_point_part(\n",
+    "        part: dict[str, Union[str, MarkdownLineEnum]]\n",
+    "        ) -> tuple[str, str]:\n",
+    "    match = re.match(MARKDOWNLINK_CAPTURE_PATTERN, part['line'][2:])\n",
+    "    return match[1], match[2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_eq(['hi'], ['hi'])"
    ]
   },
   {
@@ -249,22 +270,22 @@
     "mf = MarkdownFile.from_string(notation_note_str)\n",
     "main_content, bulleted_list = _divide_bulleted_list_mf_at_end(mf)\n",
     "test_eq(str(main_content), '$\\operatorname{Gal}(L/K)$ [[linky|denotes]] blah blah blah.')\n",
-    "test_eq(str(bulleted_list), '- [$L$](notation_L_some_field_extension)\\n- [$K$](notation_K_some_base_field)')\n",
+    "test_eq(bulleted_list, [('$L$', 'notation_L_some_field_extension'), ('$K$', 'notation_K_some_base_field')])\n",
     "\n",
     "# This time, add more blank lines\n",
     "notation_note_str = '$\\operatorname{Gal}(L/K)$ [[linky|denotes]] blah blah blah.\\n\\n- [$L$](notation_L_some_field_extension)\\n\\n- [$K$](notation_K_some_base_field)\\n'\n",
     "mf = MarkdownFile.from_string(notation_note_str)\n",
     "main_content, bulleted_list = _divide_bulleted_list_mf_at_end(mf)\n",
     "test_eq(str(main_content), '$\\operatorname{Gal}(L/K)$ [[linky|denotes]] blah blah blah.')\n",
-    "test_eq(str(bulleted_list), '- [$L$](notation_L_some_field_extension)\\n- [$K$](notation_K_some_base_field)')\n",
+    "test_eq(bulleted_list, [('$L$', 'notation_L_some_field_extension'), ('$K$', 'notation_K_some_base_field')])\n",
     "\n",
     "# No bulleted list means the second output is an empty list:\n",
     "\n",
     "notation_note_str = '$\\operatorname{Gal}(L/K)$ [[linky|denotes]] blah blah blah.\\n\\n'\n",
     "mf = MarkdownFile.from_string(notation_note_str)\n",
     "main_content, bulleted_list = _divide_bulleted_list_mf_at_end(mf)\n",
     "test_eq(str(main_content), '$\\operatorname{Gal}(L/K)$ [[linky|denotes]] blah blah blah.')\n",
-    "assert bulleted_list is None\n",
+    "test_eq(bulleted_list, [])\n",
     "\n"
    ]
   },
@@ -277,25 +298,26 @@
     "#| export\n",
     "def parse_notation_note(\n",
     "        notation_note: Union[str, VaultNote],\n",
-    "        vault: Optional[PathLike] = None # The vault If `None`, then uses th\n",
+    "        vault: Optional[PathLike] = None # The vault If `None`, then uses `notation_note.vault`\n",
     "        ) -> tuple[Union[dict, None], Union[str, None], str, MarkdownFile,\n",
-    "                   Union[MarkdownFile, None]]:\n",
+    "                   list[tuple[str, str]]]:\n",
     "    \"\"\"Parse information from the notation note.\n",
     "\n",
     "    **Returns**\n",
     "\n",
-    "    - tuple[Union[dict, None], str, ObsidianLink, MarkdownFile, MarkdownFile]\n",
+    "    - tuple[Union[dict, None], str, Union[str, None], MarkdownFile, list[tuple[str, str]]]\n",
     "        - The first entry is the YAML frontmatter meta, if available.\n",
     "        - The second entry is the notation string\n",
     "        - The third entry is the name of the \"main note\" of the notation note. This is usually\n",
     "          the linked note in the link `[[<linked_note>|denotes]]`. If no such main note\n",
     "          exists, then this is `None`.\n",
     "        - The fourth entry is the MarkdownFile consisting of the \"main\" content of the note,\n",
     "          which excludes the information given by all of the other entries.\n",
-    "        - The fifth entry is the MarkdownFile consisting of the ending bulleted list, listing\n",
-    "          the notations used in the notation notes along with links to the notation notes\n",
-    "          describing these notations. If there is not such bulleted list, then this entry\n",
-    "          is `None`. \n",
+    "        - The fifth entry is a list of tuples of two `str`'s representing the\n",
+    "          bulleted list of notation notes to which `notation_note` links.\n",
+    "          Each tuple is of the form `(latex_str, notation_note_name)` and\n",
+    "          the corresponding entry in the bulleted list is of the form\n",
+    "          `- [<latex_str>](<notation_note_name>)`. If there is no such\n",
+    "          bulleted list, then this list is empty.\n",
     "\n",
     "    **Raises**\n",
     "\n",
@@ -322,12 +344,12 @@
     "\n",
     "    file_text = str(mf_without_metadata)\n",
     "\n",
-    "    main_mf, mf_with_links_to_notations = _divide_bulleted_list_mf_at_end(mf_without_metadata)\n",
+    "    main_mf, linked_notations_list = _divide_bulleted_list_mf_at_end(mf_without_metadata)\n",
     "    _remove_the_notation_str_and_denotes_in_main_mf(main_mf, notation_note)\n",
     "\n",
     "    return (metadata, _get_notation_string(file_text, notation_note),\n",
     "            _main_of_notation_from_text(file_text), main_mf,\n",
-    "            mf_with_links_to_notations)\n",
+    "            linked_notations_list)\n",
     "\n",
     "\n",
     "def _get_notation_string(\n",
@@ -388,13 +410,13 @@
    "source": [
     "vault = _test_directory() / 'test_vault_7'\n",
     "notation_note = VaultNote(vault, name='some_reference_name_notation_Spec_A')\n",
-    "metadata, notation_str, main_of_notation, main_mf, mf_with_links_to_notations = parse_notation_note(notation_note, vault)\n",
+    "metadata, notation_str, main_of_notation, main_mf, linked_notat_notes = parse_notation_note(notation_note, vault)\n",
     "\n",
     "test_eq(metadata, {'detect_regex': [], 'latex_in_original': ['\\\\operatorname{Spec} A']})\n",
     "test_eq(notation_str, '$\\\\operatorname{Spec} A$')\n",
     "test_eq(main_of_notation, 'spectrum_of_a_ring')\n",
     "test_eq(str(main_mf), 'the spectrum of the ring $A$.')\n",
-    "assert mf_with_links_to_notations is None # There is not a bulleted list at the end, so the last output is `None`."
+    "test_eq(linked_notat_notes, []) # There is not a bulleted list at the end, so the last output is an empty list."
    ]
   },
   {
@@ -405,13 +427,13 @@
    "source": [
     "vault = _test_directory() / 'test_vault_7'\n",
     "notation_note = VaultNote(vault, name='poonen_curves_notation_zeta_X_s_zeta_function_of_variety')\n",
-    "metadata, notation_str, main_of_notation, main_mf, mf_with_links_to_notations = parse_notation_note(notation_note, vault)\n",
+    "metadata, notation_str, main_of_notation, main_mf, linked_notat_notes = parse_notation_note(notation_note, vault)\n",
     "\n",
     "test_eq(metadata, None)\n",
     "test_eq(notation_str, r'$\\zeta_{X}(s)$')\n",
     "test_eq(main_of_notation, 'poonen_curves_3.4.1 DEFINITION')\n",
     "test_eq(str(main_mf), 'the zeta function of the [[poonen_curves_1.0.2 DEFINITION|variety]] $X$ over $\\\\mathbb{F}_q$.\\n\\nIt is defined as\\n\\n$$\\\\zeta_X(s) = Z_X(q^{-s}).$$\\n\\nA priori, it is a formal series, but in fact [[poonen_curves_ 3.6_page_56|it converges]] for $\\\\operatorname{Re} s > \\\\dim X$.')\n",
-    "test_eq(str(mf_with_links_to_notations), '- [$Z_X$](poonen_curves_notation_Z_X_T)') # There is a bulleted list at the end, so the last output is `None`."
+    "test_eq(linked_notat_notes, [('$Z_X$', 'poonen_curves_notation_Z_X_T')]) # There is a bulleted list at the end, so the last output is a list with one `(notation_str, notation_note_name)` tuple per bullet."
    ]
   },
   {
@@ -425,7 +447,7 @@
     "\n",
     "vault = _test_directory() / 'test_vault_7'\n",
     "notation_note = VaultNote(vault, name='some_reference_name_notation_O_X_this_file_has_no_links') \n",
-    "metadata, notation_str, main_of_notation, main_mf, mf_with_links_to_notations = parse_notation_note(notation_note, vault)\n",
+    "metadata, notation_str, main_of_notation, main_mf, linked_notat_notes = parse_notation_note(notation_note, vault)\n",
     "assert main_note is None\n"
    ]
   },
@@ -440,7 +462,7 @@
     "\n",
     "vault = _test_directory() / 'test_vault_7'\n",
     "notation_note = VaultNote(vault, name='some_reference_name_notation_B_R') \n",
-    "metadata, notation_str, main_of_notation, main_mf, mf_with_links_to_notations = parse_notation_note(notation_note, vault)\n",
+    "metadata, notation_str, main_of_notation, main_mf, linked_notat_notes = parse_notation_note(notation_note, vault)\n",
     "assert main_note is None\n"
    ]
   },
@@ -454,7 +476,7 @@
     "# Test the case where the notation string contains a [[]]\n",
     "vault = _test_directory() / 'test_vault_7'\n",
     "notation_note = VaultNote(vault, name='some_reference_name_notation_k_t_formal_power_series_ring') \n",
-    "metadata, notation_str, main_of_notation, main_mf, mf_with_links_to_notations = parse_notation_note(notation_note, vault)\n",
+    "metadata, notation_str, main_of_notation, main_mf, linked_notat_notes = parse_notation_note(notation_note, vault)\n",
     "test_eq(notation_str, '$k[[t]]$')\n",
     "test_eq(main_of_notation, 'some_note')\n",
     "# assert main_note is None"
@@ -2321,7 +2343,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "C:\\Users\\hyunj\\AppData\\Local\\Temp\\ipykernel_20072\\2025152828.py:57: UserWarning: The following note has the following excess notations: note_with_some_excessive_notation_notes, \\mathcal{B}(\\mathbb{R}), \\operatorname{Jac}(C)\n",
+      "C:\\Users\\hyunj\\AppData\\Local\\Temp\\ipykernel_19052\\2025152828.py:57: UserWarning: The following note has the following excess notations: note_with_some_excessive_notation_notes, \\mathcal{B}(\\mathbb{R}), \\operatorname{Jac}(C)\n",
       "  warnings.warn(\n"
      ]
     }
@@ -2759,9 +2781,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "trouver_py310_venv",
+   "display_name": "python3",
    "language": "python",
-   "name": "trouver_py310_venv"
+   "name": "python3"
   }
  },
  "nbformat": 4,