Make various edits to polish `28_markdown.obsidian.personal.machine_…

…learning.tokenize.ipynb`
hyunjongkimmath · Jan 19, 2024 · dae8f57 · dae8f57
1 parent d75337a
commit dae8f57
Show file tree

Hide file tree

Showing 12 changed files with 1,059 additions and 222 deletions.
diff --git a/nbs/00_helper.ipynb b/nbs/00_helper.ipynb
@@ -105,7 +105,7 @@
       "text/markdown": [
        "---\n",
        "\n",
-       "[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L39){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L41){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "### _test_directory\n",
        "\n",
@@ -121,7 +121,7 @@
       "text/plain": [
        "---\n",
        "\n",
-       "[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L39){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L41){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "### _test_directory\n",
        "\n",
@@ -618,6 +618,53 @@
     "## HTML"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Restoring special characters that are escaped by the `__str__` method of `bs4.element.Tag` objects"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def html_tag_str(\n",
+    "        html_tag: bs4.element.Tag\n",
+    "        ) -> str:\n",
+    "    \"\"\"\n",
+    "    Return `str(html_tag)` with the escapes `&lt;`, `&gt;`, and `&amp;`\n",
+    "    converted back to `<`, `>`, and `&`, respectively.\n",
+    "    \"\"\"\n",
+    "    # Undo `&amp;` last so that each escape is converted back only once.\n",
+    "    return (\n",
+    "        str(html_tag)\n",
+    "        .replace('&lt;', '<')\n",
+    "        .replace('&gt;', '>')\n",
+    "        .replace('&amp;', '&'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "When using the `__str__` method of `bs4.element.Tag` objects, special characters such as `<`, `>`, and `&` are escaped into `&lt;`, `&gt;`, and `&amp;`, respectively. The `html_tag_str` function converts these escapes back into the original characters."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A tag whose text contains characters that `str(tag)` would escape.\n",
+    "soup = BeautifulSoup('', 'html.parser')\n",
+    "tag = soup.new_tag('span')\n",
+    "tag.string = '&hi<'\n",
+    "expected_str = '<span>&hi<</span>'\n",
+    "test_eq(html_tag_str(tag), expected_str)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -671,15 +718,10 @@
    "metadata": {},
    "outputs": [
     {
-     "ename": "UnboundLocalError",
-     "evalue": "local variable 'latex_indices' referenced before assignment",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mUnboundLocalError\u001b[0m                         Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn [124], line 13\u001b[0m\n\u001b[0;32m      1\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;124mhere is a math mode $a<b$. Here is another $a< b$.\u001b[39m\n\u001b[0;32m      3\u001b[0m \u001b[38;5;124mHere is an in-line one:\u001b[39m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[38;5;124m$$\u001b[39m\n\u001b[0;32m     12\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m---> 13\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mfind_lt_symbols_without_space_in_math_mode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     14\u001b[0m \u001b[38;5;28mprint\u001b[39m(output)\n\u001b[0;32m     15\u001b[0m test_eq(\u001b[38;5;28mlen\u001b[39m(output), \u001b[38;5;241m2\u001b[39m)\n",
-      "Cell \u001b[1;32mIn [123], line 9\u001b[0m, in \u001b[0;36mfind_lt_symbols_without_space_in_math_mode\u001b[1;34m(text)\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfind_lt_symbols_without_space_in_math_mode\u001b[39m(\n\u001b[0;32m      3\u001b[0m         text: \u001b[38;5;28mstr\u001b[39m\n\u001b[0;32m      4\u001b[0m         ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mint\u001b[39m]: \u001b[38;5;66;03m# The index of  \u001b[39;00m\n\u001b[0;32m      5\u001b[0m     \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m      6\u001b[0m \u001b[38;5;124;03m    Return the indices in `text` with math mode less than `<` symbols without\u001b[39;00m\n\u001b[0;32m      7\u001b[0m \u001b[38;5;124;03m    a space that follows.\u001b[39;00m\n\u001b[0;32m      8\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m     latex_indices \u001b[38;5;241m=\u001b[39m \u001b[43mlatex_indices\u001b[49m(text)\n\u001b[0;32m     10\u001b[0m     lt_pattern \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39mcompile(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m<(?! )\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m     11\u001b[0m     inds_of_lt_without_spaces_after \u001b[38;5;241m=\u001b[39m []\n",
-      "\u001b[1;31mUnboundLocalError\u001b[0m: local variable 'latex_indices' referenced before assignment"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[23, 85]\n"
      ]
     }
    ],
@@ -792,23 +834,6 @@
     "#### Removing HTML tags in a text and obtaining the data of the tags."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<a href=\"www.\">this does</a>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -838,8 +863,8 @@
     "        position = _process_content(\n",
     "            parsed_soup, replace_with_attributes, definitely_replace, content,\n",
     "            position, replaced_contents)\n",
-    "\n",
-    "    return str(parsed_soup), replaced_contents\n",
+    "    text_to_return = html_tag_str(parsed_soup)\n",
+    "    return text_to_return, replaced_contents\n",
     "\n",
     "\n",
     "def _init_replace_with_attributes(\n",
@@ -918,6 +943,61 @@
     "test_eq(text_without_html_tags, 'Let $K$ be a field. An Abelian variety over $K$ is a variety that')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the following example, there is a less than symbol `<`, which is definitely not the opening of an HTML tag. Converting a `bs4.BeautifulSoup` object to a string would ordinarily replace the less than symbol with the placeholder `&lt;`; the following verifies that the text returned by `remove_html_tags_in_text` keeps the `<` symbol and does not contain this placeholder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text = 'Hello, this has a less than symbol: $a< b$'\n",
+    "text, html_tags = remove_html_tags_in_text(text)\n",
+    "assert not html_tags\n",
+    "assert '< ' in text\n",
+    "# Check the returned text for the escaped placeholder; `html_tags` is\n",
+    "# already asserted to be empty, so testing membership in it proves nothing.\n",
+    "assert '&lt;' not in text\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The same applies to the greater than symbol `>` and the ampersand `&`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\hyunj\\Documents\\Development\\Python\\trouver_py310_venv\\lib\\site-packages\\bs4\\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = 'Hello, this has a greater than symbol: $a>b$'\n",
+    "text, html_tags = remove_html_tags_in_text(text)\n",
+    "assert not html_tags\n",
+    "assert '>' in text\n",
+    "# Check the returned text for the escaped placeholder; `html_tags` is\n",
+    "# already asserted to be empty, so testing membership in it proves nothing.\n",
+    "assert '&gt;' not in text\n",
+    "\n",
+    "text = r'Hello $$ f &= 3 \\\\ g &= 5'\n",
+    "text, html_tags = remove_html_tags_in_text(text)\n",
+    "assert not html_tags\n",
+    "assert '&' in text\n",
+    "assert '&amp;' not in text"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1062,7 +1142,7 @@
     "    tags_and_locations = sorted(\n",
     "        tags_and_locations, key=lambda x: x[1])\n",
     "    replace_ranges = [(start, end) for _, start, end in tags_and_locations]\n",
-    "    replace_with = [str(html_tag) for html_tag, _, _ in tags_and_locations]\n",
+    "    replace_with = [html_tag_str(html_tag) for html_tag, _, _ in tags_and_locations]\n",
     "    return replace_string_by_indices(text, replace_ranges, replace_with)"
    ]
   },
@@ -2027,7 +2107,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2024-01-13T17:29\n"
+      "2024-01-18T19:14\n"
      ]
     }
    ],

diff --git a/nbs/18_markdown.obsidian.personal.machine_learning.notations.ipynb b/nbs/18_markdown.obsidian.personal.machine_learning.notations.ipynb
@@ -38,20 +38,11 @@
     "    notation_asterisk_indices,\n",
     "    double_asterisk_indices, latex_indices\n",
     ")\n",
-    "# from trouver.machine_learning.text.encoder_decoder import (\n",
-    "#     EncoderRNN, AttnDecoderRNN, train, trainIters, evaluate,\n",
-    "#     evaluateRandomly\n",
-    "# )\n",
-    "# from trouver.machine_learning.text.tokenize import (\n",
-    "#     replace_bold, replace_math_mode_strings, special_cases\n",
-    "# )\n",
     "from trouver.markdown.markdown.file import MarkdownFile, MarkdownLineEnum\n",
     "from trouver.markdown.obsidian.personal.note_processing import (\n",
     "    process_standard_information_note\n",
     ")\n",
     "from trouver.markdown.obsidian.personal.notes import notes_linked_in_note, notes_linked_in_notes_linked_in_note\n",
-    "# from trouver.markdown.obsidian.personal.notation import (\n",
-    "#     make_a_notation_note, latex_to_path_accepted_string, make_notation_notes_from_double_asts)\n",
     "\n",
     "from trouver.markdown.obsidian.vault import VaultNote"
    ]