Skip to content

Commit

Permalink
Make various edits to polish `28_markdown.obsidian.personal.machine_…
Browse files Browse the repository at this point in the history
…learning.tokenize.ipynb`
  • Loading branch information
hyunjongkimmath committed Jan 19, 2024
1 parent d75337a commit dae8f57
Show file tree
Hide file tree
Showing 12 changed files with 1,059 additions and 222 deletions.
144 changes: 112 additions & 32 deletions nbs/00_helper.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
"text/markdown": [
"---\n",
"\n",
"[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L39){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L41){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### _test_directory\n",
"\n",
Expand All @@ -121,7 +121,7 @@
"text/plain": [
"---\n",
"\n",
"[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L39){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/hyunjongkimmath/trouver/blob/main/trouver/helper.py#L41){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### _test_directory\n",
"\n",
Expand Down Expand Up @@ -618,6 +618,53 @@
"## HTML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Restoring special characters that are changed by the `__str__` method of `bs4.element.Tag` objects"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"def html_tag_str(\n",
"        html_tag: bs4.element.Tag\n",
"        ) -> str:\n",
"    \"\"\"\n",
"    Return `str(html_tag)` with the entities `&lt;`, `&gt;`, and `&amp;`\n",
"    turned back into the characters `<`, `>`, and `&`.\n",
"    \"\"\"\n",
"    # `&amp;` is handled last; handling it first would turn `&amp;lt;` into\n",
"    # `&lt;` and then into `<`.\n",
"    entity_to_char = (('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&'))\n",
"    unescaped = str(html_tag)\n",
"    for entity, char in entity_to_char:\n",
"        unescaped = unescaped.replace(entity, char)\n",
"    return unescaped"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `__str__` method of `bs4.element.Tag` objects changes special characters such as `<`, `>`, and `&` into `&lt;`, `&gt;`, and `&amp;`, respectively. The `html_tag_str` function returns the same string with these three entities changed back into the original characters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"soup = BeautifulSoup('', 'html.parser')\n",
"tag = soup.new_tag('span')\n",
"tag.string = '&hi<'\n",
"test_eq(html_tag_str(tag), '<span>&hi<</span>')\n",
"\n",
"# `>` is the third character that `html_tag_str` restores; cover it as well.\n",
"gt_tag = soup.new_tag('span')\n",
"gt_tag.string = 'a>b'\n",
"test_eq(html_tag_str(gt_tag), '<span>a>b</span>')"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -671,15 +718,10 @@
"metadata": {},
"outputs": [
{
"ename": "UnboundLocalError",
"evalue": "local variable 'latex_indices' referenced before assignment",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn [124], line 13\u001b[0m\n\u001b[0;32m 1\u001b[0m text \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124mhere is a math mode $a<b$. Here is another $a< b$.\u001b[39m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;124mHere is an in-line one:\u001b[39m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[38;5;124m$$\u001b[39m\n\u001b[0;32m 12\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m---> 13\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mfind_lt_symbols_without_space_in_math_mode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 14\u001b[0m \u001b[38;5;28mprint\u001b[39m(output)\n\u001b[0;32m 15\u001b[0m test_eq(\u001b[38;5;28mlen\u001b[39m(output), \u001b[38;5;241m2\u001b[39m)\n",
"Cell \u001b[1;32mIn [123], line 9\u001b[0m, in \u001b[0;36mfind_lt_symbols_without_space_in_math_mode\u001b[1;34m(text)\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfind_lt_symbols_without_space_in_math_mode\u001b[39m(\n\u001b[0;32m 3\u001b[0m text: \u001b[38;5;28mstr\u001b[39m\n\u001b[0;32m 4\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mint\u001b[39m]: \u001b[38;5;66;03m# The index of \u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;124;03m Return the indices in `text` with math mode less than `<` symbols without\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;124;03m a space that follows.\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m latex_indices \u001b[38;5;241m=\u001b[39m \u001b[43mlatex_indices\u001b[49m(text)\n\u001b[0;32m 10\u001b[0m lt_pattern \u001b[38;5;241m=\u001b[39m re\u001b[38;5;241m.\u001b[39mcompile(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m<(?! )\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 11\u001b[0m inds_of_lt_without_spaces_after \u001b[38;5;241m=\u001b[39m []\n",
"\u001b[1;31mUnboundLocalError\u001b[0m: local variable 'latex_indices' referenced before assignment"
"name": "stdout",
"output_type": "stream",
"text": [
"[23, 85]\n"
]
}
],
Expand Down Expand Up @@ -792,23 +834,6 @@
"#### Removing HTML tags in a text and obtaining the data of the tags."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<a href=\"www.\">this does</a>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -838,8 +863,8 @@
" position = _process_content(\n",
" parsed_soup, replace_with_attributes, definitely_replace, content,\n",
" position, replaced_contents)\n",
"\n",
" return str(parsed_soup), replaced_contents\n",
" text_to_return = html_tag_str(parsed_soup)\n",
" return text_to_return, replaced_contents\n",
"\n",
"\n",
"def _init_replace_with_attributes(\n",
Expand Down Expand Up @@ -918,6 +943,61 @@
"test_eq(text_without_html_tags, 'Let $K$ be a field. An Abelian variety over $K$ is a variety that')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the following example, there is a less-than symbol `<` that is not the opening of an HTML tag. `bs4.BeautifulSoup`'s `html.parser` replaces such a symbol with the placeholder `&lt;`; the following verifies that the text returned by `remove_html_tags_in_text` contains the original `<` symbol and not the placeholder."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text = 'Hello, this has a less than symbol: $a< b$'\n",
"text, html_tags = remove_html_tags_in_text(text)\n",
"assert not html_tags\n",
"assert '< ' in text\n",
"# `html_tags` is empty at this point, so the entity check has to be made on the returned text.\n",
"assert '&lt;' not in text\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same applies to the greater-than symbol `>` and to the ampersand `&`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\hyunj\\Documents\\Development\\Python\\trouver_py310_venv\\lib\\site-packages\\bs4\\__init__.py:435: MarkupResemblesLocatorWarning: The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.\n",
" warnings.warn(\n"
]
}
],
"source": [
"text = 'Hello, this has a greater than symbol: $a>b$'\n",
"text, html_tags = remove_html_tags_in_text(text)\n",
"assert not html_tags\n",
"assert '>' in text\n",
"# `html_tags` is empty at this point, so the entity checks are made on the returned text.\n",
"assert '&gt;' not in text\n",
"\n",
"text = r'Hello $$ f &= 3 \\\\ g &= 5'\n",
"text, html_tags = remove_html_tags_in_text(text)\n",
"assert not html_tags\n",
"assert '&' in text\n",
"assert '&amp;' not in text"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -1062,7 +1142,7 @@
" tags_and_locations = sorted(\n",
" tags_and_locations, key=lambda x: x[1])\n",
" replace_ranges = [(start, end) for _, start, end in tags_and_locations]\n",
" replace_with = [str(html_tag) for html_tag, _, _ in tags_and_locations]\n",
" replace_with = [html_tag_str(html_tag) for html_tag, _, _ in tags_and_locations]\n",
" return replace_string_by_indices(text, replace_ranges, replace_with)"
]
},
Expand Down Expand Up @@ -2027,7 +2107,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2024-01-13T17:29\n"
"2024-01-18T19:14\n"
]
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,11 @@
" notation_asterisk_indices,\n",
" double_asterisk_indices, latex_indices\n",
")\n",
"# from trouver.machine_learning.text.encoder_decoder import (\n",
"# EncoderRNN, AttnDecoderRNN, train, trainIters, evaluate,\n",
"# evaluateRandomly\n",
"# )\n",
"# from trouver.machine_learning.text.tokenize import (\n",
"# replace_bold, replace_math_mode_strings, special_cases\n",
"# )\n",
"from trouver.markdown.markdown.file import MarkdownFile, MarkdownLineEnum\n",
"from trouver.markdown.obsidian.personal.note_processing import (\n",
" process_standard_information_note\n",
")\n",
"from trouver.markdown.obsidian.personal.notes import notes_linked_in_note, notes_linked_in_notes_linked_in_note\n",
"# from trouver.markdown.obsidian.personal.notation import (\n",
"# make_a_notation_note, latex_to_path_accepted_string, make_notation_notes_from_double_asts)\n",
"\n",
"from trouver.markdown.obsidian.vault import VaultNote"
]
Expand Down
Loading

0 comments on commit dae8f57

Please sign in to comment.