def find_root_dir(start=None):
    "Find the root directory of the nbdev project by looking for settings.ini, searching upward from `start` (default: cwd)"
    path = Path.cwd() if start is None else Path(start).resolve()
    # Check `path` itself and then every ancestor, *including* the filesystem root
    # (a `while path != path.parent` walk stops before testing the top-most directory).
    for candidate in (path, *path.parents):
        if (candidate / 'settings.ini').exists(): return candidate
    # No settings.ini anywhere on the way up: not inside an nbdev project
    return None
def get_doc_content(url):
    "Return the text for `url`: read from the local `_docs` dir if in an nbdev repo whose `doc_host` prefixes `url`, else fetch over HTTP."
    root_dir = find_root_dir()
    if root_dir:
        doc_host = Config(root_dir, 'settings.ini').get('doc_host')
        if doc_host and url.startswith(doc_host):
            # Map the URL's path component onto the locally built docs under `_docs`
            local_path = root_dir / '_docs' / urlparse(url).path.lstrip('/')
            # `is_file` rather than `exists`: a URL whose path maps to a directory (e.g. the bare
            # doc host -> `_docs/`) must fall through to HTTP instead of failing when opened.
            # Explicit UTF-8 so the result does not depend on the platform's locale encoding
            # (assumes the built docs are UTF-8 -- TODO confirm).
            if local_path.is_file(): return local_path.read_text(encoding='utf-8')
    # If not a local file or file doesn't exist, fetch from URL
    return httpx.get(url).text

def _doc(kw):
    "Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs."
    # NB: `url` is popped from `kw` in place, so the caller's dict no longer contains it afterwards
    url = kw.pop('url')
    txt = get_doc_content(url)
    # A line that consists solely of an HTML comment
    re_comment = re.compile(r'^<!--.*-->$', flags=re.MULTILINE)
    # A line containing an <img> tag whose `src` is an inline `data:image/...` URI
    re_base64_img = re.compile(r'<img[^>]*src="data:image/[^"]*"[^>]*>')
    # Filtering is per line: any line matching either pattern is dropped entirely
    kept = (o for o in txt.splitlines() if not (re_comment.search(o) or re_base64_img.search(o)))
    return Doc('\n'.join(kept), **kw)