diff --git a/src/gpt-crawler/.dockerignore b/src/gpt-crawler/.dockerignore new file mode 100644 index 0000000..556fbb2 --- /dev/null +++ b/src/gpt-crawler/.dockerignore @@ -0,0 +1,13 @@ +# configurations +.idea + +# crawlee and apify storage folders +apify_storage +crawlee_storage +storage + +# installed files +node_modules + +# ignore base image 'main.js' +main.js \ No newline at end of file diff --git a/src/gpt-crawler/.github/workflows/build.yml b/src/gpt-crawler/.github/workflows/build.yml new file mode 100644 index 0000000..b1d9b54 --- /dev/null +++ b/src/gpt-crawler/.github/workflows/build.yml @@ -0,0 +1,23 @@ +name: Build workflow + +on: + pull_request: + types: [opened, reopened, synchronize] + +jobs: + build: + name: build + runs-on: ubuntu-latest + env: + CI_JOB_NUMBER: 1 + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v2 + with: + cache: npm + node-version: 18 + - run: npm i + - run: npm run build + - uses: preactjs/compressed-size-action@v2 + with: + pattern: ".dist/**/*.{js,ts,json}" diff --git a/src/gpt-crawler/.github/workflows/release.yml b/src/gpt-crawler/.github/workflows/release.yml new file mode 100644 index 0000000..fa42132 --- /dev/null +++ b/src/gpt-crawler/.github/workflows/release.yml @@ -0,0 +1,23 @@ +name: Release workflow + +on: + push: + branches: + - main + +jobs: + release: + name: release + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-node@v2 + with: + cache: npm + node-version: 18 + - run: npm i + - run: npm run build + - run: npm run semantic-release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NPM_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/src/gpt-crawler/.github/workflows/test.yml b/src/gpt-crawler/.github/workflows/test.yml new file mode 100644 index 0000000..ef57a37 --- /dev/null +++ b/src/gpt-crawler/.github/workflows/test.yml @@ -0,0 +1,18 @@ +name: Test workflow + +on: [push, pull_request] + +jobs: + prettier_check: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Node.js + uses: actions/setup-node@v2 + with: + node-version: "20" + - name: Install Dependencies + run: npm ci + - name: Run prettier + run: npm run prettier:check diff --git a/src/gpt-crawler/.gitignore b/src/gpt-crawler/.gitignore new file mode 100644 index 0000000..9b56ebd --- /dev/null +++ b/src/gpt-crawler/.gitignore @@ -0,0 +1,23 @@ +# This file tells Git which files shouldn't be added to source control + +.idea +dist +node_modules +apify_storage +crawlee_storage +storage +.DS_Store + +!package.json +!package-lock.json +!tsconfig.json + +# any output from the crawler +*.json + +pnpm-lock.yaml + +# Python +__pycache__ +venv/ +.venv/ diff --git a/src/gpt-crawler/.pylintrc b/src/gpt-crawler/.pylintrc new file mode 100644 index 0000000..97c4883 --- /dev/null +++ b/src/gpt-crawler/.pylintrc @@ -0,0 +1,638 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint +# in a server-like mode. +clear-cache-post-run=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. 
Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, +# it can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.12 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +source-roots= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. 
Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. 
Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type alias names. If left empty, type +# alias names will be checked with the set naming style. +#typealias-rgx= + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + asyncSetUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=builtins.BaseException,builtins.Exception + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). 
+indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow explicit reexports by alias from a package __init__. +allow-reexport-from-package=no + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=C0330, # Wrong hanging indentation before block (black) + C0326, # Bad Whitespace (black) + W0718, # Catching too general exception Exception. \ + # This setting was added intentionally for `conv_html_to_markdown.py` + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + use-implicit-booleaness-not-comparison-to-string, + use-implicit-booleaness-not-comparison-to-zero + +# Enable the message, report, category or checker with the given id(s). 
You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable= + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are: text, parseable, colorized, +# json2 (improved json format), json (old json format) and msvs (visual +# studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. No available dictionaries : You need to install +# both the python package and the system dependency for enchant to work. +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. 
+spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. 
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io \ No newline at end of file diff --git a/src/gpt-crawler/.releaserc b/src/gpt-crawler/.releaserc new file mode 100644 index 0000000..fd0064d --- /dev/null +++ b/src/gpt-crawler/.releaserc @@ -0,0 +1,12 @@ +{ + "branches": [ + "main" + ], + "plugins": [ + "@semantic-release/commit-analyzer", + "@semantic-release/changelog", + "@semantic-release/npm", + "@semantic-release/git", + "@semantic-release/github" + ] +} diff --git a/src/gpt-crawler/Dockerfile b/src/gpt-crawler/Dockerfile new file mode 100644 index 0000000..918e7f4 --- /dev/null +++ b/src/gpt-crawler/Dockerfile @@ -0,0 +1,62 @@ +# Specify the base Docker image. You can read more about +# the available images at https://crawlee.dev/docs/guides/docker-images +# You can also use any other image from Docker Hub. +FROM apify/actor-node-playwright-chrome:18 AS builder + +# Copy just package.json and package-lock.json +# to speed up the build using Docker layer cache. +COPY --chown=myuser package*.json ./ + +# Install all dependencies. Don't audit to speed up the installation. +RUN npm install --include=dev --audit=false + +# Next, copy the source files using the user set +# in the base image. +COPY --chown=myuser . ./ + +# Install all dependencies and build the project. +# Don't audit to speed up the installation. +RUN npm run build + +# Create final image +FROM apify/actor-node-playwright-chrome:18 + +# Copy only built JS files from builder image +COPY --from=builder --chown=myuser /home/myuser/dist ./dist + +# Copy just package.json and package-lock.json +# to speed up the build using Docker layer cache. +COPY --chown=myuser package*.json ./ + +# Install NPM packages, skip optional and development dependencies to +# keep the image small. Avoid logging too much and print the dependency +# tree for debugging +RUN npm --quiet set progress=false \ + && npm install --omit=dev --omit=optional \ + && echo "Installed NPM packages:" \ + && (npm list --omit=dev --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version + +# Install Python and required dependencies for the Python module +# Switch user to ROOT for installation +USER root +RUN apt-get update \ + && apt-get install -y python3 python3-pip +USER myuser +RUN pip3 install -Uq beautifulsoup4 \ + markdownify transformers torch + +# Copy the Python script +COPY --chown=myuser src/conv_html_to_markdown.py ./ + +# Next, copy the remaining files and directories with the source code. +# Since we do this after NPM install, quick build will be really fast +# for most source file changes. +COPY --chown=myuser . ./ + +# Run the image. If you know you won't need headful browsers, +# you can remove the XVFB start script for a micro perf gain. +CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent && python3 conv_html_to_markdown.py diff --git a/src/gpt-crawler/License b/src/gpt-crawler/License new file mode 100644 index 0000000..5d7a502 --- /dev/null +++ b/src/gpt-crawler/License @@ -0,0 +1,15 @@ +ISC License + +Copyright (c) 2023 BuilderIO + +Permission to use, copy, modify, and/or distribute this software for any purpose +with or without fee is hereby granted, provided that the above copyright notice +and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF +THIS SOFTWARE. diff --git a/src/gpt-crawler/README.md b/src/gpt-crawler/README.md new file mode 100644 index 0000000..43bfe4c --- /dev/null +++ b/src/gpt-crawler/README.md @@ -0,0 +1,150 @@ +# GPT Crawler + +Crawl a site to generate knowledge files to create your own custom GPT from one or multiple URLs + +![Gif showing the crawl run](https://github.com/BuilderIO/gpt-crawler/assets/844291/feb8763a-152b-4708-9c92-013b5c70d2f2) + +- [Example](#example) +- [Get started](#get-started) + - [Running locally](#running-locally) + - [Clone the repository](#clone-the-repository) + - [Install dependencies](#install-dependencies) + - [Configure the crawler](#configure-the-crawler) + - [Run your crawler](#run-your-crawler) + - [Alternative methods](#alternative-methods) + - [Running in a container with Docker](#running-in-a-container-with-docker) + - [Running as a CLI](#running-as-a-cli) + - [Development](#development) + - [Upload your data to OpenAI](#upload-your-data-to-openai) + - [Create a custom GPT](#create-a-custom-gpt) + - [Create a custom assistant](#create-a-custom-assistant) +- [Contributing](#contributing) + +## Example + +[Here is a custom GPT](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) that I quickly made to help answer questions about how to use and integrate [Builder.io](https://www.builder.io) by simply providing the URL to the Builder docs. + +This project crawled the docs and generated the file that I uploaded as the basis for the custom GPT. + +[Try it out yourself](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) by asking questions about how to integrate Builder.io into a site. + +> Note that you may need a paid ChatGPT plan to access this feature + +## Get started + +### Running locally + +#### Clone the repository + +Be sure you have Node.js >= 16 installed. + +```sh +git clone https://github.com/builderio/gpt-crawler +``` + +#### Install dependencies + +```sh +npm i +``` + +#### Configure the crawler + +Open [config.ts](config.ts) and edit the `url` and `selectors` properties to match your needs. + +E.g. to crawl the Builder.io docs to make our custom GPT you can use: + +```ts +export const defaultConfig: Config = { + url: "https://www.builder.io/c/docs/developers", + match: "https://www.builder.io/c/docs/**", + selector: `.docs-builder-container`, + maxPagesToCrawl: 50, + outputFileName: "output.json", +}; +``` + +See [config.ts](src/config.ts) for all available options. 
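Note that `url` may also point to a sitemap: per `src/config.ts` and `src/core.ts`, a start URL ending in `sitemap.xml` is downloaded and every page it lists is queued, instead of following links from a single start page. As a minimal sketch (using the Builder.io sitemap URL from the config docs as an assumed example; adjust the values for your own site), a config that starts from a sitemap and splits the combined output by size and token count could look like:

```ts
import { Config } from "./src/config";

export const defaultConfig: Config = {
  // A URL ending in "sitemap.xml" is treated as a sitemap and all of its pages are queued.
  url: "https://www.builder.io/sitemap.xml",
  match: "https://www.builder.io/c/docs/**",
  // selector is optional; when omitted the crawler falls back to the page body text.
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Optional output splitting: keep each output file under ~1 MB and ~5000 tokens.
  maxFileSize: 1,
  maxTokens: 5000,
};
```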
Here is a sample of the common configu options: + +```ts +type Config = { + /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */ + url: string; + /** Pattern to match against for links on a page to subsequently crawl */ + match: string; + /** Selector to grab the inner text from */ + selector: string; + /** Don't crawl more than this many pages */ + maxPagesToCrawl: number; + /** File name for the finished data */ + outputFileName: string; + /** Optional resources to exclude + * + * @example + * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab'] + */ + resourceExclusions?: string[]; + /** Optional maximum file size in megabytes to include in the output file */ + maxFileSize?: number; + /** Optional maximum number tokens to include in the output file */ + maxTokens?: number; +}; +``` + +#### Run your crawler + +```sh +npm start +``` + +### Alternative methods + +#### [Running in a container with Docker](./containerapp/README.md) + +To obtain the `output.json` with a containerized execution. Go into the `containerapp` directory. Modify the `config.ts` same as above, the `output.json`file should be generated in the data folder. Note : the `outputFileName` property in the `config.ts` file in containerapp folder is configured to work with the container. + +### Upload your data to OpenAI + +The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT. + +#### Create a custom GPT + +Use this option for UI access to your generated knowledge that you can easily share with others + +> Note: you may need a paid ChatGPT plan to create and use custom GPTs right now + +1. Go to [https://chat.openai.com/](https://chat.openai.com/) +2. Click your name in the bottom left corner +3. Choose "My GPTs" in the menu +4. Choose "Create a GPT" +5. Choose "Configure" +6. Under "Knowledge" choose "Upload a file" and upload the file you generated +7. 
If you get an error that the file is too large, you can split the output into multiple smaller files with the `maxFileSize` option in `config.ts`, or shrink it by capping the number of tokens with the `maxTokens` option. + +![Gif of how to upload a custom GPT](https://github.com/BuilderIO/gpt-crawler/assets/844291/22f27fb5-6ca5-4748-9edd-6bcf00b408cf) + +#### Create a custom assistant + +Use this option for API access to your generated knowledge that you can integrate into your product. + +1. Go to [https://platform.openai.com/assistants](https://platform.openai.com/assistants) +2. Click "+ Create" +3. Choose "upload" and upload the file you generated + +![Gif of how to upload to an assistant](https://github.com/BuilderIO/gpt-crawler/assets/844291/06e6ad36-e2ba-4c6e-8d5a-bf329140de49) + +## Contributing + +Know how to make this project better? Send a PR! + +
Made with love by Builder.io
diff --git a/src/gpt-crawler/config.ts b/src/gpt-crawler/config.ts new file mode 100644 index 0000000..82cd967 --- /dev/null +++ b/src/gpt-crawler/config.ts @@ -0,0 +1,8 @@ +import { Config } from "./src/config"; + +export const defaultConfig: Config = { + url: "https://docs.pinecone.io/docs/langchain", + match: "https://docs.pinecone.io/docs/langchain/**", + maxPagesToCrawl: 50, + outputFileName: "output.json", +}; diff --git a/src/gpt-crawler/containerapp/Dockerfile b/src/gpt-crawler/containerapp/Dockerfile new file mode 100644 index 0000000..876a9a1 --- /dev/null +++ b/src/gpt-crawler/containerapp/Dockerfile @@ -0,0 +1,35 @@ +FROM ubuntu:jammy + +# Install Git +RUN apt-get update && \ + apt-get install sudo -y && \ + apt-get install git -y + +# Install Docker +RUN apt-get install ca-certificates curl gnupg -y && \ + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \ + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \ + apt-get update && \ + apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y + +# Install Nodejs v20 npm +RUN sudo apt-get update && \ + sudo apt-get install -y ca-certificates curl gnupg && \ + sudo mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg + +RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list && \ + sudo apt-get update && \ + sudo apt-get install nodejs -y + +# Install gpt-crawler +RUN cd /home && git clone https://github.com/builderio/gpt-crawler && cd gpt-crawler && \ + npm i && \ + npx playwright install && \ + npx playwright install-deps + +# Directory to mount in the docker container to get the output.json data +RUN cd /home && mkdir data + + +WORKDIR /home \ No newline at end of file diff --git a/src/gpt-crawler/containerapp/README.md b/src/gpt-crawler/containerapp/README.md new file mode 100644 index 0000000..d3991ef --- /dev/null +++ b/src/gpt-crawler/containerapp/README.md @@ -0,0 +1,14 @@ +# Containerized crawler + +## Docker image with packaged crawler, with script for building and execution. + +All dependencies set up and configured in the Dockerfile. Requires docker to be installed. + +## Get started + +### Prerequisites + +Be sure you have docker installed + +1. `cd gpt-crawler/containerapp ` +2. `. 
./run.sh ` diff --git a/src/gpt-crawler/containerapp/data/config.ts b/src/gpt-crawler/containerapp/data/config.ts new file mode 100644 index 0000000..eb92366 --- /dev/null +++ b/src/gpt-crawler/containerapp/data/config.ts @@ -0,0 +1,8 @@ +import { Config } from "./src/config"; + +export const defaultConfig: Config = { + url: "https://www.builder.io/c/docs/developers", + match: "https://www.builder.io/c/docs/**", + maxPagesToCrawl: 50, + outputFileName: "../data/output.json", +}; diff --git a/src/gpt-crawler/containerapp/data/init.sh b/src/gpt-crawler/containerapp/data/init.sh new file mode 100644 index 0000000..b32e6bd --- /dev/null +++ b/src/gpt-crawler/containerapp/data/init.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# copy the config when starting the container +cp /home/data/config.ts /home/gpt-crawler/ + +# start the crawler +cd /home/gpt-crawler && npm start + +# Print message after crawling and exit +echo "Crawling complete.." +exit \ No newline at end of file diff --git a/src/gpt-crawler/containerapp/run.sh b/src/gpt-crawler/containerapp/run.sh new file mode 100644 index 0000000..f42cf9f --- /dev/null +++ b/src/gpt-crawler/containerapp/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Check if there is a Docker image named "crawler" +if ! sudo docker images | grep -w 'crawler' > /dev/null; then + echo "Docker repository 'crawler' not found. Building the image..." + # Build the Docker image with the name 'crawler' + sudo docker build -t crawler . +else + echo "Docker image already built." +fi + +# Ensure that init.sh script is executable +sudo chmod +x ./data/init.sh + +# Starting docker, mount docker.sock to work with docker-in-docker function, mount data directory for input/output from container +sudo docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -v ./data:/home/data crawler bash -c "/home/data/init.sh" diff --git a/src/gpt-crawler/src/cli.ts b/src/gpt-crawler/src/cli.ts new file mode 100644 index 0000000..36ebfcd --- /dev/null +++ b/src/gpt-crawler/src/cli.ts @@ -0,0 +1,97 @@ +#!/usr/bin/env node + +import { program } from "commander"; +import { Config } from "./config.js"; +import { crawl, write } from "./core.js"; +import { createRequire } from "node:module"; +import inquirer from "inquirer"; + +const require = createRequire(import.meta.url); +const { version, description } = require("../../package.json"); + +const messages = { + url: "What is the first URL of the website you want to crawl?", + match: "What is the URL pattern you want to match?", + selector: "What is the CSS selector you want to match?", + maxPagesToCrawl: "How many pages do you want to crawl?", + outputFileName: "What is the name of the output file?", +}; + +async function handler(options: Config) { + try { + const { + url, + match, + selector, + maxPagesToCrawl: maxPagesToCrawlStr, + outputFileName, + } = options; + + // @ts-ignore + const maxPagesToCrawl = parseInt(maxPagesToCrawlStr, 10); + + let config: Config = { + url, + match, + selector, + maxPagesToCrawl, + outputFileName, + }; + + if (!config.url || !config.match || !config.selector) { + const questions = []; + + if (!config.url) { + questions.push({ + type: "input", + name: "url", + message: messages.url, + }); + } + + if (!config.match) { + questions.push({ + type: "input", + name: "match", + message: messages.match, + }); + } + + if (!config.selector) { + questions.push({ + type: "input", + name: "selector", + message: messages.selector, + }); + } + + const answers = await inquirer.prompt(questions); + + config = { + ...config, + ...answers, + 
}; + } + + await crawl(config); + await write(config); + } catch (error) { + console.log(error); + } +} + +program.version(version).description(description); + +program + .option("-u, --url ", messages.url, "") + .option("-m, --match ", messages.match, "") + .option("-s, --selector ", messages.selector, "") + .option("-m, --maxPagesToCrawl ", messages.maxPagesToCrawl, "50") + .option( + "-o, --outputFileName ", + messages.outputFileName, + "output.json", + ) + .action(handler); + +program.parse(); diff --git a/src/gpt-crawler/src/config.ts b/src/gpt-crawler/src/config.ts new file mode 100644 index 0000000..7e5f5fb --- /dev/null +++ b/src/gpt-crawler/src/config.ts @@ -0,0 +1,75 @@ +import { z } from "zod"; + +import type { Page } from "playwright"; + +const Page: z.ZodType = z.any(); + +export const configSchema = z.object({ + /** + * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap + * @example "https://www.builder.io/c/docs/developers" + * @example "https://www.builder.io/sitemap.xml" + * @default "" + */ + url: z.string(), + /** + * Pattern to match against for links on a page to subsequently crawl + * @example "https://www.builder.io/c/docs/**" + * @default "" + */ + match: z.string().or(z.array(z.string())), + + /** + * Selector to grab the inner text from + * @example ".docs-builder-container" + * @default "" + */ + selector: z.string().optional(), + /** + * Don't crawl more than this many pages + * @default 50 + */ + maxPagesToCrawl: z.number().int().positive(), + /** + * File name for the finished data + * @default "output.json" + */ + outputFileName: z.string(), + /** Optional cookie to be set. E.g. for Cookie Consent */ + cookie: z + .object({ + name: z.string(), + value: z.string(), + }) + .optional(), + /** Optional function to run for each page found */ + onVisitPage: z + .function() + .args( + z.object({ + page: Page, + pushData: z.function().args(z.any()).returns(z.promise(z.void())), + }), + ) + .returns(z.promise(z.void())) + .optional(), + /** Optional timeout for waiting for a selector to appear */ + waitForSelectorTimeout: z.number().int().nonnegative().optional(), + /** Optional resources to exclude + * + * @example + * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab'] + */ + resourceExclusions: z.array(z.string()).optional(), + + /** Optional maximum file size in megabytes to include in the output file + * @example 1 + */ + maxFileSize: 
z.number().int().positive().optional(), + /** Optional maximum number tokens to include in the output file + * @example 5000 + */ + maxTokens: z.number().int().positive().optional(), +}); + +export type Config = z.infer; diff --git a/src/gpt-crawler/src/conv_html_to_markdown.py b/src/gpt-crawler/src/conv_html_to_markdown.py new file mode 100644 index 0000000..5f42fba --- /dev/null +++ b/src/gpt-crawler/src/conv_html_to_markdown.py @@ -0,0 +1,282 @@ +""" +This module provides functionality for converting HTML to Markdown and +formatting a dataset of HTML content into structured Markdown, with added +capabilities of processing text embeddings to identify and +remove redundant content. +""" + +import glob +import json +import logging +from concurrent.futures import ThreadPoolExecutor +from bs4 import BeautifulSoup +from markdownify import markdownify as md +from transformers import AutoTokenizer, AutoModel +import torch + + +class HTMLToMarkdownConverter: + """ + A converter class that transforms HTML content to Markdown + format and processes text embeddings. + + Attributes: + strip_tags (list): A list of HTML tags to be stripped during + conversion. + convert_links (bool): A flag to determine whether links + should be converted. + tokenizer (AutoTokenizer): Tokenizer from the transformers library. + model (AutoModel): Pre-trained model from the transformers library. + """ + + def __init__(self, strip_tags=None, convert_links=True): + """Initialize the converter with configuration options and + Jina embeddings model.""" + self.strip_tags = strip_tags or ["script", "style", "meta"] + self.convert_links = convert_links + self.tokenizer = AutoTokenizer.from_pretrained( + "jinaai/jina-embeddings-v2-small-en", trust_remote_code=True + ) + self.model = AutoModel.from_pretrained("jinaai/jina-embeddings-v2-small-en") + + def mean_pooling(self, model_output, attention_mask): + """Applies mean pooling to the token embeddings to + create sentence embeddings.""" + token_embeddings = model_output[0] + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + def process_embeddings(self, lines, batch_size=32): + """Processes the embeddings for the given lines in batches.""" + batched_embeddings = [] + for i in range(0, len(lines), batch_size): + batch = lines[i : i + batch_size] + encoded_input = self.tokenizer( + batch, padding=True, truncation=True, return_tensors="pt" + ) + with torch.no_grad(): + model_output = self.model(**encoded_input) + batch_embeddings = self.mean_pooling( + model_output, encoded_input["attention_mask"] + ) + batched_embeddings.extend(batch_embeddings) + + return torch.nn.functional.normalize( + torch.stack(batched_embeddings), p=2, dim=1 + ) + + def remove_redundant_data(self, embeddings, lines): + """Removes redundant lines based on semantic similarity + using embeddings.""" + cleaned_lines = [lines[0]] # Always include the first line + for i in range(1, len(lines)): + similarity = torch.cosine_similarity( + embeddings[i].unsqueeze(0), embeddings[i - 1].unsqueeze(0) + ) + if similarity.item() < 0.86899: # Threshold for redundancy + cleaned_lines.append(lines[i]) + return "\n".join(cleaned_lines) + + def convert(self, html_content): + """Converts HTML content to Markdown format.""" + try: + curated_html = self.curate_content(html_content) + markdown_content = md( + 
curated_html, + strip_tags=self.strip_tags, + convert_links=self.convert_links, + ).strip() + lines = markdown_content.split("\n") + embeddings = self.process_embeddings(lines) + return self.remove_redundant_data(embeddings, lines) + except Exception as e: + logging.error("Error during conversion: %s", e) + raise + + def curate_content(self, html): + """Curates the HTML content by removing specified elements and tags.""" + try: + soup = BeautifulSoup(html, "html.parser") + for selector in [ + "header", + "footer", + "nav", + ".navbar", + ".menu", + ".footer-links", + "#sidebar", + "#ad-container", + 'div[class*="cookie"], div[class*="banner"]', + "aside", + ".pagination", + "form", + ]: + for element in soup.select(selector): + element.decompose() + for tag in self.strip_tags: + for s in soup(tag): + s.decompose() + return str(soup) + except Exception as e: + logging.error("Error in curating HTML content: %s", e) + return html + + +class DatasetFormatter: + """ + A class to format a dataset of HTML entries into structured Markdown. + + Attributes: + converter (HTMLToMarkdownConverter): An instance of \ + HTMLToMarkdownConverter for HTML to Markdown conversion. + + Methods: + format_entry(entry): Formats a single dataset entry into Markdown. + structure_markdown(title, url, content): Structures \ + Markdown content with headers and links. + format_dataset(data): Formats an entire dataset \ + of entries into Markdown. + """ + + def __init__(self, converter): + self.converter = converter + + def format_entry(self, entry): + """Format a single entry from the dataset.""" + try: + title = entry.get("title", "Untitled") + url = entry.get("url", "") + html_content = entry.get("html", "") + logging.info("Formatted entry: %s", title) + markdown_content = self.converter.convert(html_content) + return self.structure_markdown(title, url, markdown_content) + except Exception as e: + logging.error("Error formatting entry: %s", e) + return "" + + def structure_markdown(self, title, url, content): + """Structure the Markdown content with headers, lists, etc.""" + structured_content = f"## {title}\n\n" + if url: + structured_content += f"[Read More]({url})\n\n" + structured_content += ( + content.strip() + ) # Remove leading and trailing whitespace/newlines + return structured_content + + def format_dataset(self, data): + """Format the entire dataset.""" + formatted_content = [] + for entry in data: + formatted_entry = self.format_entry(entry) + formatted_content.append(formatted_entry) + return "\n\n".join( + formatted_content + ) # Ensure proper newline separation between entries + + +def load_json_files(pattern): + """ + Load data from multiple JSON files matching a pattern. + + Args: + pattern (str): Glob pattern to match files. + + Returns: + list: Aggregated data from all matched files. + """ + aggregated_data = [] + for file_path in glob.glob(pattern): + with open(file_path, "r", encoding="utf-8") as file: + aggregated_data.extend(json.load(file)) + return aggregated_data + + +def save_output_in_chunks(file_path, contents, chunk_size=1024): + """ + Save the given content into a file in chunks. + + Args: + file_path (str): Path where the output should be saved. + contents (iterable): The content to be written to the file. + chunk_size (int): The size of each chunk to be written. 
+ """ + with open(file_path, "w", encoding="utf-8") as file: + for content in contents: + file.write(content) + if len(content) > chunk_size: + file.flush() # Flush after writing a large chunk + + +def chunk_dataset(data, chunk_size): + """ + Yields chunks of the dataset for processing. + + Args: + data (iterable): The dataset to be chunked. + chunk_size (int): The size of each chunk. + + Yields: + iterable: A chunk of the dataset. + """ + logging.info("Dividing dataset into chunks of size %s.", chunk_size) + for i in range(0, len(data), chunk_size): + yield data[i : i + chunk_size] + + +def process_chunk(chunk): + """ + Processes a single chunk of the dataset. + + Args: + chunk (iterable): A chunk of the dataset to be processed. + + Returns: + str: The formatted Markdown content of the chunk. + """ + logging.info("Processing a new chunk of the dataset.") + formatter = DatasetFormatter(HTMLToMarkdownConverter()) + return formatter.format_dataset(chunk) + + +def main(): + """ + Main function to load, process, and save the dataset. + + Performs the steps: + - Load data from JSON. + - Chunk the data. + - Process each chunk in parallel. + - Save the processed data in chunks. + """ + logging.basicConfig(level=logging.INFO) + try: + pattern = "output*.json" # Pattern to match JSON files + original_data = load_json_files(pattern) + chunk_size = 512 # Adjust chunk size as needed + max_threads = 10 # Adjust the maximum number of threads as needed + + chunks = list(chunk_dataset(original_data, chunk_size)) + formatted_contents = [] + + logging.info("Processing and saving dataset in chunks.") + with ThreadPoolExecutor(max_workers=max_threads) as executor: + results = executor.map(process_chunk, chunks) + for result in results: + formatted_contents.append(result) + + output_file_name = "gpt-crawler-curated_markdown.md" + save_output_in_chunks(output_file_name, formatted_contents) + logging.info("Content formatted and saved in chunks successfully.") + + logging.info("\nConversion process successful. Exiting program.") + except Exception as e: + logging.error("An error occurred in the main function: %s", e) + + +if __name__ == "__main__": + main() diff --git a/src/gpt-crawler/src/core.ts b/src/gpt-crawler/src/core.ts new file mode 100644 index 0000000..8e03bbe --- /dev/null +++ b/src/gpt-crawler/src/core.ts @@ -0,0 +1,216 @@ +// For more information, see https://crawlee.dev/ +import { PlaywrightCrawler, downloadListOfUrls } from "crawlee"; +import { readFile, writeFile } from "fs/promises"; +import { glob } from "glob"; +import { Config, configSchema } from "./config.js"; +import { Page } from "playwright"; +import { isWithinTokenLimit } from "gpt-tokenizer"; + +let pageCounter = 0; + +export function getPageHtml(page: Page, selector = "body") { + return page.evaluate((selector) => { + // Check if the selector is an XPath + if (selector.startsWith("/")) { + const elements = document.evaluate( + selector, + document, + null, + XPathResult.ANY_TYPE, + null, + ); + let result = elements.iterateNext(); + return result ? 
result.textContent || "" : ""; + } else { + // Handle as a CSS selector + const el = document.querySelector(selector) as HTMLElement | null; + return el?.innerText || ""; + } + }, selector); +} + +export async function waitForXPath(page: Page, xpath: string, timeout: number) { + await page.waitForFunction( + (xpath) => { + const elements = document.evaluate( + xpath, + document, + null, + XPathResult.ANY_TYPE, + null, + ); + return elements.iterateNext() !== null; + }, + xpath, + { timeout }, + ); +} + +export async function crawl(config: Config) { + configSchema.parse(config); + + if (process.env.NO_CRAWL !== "true") { + // PlaywrightCrawler crawls the web using a headless + // browser controlled by the Playwright library. + const crawler = new PlaywrightCrawler({ + // Use the requestHandler to process each of the crawled pages. + async requestHandler({ request, page, enqueueLinks, log, pushData }) { + if (config.cookie) { + // Set the cookie for the specific URL + const cookie = { + name: config.cookie.name, + value: config.cookie.value, + url: request.loadedUrl, + }; + await page.context().addCookies([cookie]); + } + + const title = await page.title(); + pageCounter++; + log.info( + `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`, + ); + + // Use custom handling for XPath selector + if (config.selector) { + if (config.selector.startsWith("/")) { + await waitForXPath( + page, + config.selector, + config.waitForSelectorTimeout ?? 1000, + ); + } else { + await page.waitForSelector(config.selector, { + timeout: config.waitForSelectorTimeout ?? 1000, + }); + } + } + + const html = await getPageHtml(page, config.selector); + + // Save results as JSON to ./storage/datasets/default + await pushData({ title, url: request.loadedUrl, html }); + + if (config.onVisitPage) { + await config.onVisitPage({ page, pushData }); + } + + // Extract links from the current page + // and add them to the crawling queue. + await enqueueLinks({ + globs: + typeof config.match === "string" ? [config.match] : config.match, + }); + }, + // Comment this option to scrape the full website. + maxRequestsPerCrawl: config.maxPagesToCrawl, + // Uncomment this option to see the browser window. + // headless: false, + preNavigationHooks: [ + // Abort requests for certain resource types + async ({ page, log }) => { + // If there are no resource exclusions, return + const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? []; + if (RESOURCE_EXCLUSTIONS.length === 0) { + return; + } + await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) => + route.abort("aborted"), + ); + log.info( + `Aborting requests for as this is a resource excluded route`, + ); + }, + ], + }); + + const SITEMAP_SUFFIX = "sitemap.xml"; + const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX); + + if (isUrlASitemap) { + const listOfUrls = await downloadListOfUrls({ url: config.url }); + + // Add the initial URL to the crawling queue. + await crawler.addRequests(listOfUrls); + + // Run the crawler + await crawler.run(); + } else { + // Add first URL to the queue and start the crawl. + await crawler.run([config.url]); + } + } +} + +export async function write(config: Config) { + const jsonFiles = await glob("storage/datasets/default/*.json", { + absolute: true, + }); + + console.log(`Found ${jsonFiles.length} files to combine...`); + + let currentResults: Record[] = []; + let currentSize: number = 0; + let fileCounter: number = 1; + const maxBytes: number = config.maxFileSize + ? 
config.maxFileSize * 1024 * 1024 + : Infinity; + + const getStringByteSize = (str: string): number => + Buffer.byteLength(str, "utf-8"); + + const nextFileName = (): string => + `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`; + + const writeBatchToFile = async (): Promise => { + await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2)); + console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`); + currentResults = []; + currentSize = 0; + fileCounter++; + }; + + let estimatedTokens: number = 0; + + const addContentOrSplit = async ( + data: Record, + ): Promise => { + const contentString: string = JSON.stringify(data); + const tokenCount: number | false = isWithinTokenLimit( + contentString, + config.maxTokens || Infinity, + ); + + if (typeof tokenCount === "number") { + if (estimatedTokens + tokenCount > config.maxTokens!) { + // Only write the batch if it's not empty (something to write) + if (currentResults.length > 0) { + await writeBatchToFile(); + } + // Since the addition of a single item exceeded the token limit, halve it. + estimatedTokens = Math.floor(tokenCount / 2); + currentResults.push(data); + } else { + currentResults.push(data); + estimatedTokens += tokenCount; + } + } + + currentSize += getStringByteSize(contentString); + if (currentSize > maxBytes) { + await writeBatchToFile(); + } + }; + + // Iterate over each JSON file and process its contents. + for (const file of jsonFiles) { + const fileContent = await readFile(file, "utf-8"); + const data: Record = JSON.parse(fileContent); + await addContentOrSplit(data); + } + + // Check if any remaining data needs to be written to a file. + if (currentResults.length > 0) { + await writeBatchToFile(); + } +} diff --git a/src/gpt-crawler/src/main.ts b/src/gpt-crawler/src/main.ts new file mode 100644 index 0000000..ed24577 --- /dev/null +++ b/src/gpt-crawler/src/main.ts @@ -0,0 +1,5 @@ +import { defaultConfig } from "../config.js"; +import { crawl, write } from "./core.js"; + +await crawl(defaultConfig); +await write(defaultConfig); diff --git a/src/gpt-crawler/tests/test_conv_html_to_markdown.py b/src/gpt-crawler/tests/test_conv_html_to_markdown.py new file mode 100644 index 0000000..42a042b --- /dev/null +++ b/src/gpt-crawler/tests/test_conv_html_to_markdown.py @@ -0,0 +1,76 @@ +import sys +import os +import unittest +import json + +# Add the parent directory to the Python path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +from src.conv_html_to_markdown import HTMLToMarkdownConverter, DatasetFormatter + + +class TestHTMLToMarkdownConverter(unittest.TestCase): + def setUp(self): + self.converter = HTMLToMarkdownConverter() + self.formatter = DatasetFormatter(self.converter) + self.html_content = "

<h1>This is a test</h1><p>This is a paragraph.</p>

" + self.markdown_content = "# This is a test\n\nThis is a paragraph." + + def test_convert(self): + self.assertEqual( + self.converter.convert(self.html_content), self.markdown_content + ) + + def test_curate_content(self): + self.assertEqual( + self.converter.curate_content(self.html_content), self.html_content + ) + + def test_format_entry(self): + entry = {"title": "Test", "url": "www.test.com", "html": self.html_content} + self.assertEqual( + self.formatter.format_entry(entry), + f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}", + ) + + def test_structure_markdown(self): + self.assertEqual( + self.formatter.structure_markdown( + "Test", "www.test.com", self.markdown_content + ), + f"## Test\n\n[Read More](www.test.com)\n\n{self.markdown_content}", + ) + + def test_format_dataset(self): + data = [ + {"title": "Test 1", "url": "www.test1.com", "html": self.html_content}, + {"title": "Test 2", "url": "www.test2.com", "html": self.html_content}, + ] + self.assertEqual( + self.formatter.format_dataset(data), + f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}", + ) + + def test_load_json(self): + with open("output.json", "r", encoding="utf-8") as file: + expected_data = json.load(file) + self.assertEqual(load_json("output.json"), expected_data) + + def test_chunk_dataset(self): + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + chunk_size = 3 + expected_chunks = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]] + self.assertEqual(list(chunk_dataset(data, chunk_size)), expected_chunks) + + def test_process_chunk(self): + chunk = [ + {"title": "Test 1", "url": "www.test1.com", "html": self.html_content}, + {"title": "Test 2", "url": "www.test2.com", "html": self.html_content}, + ] + self.assertEqual( + process_chunk(chunk), + f"## Test 1\n\n[Read More](www.test1.com)\n\n{self.markdown_content}\n\n## Test 2\n\n[Read More](www.test2.com)\n\n{self.markdown_content}", + ) + + +if __name__ == "__main__": + unittest.main