diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml index 890b37d0..17e707e7 100644 --- a/.github/workflows/pylint.yaml +++ b/.github/workflows/pylint.yaml @@ -20,6 +20,7 @@ on: - 'examples/**' - 'gptcache/**' - 'tests/**' + - 'docs/**' - '!**.md' - '.github/workflows/**' workflow_dispatch: @@ -37,3 +38,13 @@ jobs: run: | pip install pylint==2.10.2 make pylint_check + - name: Install doc dependency + shell: bash + working-directory: docs + run: | + pip install -r requirements.txt + - name: Make the readthedoc html + shell: bash + working-directory: docs + run: | + make html diff --git a/.gitignore b/.gitignore index e28aebc2..a972ff9a 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,5 @@ dmypy.json **/example.py **/example.db **/.chroma +docs/references/* +!docs/references/index.rst \ No newline at end of file diff --git a/README.md b/README.md index 4977b5ff..af774442 100644 --- a/README.md +++ b/README.md @@ -351,6 +351,7 @@ The **Vector Store** module helps find the K most similar requests from the inpu - [x] Support [Hnswlib](https://github.com/nmslib/hnswlib), header-only C++/python library for fast approximate nearest neighbors. - [x] Support [pgvector](https://github.com/pgvector/pgvector), open-source vector similarity search for Postgres. - [x] Support [chroma](https://github.com/chroma-core/chroma), the AI-native open-source embedding database. + - [x] Support [DocArray](https://github.com/docarray/docarray), DocArray is a library for representing, sending and storing multi-modal data, perfect for Machine Learning applications. - [ ] Support qdrant - [ ] Support weaviate - [ ] Support other vector databases. diff --git a/docs/Makefile b/docs/Makefile index d4bb2cbb..c2f2a664 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,6 +5,7 @@ # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build +SPHINXAUTOBUILD ?= sphinx-autobuild SOURCEDIR = . BUILDDIR = _build diff --git a/docs/_exts/indexCon.py b/docs/_exts/indexCon.py index a91e3c0b..0af4f6ac 100644 --- a/docs/_exts/indexCon.py +++ b/docs/_exts/indexCon.py @@ -50,32 +50,7 @@ def preprocess(self): with open('toc.bak', 'r') as t: f.write(t.read()) - - - - - - - - - - - - - - - - - - - - - - - - - if __name__ == '__main__': index = IndexCon('../../README.md') diff --git a/docs/conf.py b/docs/conf.py index 8f7767ed..4f350227 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,34 +12,25 @@ import os import sys -from m2r2 import parse_from_file -sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('..')) -sys.path.insert(0, os.path.abspath('_exts')) +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("..")) +sys.path.insert(0, os.path.abspath("_exts")) from _exts.docgen import DocGen from _exts.indexCon import IndexCon -# -- Preactions -------------------------------------------------------------- - -# Prepare docs -docgen = DocGen(output_dir='references') -docgen.generate('gptcache') - -IndexCon('../README.md', 'index.rst') - # -- Project information ----------------------------------------------------- -project = 'GPTCache' -copyright = '2023, Zilliz Inc' -author = 'Zilliz Inc.' +project = "GPTCache" +copyright = "2023, Zilliz Inc" +author = "Zilliz Inc." 
# The full version, including alpha/beta/rc tags -release = 'main' +release = "main" html_title = project -html_last_updated_fmt = '%b %d, %Y' +html_last_updated_fmt = "%b %d, %Y" # -- General configuration --------------------------------------------------- @@ -49,29 +40,28 @@ # ones. extensions = [ - 'myst_nb', - 'sphinx.ext.autodoc', - 'sphinx.ext.autodoc.typehints', - 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', - 'sphinxcontrib.autodoc_pydantic', - 'sphinx_copybutton', - 'sphinx_panels', - 'sphinx_toolbox.collapse' + "myst_nb", + "sphinx.ext.autodoc", + "sphinx.ext.autodoc.typehints", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx_copybutton", + "sphinx_panels", + "sphinx_toolbox.collapse", + "sphinxcontrib.autodoc_pydantic", ] - -myst_enable_extensions = [ - "dollarmath", - "amsmath", - "deflist", - "html_admonition", - "html_image", - "colon_fence", - "smartquotes", - "replacements" -] +intersphinx_mapping = { + "torch": ("https://pytorch.org/docs/stable/", None), + "numpy": ("https://numpy.org/devdocs/", None), + "python": ("https://docs.python.org/3", None), +} +autodoc_member_order = "bysource" +autodoc_mock_imports = ["httpx"] +autodoc_inherit_docstrings = False autodoc_pydantic_model_show_json = False autodoc_pydantic_field_list_validators = False @@ -83,19 +73,19 @@ autodoc_pydantic_model_undoc_members = False source_suffix = { - '.rst': 'restructuredtext', - '.ipynb': 'myst-nb', - '.md': 'myst-nb', + ".rst": "restructuredtext", + ".ipynb": "myst-nb", + ".md": "myst-nb", } # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "references/client*"] # -- Options for HTML output ------------------------------------------------- @@ -105,20 +95,20 @@ # # html_theme = 'alabaster' # html_theme = 'python_docs_theme' -html_theme = 'sphinx_book_theme' +html_theme = "sphinx_book_theme" html_theme_options = { - 'path_to_docs': 'docs', - 'repository_url': 'https://github.com/zilliztech/GPTCache', - 'use_repository_button': True + "path_to_docs": "docs", + "repository_url": "https://github.com/zilliztech/GPTCache", + "use_repository_button": True, } html_context = { - 'display_github': True, # Integrate GitHub - 'github_user': 'zilliztech', # Username - 'github_repo': 'GPTCache', # Repo name - 'github_version': 'main', # Version - 'conf_py_path': '/docs' # Path in the checkout to the docs root + "display_github": True, # Integrate GitHub + "github_user": "zilliztech", # Username + "github_repo": "GPTCache", # Repo name + "github_version": "main", # Version + "conf_py_path": "/docs/", # Path in the checkout to the docs root } html_show_sphinx = False @@ -126,7 +116,15 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named 'default.css' will overwrite the builtin 'default.css'. 
-html_static_path = ['_static'] +html_static_path = ["_static"] + +nb_execution_mode = "off" +myst_enable_extensions = ["colon_fence"] + +# -- Preactions -------------------------------------------------------------- + +# Prepare docs +docgen = DocGen(output_dir="references") +docgen.generate("gptcache") -nb_execution_mode = 'off' -myst_enable_extensions = ['colon_fence'] \ No newline at end of file +IndexCon("../README.md", "index.rst") diff --git a/docs/index.rst b/docs/index.rst index b360f7a9..5edbeb15 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,126 +1,338 @@ -.. GPTCache documentation master file, created by - sphinx-quickstart on Tue Apr 4 12:07:10 2023. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. -GPTCache: A Library for Creating Semantic Cache for LLM Queries -=========================================================================== +GPTCache : A Library for Creating Semantic Cache for LLM Queries +================================================================ -Boost LLM API Speed by 100x ⚑, Slash Costs by 10x πŸ’° +Slash Your LLM API Costs by 10x πŸ’°, Boost Speed by 100x ⚑ ----- -.. image:: https://img.shields.io/pypi/v/gptcache?label=Release&color - :width: 100 - :alt: release +.. image:: https://img.shields.io/pypi/v/gptcache?label=Release&color&logo=Python :target: https://pypi.org/project/gptcache/ + :alt: Release -.. image:: https://img.shields.io/pypi/dm/gptcache.svg?color=bright-green - :width: 100 - :alt: pip_downloads + +.. image:: https://img.shields.io/pypi/dm/gptcache.svg?color=bright-green&logo=Pypi :target: https://pypi.org/project/gptcache/ + :alt: pip download + + +.. image:: https://img.shields.io/codecov/c/github/zilliztech/GPTCache/dev?label=Codecov&logo=codecov&token=E30WxqBeJJ + :target: https://codecov.io/gh/zilliztech/GPTCache + :alt: Codecov + .. image:: https://img.shields.io/badge/License-MIT-blue.svg - :width: 100 - :alt: License :target: https://opensource.org/license/mit/ + :alt: License -.. image:: https://dcbadge.vercel.app/api/server/Q8C6WEjSWV?compact=true&style=flat - :width: 100 - :alt: discord - :target: https://discord.gg/Q8C6WEjSWV .. image:: https://img.shields.io/twitter/url/https/twitter.com/zilliz_universe.svg?style=social&label=Follow%20%40Zilliz - :width: 100 - :alt: Twitter :target: https://twitter.com/zilliz_universe + :alt: Twitter + + +.. image:: https://img.shields.io/discord/1092648432495251507?label=Discord&logo=discord + :target: https://discord.gg/Q8C6WEjSWV + :alt: Discord + + +πŸŽ‰ GPTCache has been fully integrated with πŸ¦œοΈπŸ”—\ `LangChain `_ ! Here are detailed `usage instructions `_. + +🐳 `The GPTCache server docker image `_ has been released, which means that **any language** will be able to use GPTCache! + +πŸ“” This project is undergoing swift development, and as such, the API may be subject to change at any time. For the most up-to-date information, please refer to the latest `documentation `_ and `release note `_. Quick Install --------------- +------------- ``pip install gptcache`` -What is GPTCache? +πŸš€ What is GPTCache? -------------------- ChatGPT and various large language models (LLMs) boast incredible versatility, enabling the development of a wide range of applications. However, as your application grows in popularity and encounters higher traffic levels, the expenses related to LLM API calls can become substantial. 
Additionally, LLM services might exhibit slow response times, especially when dealing with a significant number of requests. To tackle this challenge, we have created GPTCache, a project dedicated to building a semantic cache for storing LLM responses. +😊 Quick Start +-------------- -Getting Started --------------------- +**Note**\ : -**Note**: -- You can quickly try GPTCache and put it into a production environment without heavy development. However, please note that the repository is still under heavy development. -- By default, only a limited number of libraries are installed to support the basic cache functionalities. When you need to use additional features, the related libraries will be **automatically installed**. -- Make sure that the Python version is **3.8.1 or higher**, check: ``python --version`` -- If you encounter issues installing a library due to a low pip version, run: ``python -m pip install --upgrade pip``. +* You can quickly try GPTCache and put it into a production environment without heavy development. However, please note that the repository is still under heavy development. +* By default, only a limited number of libraries are installed to support the basic cache functionalities. When you need to use additional features, the related libraries will be **automatically installed**. +* Make sure that the Python version is **3.8.1 or higher**\ , check: ``python --version`` +* If you encounter issues installing a library due to a low pip version, run: ``python -m pip install --upgrade pip``. dev install -```````````` -:: - +^^^^^^^^^^^ - # clone GPTCache repo - git clone https://github.com/zilliztech/GPTCache.git - cd GPTCache +.. code-block:: bash - # install the repo - pip install -r requirements.txt - python setup.py install + # clone GPTCache repo + git clone -b dev https://github.com/zilliztech/GPTCache.git + cd GPTCache + # install the repo + pip install -r requirements.txt + python setup.py install example usage -`````````````` +^^^^^^^^^^^^^ -These examples will help you understand how to use exact and similar matching with caching. +These examples will help you understand how to use exact and similar matching with caching. You can also run the example on `Colab `_. And more examples you can refer to the `Bootcamp `_ Before running the example, **make sure** the OPENAI_API_KEY environment variable is set by executing ``echo $OPENAI_API_KEY``. -If it is not already set, it can be set by using ``export OPENAI_API_KEY=YOUR_API_KEY`` on Unix/Linux/MacOS systems or `set OPENAI_API_KEY=YOUR_API_KEY` on Windows systems. +If it is not already set, it can be set by using ``export OPENAI_API_KEY=YOUR_API_KEY`` on Unix/Linux/MacOS systems or ``set OPENAI_API_KEY=YOUR_API_KEY`` on Windows systems. + +.. + + It is important to note that this method is only effective temporarily, so if you want a permanent effect, you'll need to modify the environment variable configuration file. For instance, on a Mac, you can modify the file located at ``/etc/profile``. + + +.. collapse:: Click to SHOW examples + + + **OpenAI API original usage** + + .. 
code-block:: python
+
+      import os
+      import time
+
+      import openai
+
+
+      def response_text(openai_resp):
+          return openai_resp['choices'][0]['message']['content']
+
+
+      question = "what's chatgpt"
+
+      # OpenAI API original usage
+      openai.api_key = os.getenv("OPENAI_API_KEY")
+      start_time = time.time()
+      response = openai.ChatCompletion.create(
+          model='gpt-3.5-turbo',
+          messages=[
+              {
+                  'role': 'user',
+                  'content': question
+              }
+          ],
+      )
+      print(f'Question: {question}')
+      print("Time consuming: {:.2f}s".format(time.time() - start_time))
+      print(f'Answer: {response_text(response)}\n')
+
+   **OpenAI API + GPTCache, exact match cache**
+
+   ..
+
+      If you ask ChatGPT the exact same two questions, the answer to the second question will be obtained from the cache without requesting ChatGPT again.
+
+
+   .. code-block:: python
+
+      import time
+
+
+      def response_text(openai_resp):
+          return openai_resp['choices'][0]['message']['content']
+
+      print("Cache loading.....")
+
+      # To use GPTCache, that's all you need
+      # -------------------------------------------------
+      from gptcache import cache
+      from gptcache.adapter import openai
+
+      cache.init()
+      cache.set_openai_key()
+      # -------------------------------------------------
+
+      question = "what's github"
+      for _ in range(2):
+          start_time = time.time()
+          response = openai.ChatCompletion.create(
+              model='gpt-3.5-turbo',
+              messages=[
+                  {
+                      'role': 'user',
+                      'content': question
+                  }
+              ],
+          )
+          print(f'Question: {question}')
+          print("Time consuming: {:.2f}s".format(time.time() - start_time))
+          print(f'Answer: {response_text(response)}\n')
+
+   **OpenAI API + GPTCache, similar search cache**
+
+   ..
+
+      After obtaining an answer from ChatGPT in response to several similar questions, the answers to subsequent questions can be retrieved from the cache without the need to request ChatGPT again.
+
+
+   .. code-block:: python
+
+      import time
+
+
+      def response_text(openai_resp):
+          return openai_resp['choices'][0]['message']['content']
+
+      from gptcache import cache
+      from gptcache.adapter import openai
+      from gptcache.embedding import Onnx
+      from gptcache.manager import CacheBase, VectorBase, get_data_manager
+      from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
+
+      print("Cache loading.....")
+
+      onnx = Onnx()
+      data_manager = get_data_manager(CacheBase("sqlite"), VectorBase("faiss", dimension=onnx.dimension))
+      cache.init(
+          embedding_func=onnx.to_embeddings,
+          data_manager=data_manager,
+          similarity_evaluation=SearchDistanceEvaluation(),
+          )
+      cache.set_openai_key()
+
+      questions = [
+          "what's github",
+          "can you explain what GitHub is",
+          "can you tell me more about GitHub",
+          "what is the purpose of GitHub"
+      ]
+
+      for question in questions:
+          start_time = time.time()
+          response = openai.ChatCompletion.create(
+              model='gpt-3.5-turbo',
+              messages=[
+                  {
+                      'role': 'user',
+                      'content': question
+                  }
+              ],
+          )
+          print(f'Question: {question}')
+          print("Time consuming: {:.2f}s".format(time.time() - start_time))
+          print(f'Answer: {response_text(response)}\n')
+
+   **OpenAI API + GPTCache, use temperature**
+
+   ..
+
+      You can always pass a parameter of temperature while requesting the API service or model.
+
+      The range of ``temperature`` is [0, 2], default value is 0.0.
+
+      A higher temperature means a higher possibility of skipping cache search and requesting large model directly.
+      When temperature is 2, it will skip cache and send request to large model directly for sure. When temperature is 0, it will search cache before requesting large model service.
+
+      The default ``post_process_messages_func`` is ``temperature_softmax``. In this case, refer to `API reference `_ to learn about how ``temperature`` affects output.
+
+
+   .. code-block:: python
+
+      import time
+
+      from gptcache import cache, Config
+      from gptcache.manager import manager_factory
+      from gptcache.embedding import Onnx
+      from gptcache.processor.post import temperature_softmax
+      from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
+      from gptcache.adapter import openai
+
+      cache.set_openai_key()
+
+      onnx = Onnx()
+      data_manager = manager_factory("sqlite,faiss", vector_params={"dimension": onnx.dimension})
+
+      cache.init(
+          embedding_func=onnx.to_embeddings,
+          data_manager=data_manager,
+          similarity_evaluation=SearchDistanceEvaluation(),
+          post_process_messages_func=temperature_softmax
+          )
+      # cache.config = Config(similarity_threshold=0.2)
+
+      question = "what's github"
+
+      for _ in range(3):
+          start = time.time()
+          response = openai.ChatCompletion.create(
+              model="gpt-3.5-turbo",
+              temperature=1.0,  # Change temperature here
+              messages=[{
+                  "role": "user",
+                  "content": question
+              }],
+          )
+          print("Time elapsed:", round(time.time() - start, 3))
+          print("Answer:", response["choices"][0]["message"]["content"])
+
+To use GPTCache exclusively, only the following lines of code are required, and there is no need to modify any existing code.
-> It is important to note that this method is only effective temporarily, so if you want a permanent effect, you'll need to modify the environment variable configuration file. For instance, on a Mac, you can modify the file located at ``/etc/profile``.
+.. code-block:: python
-To use GPTCache exclusively, only the following lines of code are required, and there is no need to modify any existing code.
+   from gptcache import cache
+   from gptcache.adapter import openai
-::
+   cache.init()
+   cache.set_openai_key()
-   from gptcache import cache
-   from gptcache.adapter import openai
+More Docs:
-   cache.init()
-   cache.set_openai_key()
+* `Usage, how to use GPTCache better `_
+* `Features, all features currently supported by the cache `_
+* `Examples, learn better custom caching `_
-More Docs:
+πŸŽ“ Bootcamp
+-----------
-- `Usage, how to use GPTCache better <./usage.html>`_
-- `Features, all features currently supported by the cache <./feature.html>`_
-- `Examples, learn better custom caching `_
-.. toctree::
-   :maxdepth: 1
-   :caption: Getting Started
-   :name: getting-started
-   :hidden:
+* GPTCache with **LangChain**
-   usage.md
-   feature.md
+  * `QA Generation `_
+  * `Question Answering `_
+  * `SQL Chain `_
+  * `BabyAGI User Guide `_
+
+* GPTCache with **OpenAI**
+
+  * `Chat completion `_
+  * `Language Translation `_
+  * `SQL Translate `_
+  * `Multimodal: Image Generation `_
+  * `Multimodal: Speech to Text `_
+
+* GPTCache with **Replicate**
+
+  * `Visual Question Answering `_
+
+* GPTCache with **Temperature Param**
+
+  * `OpenAI Chat `_
+  * `OpenAI Image Creation `_
-What can this help with?
--------------------------
+😎 What can this help with?
+---------------------------

 GPTCache offers the following primary benefits:

-- **Decreased expenses**: Most LLM services charge fees based on a combination of number of requests and `token count `_. By caching query results, GPTCache reduces both the number of requests and the number of tokens sent to the LLM service. This minimizes the overall cost of using the service.
-- **Enhanced performance**: LLMs employ generative AI algorithms to generate responses in real-time, a process that can sometimes be time-consuming. However, when a similar query is cached, the response time significantly improves, as the result is fetched directly from the cache, eliminating the need to interact with the LLM service. In most situations, GPTCache can also provide superior query throughput compared to standard LLM services.
-- **Improved scalability and availability**: LLM services frequently enforce `rate limits `_, which are constraints that APIs place on the number of times a user or client can access the server within a given timeframe. Hitting a rate limit means that additional requests will be blocked until a certain period has elapsed, leading to a service outage. With GPTCache, you can easily scale to accommodate an increasing volume of of queries, ensuring consistent performance as your application's user base expands.
-- **Flexible development environment**: When developing LLM applications, an LLM APIs connection is required to prove concepts. GPTCache offers the same interface as LLM APIs and can store LLM-generated or mocked data. This helps to verify your application's features without connecting to the LLM APIs or even the network.
-How does it work?
------------------
+* **Decreased expenses**\ : Most LLM services charge fees based on a combination of number of requests and `token count `_. GPTCache effectively minimizes your expenses by caching query results, which in turn reduces the number of requests and tokens sent to the LLM service. As a result, you can enjoy a more cost-efficient experience when using the service.
+* **Enhanced performance**\ : LLMs employ generative AI algorithms to generate responses in real-time, a process that can sometimes be time-consuming. However, when a similar query is cached, the response time significantly improves, as the result is fetched directly from the cache, eliminating the need to interact with the LLM service. In most situations, GPTCache can also provide superior query throughput compared to standard LLM services.
+* **Adaptable development and testing environment**\ : As a developer working on LLM applications, you're aware that connecting to LLM APIs is generally necessary, and comprehensive testing of your application is crucial before moving it to a production environment. GPTCache provides an interface that mirrors LLM APIs and accommodates storage of both LLM-generated and mocked data. This feature enables you to effortlessly develop and test your application, eliminating the need to connect to the LLM service.
+* **Improved scalability and availability**\ : LLM services frequently enforce `rate limits `_\ , which are constraints that APIs place on the number of times a user or client can access the server within a given timeframe. Hitting a rate limit means that additional requests will be blocked until a certain period has elapsed, leading to a service outage. With GPTCache, you can easily scale to accommodate an increasing volume of queries, ensuring consistent performance as your application's user base expands.
+
+πŸ€” How does it work?
+--------------------

 Online services often exhibit data locality, with users frequently accessing popular or trending content. Cache systems take advantage of this behavior by storing commonly accessed data, which in turn reduces data retrieval time, improves response times, and eases the burden on backend servers.
Traditional cache systems typically utilize an exact match between a new query and a cached query to determine if the requested content is available in the cache before fetching the data. @@ -130,55 +342,145 @@ GPTCache employs embedding algorithms to convert queries into embeddings and use Featuring a modular design, GPTCache makes it easy for users to customize their own semantic cache. The system offers various implementations for each module, and users can even develop their own implementations to suit their specific needs. -In a semantic cache, false positives can occur during cache hits and false negatives during cache misses. GPTCache provides three metrics to evaluate its performance: +In a semantic cache, you may encounter false positives during cache hits and false negatives during cache misses. GPTCache offers three metrics to gauge its performance, which are helpful for developers to optimize their caching systems: -- Precision: the ratio of true positives to the total of true positives and false positives. -- Recall: the ratio of true positives to the total of true positives and false negatives. -- Latency: the time required for a query to be processed and the corresponding data to be fetched from the cache. - -A `sample benchmark `_ is included for users to start with assessing the performance of their semantic cache. +* **Hit Ratio**\ : This metric quantifies the cache's ability to fulfill content requests successfully, compared to the total number of requests it receives. A higher hit ratio indicates a more effective cache. +* **Latency**\ : This metric measures the time it takes for a query to be processed and the corresponding data to be retrieved from the cache. Lower latency signifies a more efficient and responsive caching system. +* **Recall**\ : This metric represents the proportion of queries served by the cache out of the total number of queries that should have been served by the cache. Higher recall percentages indicate that the cache is effectively serving the appropriate content. +A `sample benchmark `_ is included for users to start with assessing the performance of their semantic cache. -Modules +πŸ€— Modules ---------- -.. image:: ./GPTCacheStructure.png -You can take a look at modules below to learn more about system design and architecture. +.. image:: GPTCacheStructure.png + :target: GPTCacheStructure.png + :alt: GPTCache Struct + + + +* **LLM Adapter**\ : + The LLM Adapter is designed to integrate different LLM models by unifying their APIs and request protocols. GPTCache offers a standardized interface for this purpose, with current support for ChatGPT integration. + + * [x] Support OpenAI ChatGPT API. + * [x] Support `langchain `_. + * [x] Support `minigpt4 `_. + * [x] Support `Llamacpp `_. + * [x] Support `dolly `_. + * [ ] Support other LLMs, such as Hugging Face Hub, Bard, Anthropic. + +* **Multimodal Adapter (experimental)**\ : + The Multimodal Adapter is designed to integrate different large multimodal models by unifying their APIs and request protocols. GPTCache offers a standardized interface for this purpose, with current support for integrations of image generation, audio transcription. + + * [x] Support OpenAI Image Create API. + * [x] Support OpenAI Audio Transcribe API. + * [x] Support Replicate BLIP API. + * [x] Support Stability Inference API. + * [x] Support Hugging Face Stable Diffusion Pipeline (local inference). + * [ ] Support other multimodal services or self-hosted large multimodal models. 
+
+* **Embedding Generator**\ :
+  This module is created to extract embeddings from requests for similarity search. GPTCache offers a generic interface that supports multiple embedding APIs, and presents a range of solutions to choose from.
+
+  * [x] Disable embedding. This will turn GPTCache into a keyword-matching cache.
+  * [x] Support OpenAI embedding API.
+  * [x] Support `ONNX `_ with the GPTCache/paraphrase-albert-onnx model.
+  * [x] Support `Hugging Face `_ embedding with transformers, ViTModel, Data2VecAudio.
+  * [x] Support `Cohere `_ embedding API.
+  * [x] Support `fastText `_ embedding.
+  * [x] Support `SentenceTransformers `_ embedding.
+  * [x] Support `Timm `_ models for image embedding.
+  * [ ] Support other embedding APIs.
+
+* **Cache Storage**\ :
+  **Cache Storage** is where the response from LLMs, such as ChatGPT, is stored. Cached responses are retrieved to assist in evaluating similarity and are returned to the requester if there is a good semantic match. At present, GPTCache supports SQLite and offers a universally accessible interface for extension of this module.
+
+  * [x] Support `SQLite `_.
+  * [x] Support `DuckDB `_.
+  * [x] Support `PostgreSQL `_.
+  * [x] Support `MySQL `_.
+  * [x] Support `MariaDB `_.
+  * [x] Support `SQL Server `_.
+  * [x] Support `Oracle `_.
+  * [ ] Support `MongoDB `_.
+  * [ ] Support `Redis `_.
+  * [ ] Support `Minio `_.
+  * [ ] Support `HBase `_.
+  * [ ] Support `ElasticSearch `_.
+  * [ ] Support other storages.
+
+* **Vector Store**\ :
+  The **Vector Store** module helps find the K most similar requests from the input request's extracted embedding. The results can help assess similarity. GPTCache provides a user-friendly interface that supports various vector stores, including Milvus, Zilliz Cloud, and FAISS. More options will be available in the future.
+
+  * [x] Support `Milvus `_\ , an open-source vector database for production-ready AI/LLM applications.
+  * [x] Support `Zilliz Cloud `_\ , a fully-managed cloud vector database based on Milvus.
+  * [x] Support `Milvus Lite `_\ , a lightweight version of Milvus that can be embedded into your Python application.
+  * [x] Support `FAISS `_\ , a library for efficient similarity search and clustering of dense vectors.
+  * [x] Support `Hnswlib `_\ , header-only C++/python library for fast approximate nearest neighbors.
+  * [x] Support `pgvector `_\ , open-source vector similarity search for Postgres.
+  * [x] Support `chroma `_\ , the AI-native open-source embedding database.
+  * [ ] Support qdrant
+  * [ ] Support weaviate
+  * [ ] Support other vector databases.
+
+* **Cache Manager**\ :
+  The **Cache Manager** is responsible for controlling the operation of both the **Cache Storage** and **Vector Store**.
+
+  * **Eviction Policy**\ :
+    Currently, GPTCache makes decisions about evictions based solely on the number of lines. This approach can result in inaccurate resource evaluation and may cause out-of-memory (OOM) errors. We are actively investigating and developing a more sophisticated strategy.
+
+    * [x] Support LRU eviction policy.
+    * [x] Support FIFO eviction policy.
+    * [ ] Support more complicated eviction policies.
+
+* **Similarity Evaluator**\ :
+  This module collects data from both the **Cache Storage** and **Vector Store**\ , and uses various strategies to determine the similarity between the input request and the requests from the **Vector Store**. Based on this similarity, it determines whether a request matches the cache. 
GPTCache provides a standardized interface for integrating various strategies, along with a collection of implementations to use. The following similarity definitions are currently supported or will be supported in the future: + + * [x] The distance we obtain from the **Vector Store**. + * [x] A model-based similarity determined using the GPTCache/albert-duplicate-onnx model from `ONNX `_. + * [x] Exact matches between the input request and the requests obtained from the **Vector Store**. + * [x] Distance represented by applying linalg.norm from numpy to the embeddings. + * [ ] BM25 and other similarity measurements. + * [ ] Support other model serving framework such as PyTorch. + + **Note**\ :Not all combinations of different modules may be compatible with each other. For instance, if we disable the **Embedding Extractor**\ , the **Vector Store** may not function as intended. We are currently working on implementing a combination sanity check for **GPTCache**. + +πŸ˜‡ Roadmap +---------- -- `LLM Adapter `_ -- `Embedding Generator `_ -- `Cache Storage `_ -- `Vector Store `_ -- `Cache Manager `_ -- `Similarity Evaluator `_ +Coming soon! `Stay tuned! `_ -.. toctree:: - :maxdepth: 2 - :caption: Modules - :name: modules - :hidden: +😍 Contributing +--------------- - modules/llm_adapter - modules/embedding_generator - modules/cache_storage - modules/vector_store - modules/cache_manager - modules/similarity_evaluator +We are extremely open to contributions, be it through new features, enhanced infrastructure, or improved documentation. -**Note**: -Not all combinations of different modules may be compatible with each other. For instance, if we disable the **Embedding Extractor**, the **Vector Store** may not function as intended. We are currently working on implementing a combination sanity check for **GPTCache**. +For comprehensive instructions on how to contribute, please refer to our `contribution guide `_. -.. Examples -.. ----------- +.. toctree:: + :maxdepth: 1 + :caption: Getting Started + :name: getting-started + :hidden: + usage.md + feature.md + release_note.md -References ----------------- +.. toctree:: + :maxdepth: 1 + :caption: Bootcamp + :name: bootcamp + :hidden: -For more information about API and examples, you can checkout `API References <./references/index.html>`_. + bootcamp/langchain/index + bootcamp/openai/index + bootcamp/replicate/index + bootcamp/temperature/index .. toctree:: :maxdepth: 1 @@ -188,24 +490,10 @@ For more information about API and examples, you can checkout `API References <. references/index - -Roadmap -------- - -Coming soon! `Stay tuned! `_ - - -Contributing ---------------- - -WWe are extremely open to contributions, be it through new features, enhanced infrastructure, or improved documentation. - -For comprehensive instructions on how to contribute, please refer to our `contribution guide `_. - .. toctree:: :maxdepth: 1 :caption: Contributing :name: contributing :hidden: - contributing.md + contributing.md \ No newline at end of file diff --git a/docs/references/index.rst b/docs/references/index.rst index f112f593..0abbc6a2 100644 --- a/docs/references/index.rst +++ b/docs/references/index.rst @@ -1,2 +1,20 @@ πŸ₯Έ API References ================= + +.. 
toctree::
+   :maxdepth: 1
+   :caption: Contents:
+
+
+   gptcache
+   adapter
+   client
+   config
+   core
+   embedding
+   manager
+   processor
+   report
+   session
+   similarity_evaluation
+   utils
\ No newline at end of file
diff --git a/docs/usage.md b/docs/usage.md
index 605ba69a..8098c7e4 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -316,7 +316,7 @@ for _ in range(3):
 
 ### Use GPTCache server
 
-GPTCache now supports building a server with caching and conversation capabilities. You can start a customized GPTCache service within a few lines. Here is a simple example to show how to build and interact with GPTCache server. For more detailed information, arguments, parameters, refer to [this](../examples/README.md).
+GPTCache now supports building a server with caching and conversation capabilities. You can start a customized GPTCache service within a few lines. Here is a simple example to show how to build and interact with GPTCache server. For more detailed information, arguments, parameters, refer to [this](https://github.com/zilliztech/gpt-cache/tree/main/examples).
 
 **Start server**
 
diff --git a/examples/README.md b/examples/README.md
index 210692ba..14a800da 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -591,7 +591,7 @@ config:
 - pre_function: The pre-processing function.
 - post_function: The post-processing function.
 
-For `model_src`, `evaluation`, `storage_config` options, check [README.md](../README.md) for more.
+For `model_src`, `evaluation`, `storage_config` options, check [README.md](https://github.com/zilliztech/gpt-cache/blob/main/README.md) for more.
 
 **Use the docker to start the GPTCache server**
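
As a rough sketch of what that server section goes on to cover, the GPTCache server can be started from the published Docker image and queried over plain HTTP. The image name, port, and `?prompt=` endpoints below are assumptions based on typical GPTCache server usage rather than text taken from this diff, so verify them against the examples README linked above:

```bash
# Pull and run the GPTCache server image (image name and tag are assumed)
docker pull zilliz/gptcache:latest
docker run -p 8000:8000 -it zilliz/gptcache:latest

# Write an answer for a prompt into the cache (assumed query-parameter API)
curl -X PUT -d "receive a hello message" "http://localhost:8000?prompt=hello"

# Read the cached answer back for the same prompt
curl -X GET "http://localhost:8000?prompt=hello"
```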