[vector-db-support] Add support for Milvus.io - part 1 (#470)

LangStream · Sep 22, 2023 · 891ea71 · 891ea71
1 parent e9f9ad2
commit 891ea71
Show file tree

Hide file tree

Showing 26 changed files with 1,877 additions and 148 deletions.
diff --git a/examples/applications/query-milvus/.gitignore b/examples/applications/query-milvus/.gitignore
@@ -0,0 +1 @@
+java/lib/*
diff --git a/examples/applications/query-milvus/README.md b/examples/applications/query-milvus/README.md
@@ -0,0 +1,83 @@
+# Indexing a WebSite
+
+This sample application shows how to use the WebCrawler Source Connector and use Milvus.io as a Vector Database.
+
+## Prerequisites
+
+Create an S3 bucket; it will contain only a metadata file for the WebCrawler.
+
+Start a Milvus.io instance, for example using the official Milvus Helm chart.
+
+The LangStream application will create for you a collection named "documents" in the "default" database.
+
+```
+documents (  
+  filename string,
+  chunk_id int,
+  num_tokens int,
+  language string,  
+  text string 
+)
+```
+
+
+## Configure access to the Vector Database
+
+Export some ENV variables in order to configure access to the database:
+
+```bash
+export MILVUS_HOST=...
+export MILVUS_PORT=...
+export MILVUS_USERNAME=...
+export MILVUS_PASSWORD=...
+```
+
+
+The examples/secrets/secrets.yaml resolves those environment variables for you.
+When you go in production you are supposed to create a dedicated secrets.yaml file for each environment.
+
+## Configure an S3 bucket to store the status of the Crawler
+
+The Web Crawling Source Connector requires an S3 bucket to store the status of the crawler.
+It doesn't copy the contents of the web pages, it only stores some metadata.
+
+If you are using AWS S3, you can use the following environment variables:
+
+```
+export S3_BUCKET_NAME=...
+export S3_ENDPOINT=https://s3.amazonaws.com
+export S3_ACCESS_KEY=...
+export S3_SECRET=...
+```
+
+The default configuration uses the internal MinIO service deployed in the local Kubernetes cluster,
+this is useful for testing purposes only and it works only when you deployed LangStream locally.
+
+
+## Configure the pipeline
+
+Edit the file `crawler.yaml` and configure the list of allowed web domains; this is required to prevent the crawler from following links outside your own website.
+Configure the list of seed URLs, for instance with your home page.
+
+The default configuration in this example will crawl the LangStream website.
+
+## Deploy the LangStream application
+
+```
+./bin/langstream apps deploy test -app examples/applications/query-milvus -i examples/instances/kafka-kubernetes.yaml -s examples/secrets/secrets.yaml
+```
+
+## Talk with the Chat bot using the CLI
+Since the application opens a gateway, we can use the gateway API to send and consume messages.
+
+```
+./bin/langstream gateway chat test -cg bot-output -pg user-input -p sessionId=$(uuidgen)
+```
+
+Responses are streamed to the answers-topic. If you want to inspect the history of the raw answers, you can
+consume from the log-topic using the llm-debug gateway:
+
+```
+./bin/langstream gateway consume test llm-debug
+```
+
diff --git a/examples/applications/query-milvus/chatbot.yaml b/examples/applications/query-milvus/chatbot.yaml
@@ -0,0 +1,94 @@
+#
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Topics used by the chatbot pipeline defined in this file.
+topics:
+  # input of the pipeline: the questions asked by the user
+  - name: 'questions-topic'
+    creation-mode: 'create-if-not-exists'
+  # target of the streamed LLM answers (see stream-to-topic)
+  - name: 'answers-topic'
+    creation-mode: 'create-if-not-exists'
+  # output of the last step of the pipeline
+  - name: 'log-topic'
+    creation-mode: 'create-if-not-exists'
+# Error policy of the pipeline: the "skip" action is taken on failure.
+errors:
+  on-failure: 'skip'
+pipeline:
+  # Entry point: reads from questions-topic and turns the incoming document
+  # into JSON, using "question" as the text field.
+  - name: 'convert-to-structure'
+    type: 'document-to-json'
+    input: 'questions-topic'
+    configuration:
+      text-field: 'question'
+  # Compute the embeddings of the question and store them in the record value.
+  - name: 'compute-embeddings'
+    type: 'compute-ai-embeddings'
+    configuration:
+      # This needs to match the name of the model deployment, not the base model
+      model: '{{{secrets.open-ai.embeddings-model}}}'
+      text: '{{% value.question }}'
+      embeddings-field: 'value.question_embeddings'
+      flush-interval: 0
+  # Look up the documents related to the question in the "documents" collection
+  # through the MilvusDatasource resource. The "?" placeholder in the query is
+  # bound to the entries listed under "fields" (presumably in order - confirm
+  # against the LangStream query agent documentation). The result is stored in
+  # value.related_documents.
+  - name: "lookup-related-documents-in-llm"
+    type: "query"
+    configuration:
+      datasource: "MilvusDatasource"
+      # the query is a JSON document: members must be separated by commas
+      query: |
+        {
+          "collection-name": "documents",
+          "vectors": ?,
+          "top-k": 1,
+          "output-fields": ["text"]
+        }
+      fields:
+        - "value.question_embeddings"
+      output-field: "value.related_documents"
+  # Ask the LLM to answer the question, passing the related documents as context.
+  - name: 'ai-chat-completions'
+    type: 'ai-chat-completions'
+    configuration:
+      # This needs to be set to the model deployment name, not the base name
+      model: '{{{secrets.open-ai.chat-completions-model}}}'
+      # field added to the record sent to the log-topic, holding the answer
+      completion-field: 'value.answer'
+      # field holding the prompt that was sent to the LLM
+      log-field: 'value.prompt'
+      # streaming: every chunk returned by the LLM is forwarded to the answers-topic
+      stream-to-topic: 'answers-topic'
+      # 'value' refers to the whole value of the message, so the streamed
+      # answer becomes the entire message
+      stream-response-completion-field: 'value'
+      # chunk batching: to keep the latency of the first message low the agent
+      # sends 1 chunk, then 2 chunks, ... up to min-chunks-per-message (20 here);
+      # bigger messages reduce the per-message overhead on the topic
+      min-chunks-per-message: 20
+      messages:
+        - role: 'system'
+          content: |
+              An user is going to perform a questions, The documents below may help you in answering to their questions.
+              Please try to leverage them in your answer as much as possible.
+              Take into consideration that the user is always asking questions about the LangStream project.
+              If you provide code or YAML snippets, please explicitly state that they are examples.
+              Do not provide information that is not related to the LangStream project.
+            
+              Documents:
+              {{%# value.related_documents}}
+              {{% text}}
+              {{%/ value.related_documents}}
+        - role: 'user'
+          content: '{{% value.question}}'
+  # Last step: drop the intermediate fields and write the record to the log-topic.
+  - name: 'cleanup-response'
+    type: 'drop-fields'
+    output: 'log-topic'
+    configuration:
+      fields:
+        - 'question_embeddings'
+        - 'related_documents'
diff --git a/examples/applications/query-milvus/configuration.yaml b/examples/applications/query-milvus/configuration.yaml
@@ -0,0 +1,34 @@
+#
+#
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+configuration:
+  resources:
+  # OpenAI access; every value is resolved from the "open-ai" secrets.
+  # Triple braces render the secret verbatim (no Mustache HTML-escaping of
+  # characters such as "&" or "="), like the Milvus datasource in this file.
+  - type: "open-ai-configuration"
+    name: "OpenAI Azure configuration"
+    configuration:
+      url: "{{{ secrets.open-ai.url }}}"
+      access-key: "{{{ secrets.open-ai.access-key }}}"
+      provider: "{{{ secrets.open-ai.provider }}}"
+  # Milvus datasource; address and credentials are resolved from the
+  # "milvus" secrets.
+  - type: 'datasource'
+    name: 'MilvusDatasource'
+    configuration:
+      service: 'milvus'
+      host: '{{{ secrets.milvus.host }}}'
+      port: '{{{ secrets.milvus.port }}}'
+      username: '{{{ secrets.milvus.username }}}'
+      password: '{{{ secrets.milvus.password }}}'
+
diff --git a/examples/applications/query-milvus/crawler.yaml b/examples/applications/query-milvus/crawler.yaml
@@ -0,0 +1,97 @@
+#
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Module that crawls a website and writes the resulting chunks to chunks-topic.
+name: 'Crawl a website'
+topics:
+  - name: 'chunks-topic'
+    creation-mode: 'create-if-not-exists'
+resources:
+  size: 2
+pipeline:
+  # Source of the pipeline: the web crawler.
+  - name: 'Crawl the WebSite'
+    type: 'webcrawler-source'
+    configuration:
+      # starting points of the crawl and the domains it may visit
+      seed-urls:
+        - 'https://docs.langstream.ai/'
+      allowed-domains:
+        - 'https://docs.langstream.ai'
+      forbidden-paths: []
+      min-time-between-requests: 500
+      reindex-interval-seconds: 3600
+      max-error-count: 5
+      max-urls: 1000
+      max-depth: 50
+      handle-robots-file: true
+      # this is computed automatically, but you can override it
+      user-agent: ''
+      scan-html-documents: true
+      http-timeout: 10000
+      handle-cookies: true
+      max-unflushed-pages: 100
+      # S3 bucket settings, resolved from the "s3" secrets
+      bucketName: '{{{secrets.s3.bucket-name}}}'
+      endpoint: '{{{secrets.s3.endpoint}}}'
+      access-key: '{{{secrets.s3.access-key}}}'
+      secret-key: '{{{secrets.s3.secret}}}'
+      region: '{{{secrets.s3.region}}}'
+  # Text extraction step (no configuration: defaults are used).
+  - name: 'Extract text'
+    type: 'text-extractor'
+  # Text normalisation: lowercase the text and trim spaces.
+  - name: 'Normalise text'
+    type: 'text-normaliser'
+    configuration:
+      trim-spaces: true
+      make-lowercase: true
+  # Language detection: the "language" property is used for the result
+  # (presumably documents in other languages are discarded - TODO confirm).
+  - name: 'Detect language'
+    type: 'language-detector'
+    configuration:
+      property: 'language'
+      allowedLanguages:
+        - 'en'
+        - 'fr'
+  # Split the text into chunks of size 400 with an overlap of 100,
+  # measured with the cl100k_base length function.
+  - name: 'Split into chunks'
+    type: 'text-splitter'
+    configuration:
+      splitter_type: 'RecursiveCharacterTextSplitter'
+      length_function: 'cl100k_base'
+      chunk_size: 400
+      chunk_overlap: 100
+      keep_separator: false
+      # double quotes are required here: "\n" must be parsed as a newline
+      separators:
+        - "\n\n"
+        - "\n"
+        - ' '
+        - ''
+  # Turn each chunk into JSON with the content in the "text" field,
+  # copying the record properties as well.
+  - name: 'Convert to structured data'
+    type: 'document-to-json'
+    configuration:
+      copy-properties: true
+      text-field: 'text'
+  # Populate fields of the value from the record properties; every field
+  # is declared with type STRING.
+  - name: 'prepare-structure'
+    type: 'compute'
+    configuration:
+      fields:
+        - name: 'value.filename'
+          type: STRING
+          expression: 'properties.url'
+        - name: 'value.chunk_id'
+          type: STRING
+          expression: 'properties.chunk_id'
+        - name: 'value.language'
+          type: STRING
+          expression: 'properties.language'
+        - name: 'value.chunk_num_tokens'
+          type: STRING
+          expression: 'properties.chunk_num_tokens'
+  # Compute the embeddings of each chunk (value.text), store them in
+  # value.embeddings_vector and write the record to chunks-topic.
+  - name: "compute-embeddings"
+    id: "step1"
+    type: "compute-ai-embeddings"
+    output: "chunks-topic"
+    configuration:
+      # This needs to match the name of the model deployment, not the base model.
+      # It is resolved from the same secret used by chatbot.yaml, so that the
+      # documents and the questions are embedded with the same model.
+      model: "{{{secrets.open-ai.embeddings-model}}}"
+      embeddings-field: "value.embeddings_vector"
+      text: "{{% value.text }}"
+      batch-size: 10
+      flush-interval: 500
diff --git a/examples/applications/query-milvus/gateways.yaml b/examples/applications/query-milvus/gateways.yaml
@@ -0,0 +1,43 @@
+#
+#
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+gateways:
+  # Produce gateway for questions-topic; the sessionId parameter is
+  # written to the langstream-client-session-id header.
+  - id: 'user-input'
+    type: 'produce'
+    topic: 'questions-topic'
+    parameters:
+      - 'sessionId'
+    produceOptions:
+      headers:
+        - key: 'langstream-client-session-id'
+          valueFromParameters: 'sessionId'
+
+  # Consume gateway for answers-topic, filtered on the
+  # langstream-client-session-id header using the sessionId parameter.
+  - id: 'bot-output'
+    type: 'consume'
+    topic: 'answers-topic'
+    parameters:
+      - 'sessionId'
+    consumeOptions:
+      filters:
+        headers:
+          - key: 'langstream-client-session-id'
+            valueFromParameters: 'sessionId'
+
+
+  # Consume gateway for log-topic (no parameters, no filters).
+  - id: 'llm-debug'
+    type: 'consume'
+    topic: 'log-topic'