milvus build utilities, manifests, and seed generation script WIP

Signed-off-by: greg pereira <[email protected]>
instructlab · May 26, 2024 · 8e88733 · 8e88733
1 parent e85a64d
commit 8e88733
Show file tree

Hide file tree

Showing 17 changed files with 348 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,6 +12,9 @@ models
 generated
 .idea
 .DS_Store
+milvus/seed/data/*
+*.venv
+*venv
 
 # UI assets
 **/node_modules

diff --git a/milvus/build/Containerfile b/milvus/build/Containerfile
@@ -0,0 +1,2 @@
+# Milvus image with the embedded-etcd configuration file baked in.
+FROM docker.io/milvusdb/milvus:master-20240426-bed6363f
+# COPY is the preferred instruction for plain local files; ADD additionally
+# handles URLs and archive extraction, neither of which is needed here.
+COPY embedEtcd.yaml /milvus/configs/embedEtcd.yaml
diff --git a/milvus/build/Makefile b/milvus/build/Makefile
@@ -0,0 +1,55 @@
+# Image coordinates. Every `?=` knob can be overridden from the environment or
+# the command line, e.g. `make build REGISTRY_ORG=myorg`.
+REGISTRY ?= quay.io
+REGISTRY_ORG ?= ai-lab
+COMPONENT := vector_dbs
+
+IMAGE ?= $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/milvus:latest
+
+# Platform passed to `podman build`; defaults to the host architecture.
+ARCH ?= $(shell uname -m)
+PLATFORM ?= linux/$(ARCH)
+
+# Ports published 1:1 by the `run` target. REST_PORT is also the port probed by
+# the container health check.
+# NOTE(review): `gRCP_PORT` looks like a typo for gRPC; the name is kept so that
+# existing overrides and references keep working.
+gRCP_PORT := 19530
+REST_PORT := 9091
+CLIENT_PORT := 2379
+
+# Host directory bind-mounted at /var/lib/milvus by `run`.
+# $(CURDIR) is make's built-in absolute working directory, so no shell is forked.
+LIB_MILVUS_DIR_MOUNTPATH := $(CURDIR)/volumes/milvus
+
+# Build $(IMAGE) for $(PLATFORM) from the Containerfile in this directory.
+.PHONY: build
+build:
+	podman build --platform $(PLATFORM) --file Containerfile --tag $(IMAGE) .
+
+# Start a detached `milvus run standalone` container named milvus-standalone,
+# with the host data directory bind-mounted and a curl-based health check.
+# podman's stdout (the container ID) is discarded.
+.PHONY: run
+run:
+	podman run --detach \
+		--name milvus-standalone \
+		--security-opt seccomp:unconfined \
+		--env ETCD_USE_EMBED=true \
+		--env ETCD_CONFIG_PATH=/milvus/configs/embedEtcd.yaml \
+		--env COMMON_STORAGETYPE=local \
+		--volume $(LIB_MILVUS_DIR_MOUNTPATH):/var/lib/milvus \
+		--publish $(gRCP_PORT):$(gRCP_PORT) \
+		--publish $(REST_PORT):$(REST_PORT) \
+		--publish $(CLIENT_PORT):$(CLIENT_PORT) \
+		--health-cmd="curl -f http://localhost:$(REST_PORT)/healthz" \
+		--health-interval=30s \
+		--health-start-period=90s \
+		--health-timeout=20s \
+		--health-retries=3 \
+		$(IMAGE) \
+		milvus run standalone >/dev/null
+
+# Stop the container; the leading `-` lets make carry on if podman fails
+# (for example because the container is not running).
+.PHONY: stop
+stop:
+	-podman stop milvus-standalone
+
+# Force-remove the container; a failure from podman is ignored as well.
+.PHONY: delete
+delete:
+	-podman rm --force milvus-standalone
+
+# Force-remove every container, running or stopped, whose image column is
+# exactly $(IMAGE). The whole recipe is one shell command joined by `\`.
+.PHONY: podman-clean
+podman-clean:
+	@ids=$$(podman ps -a --format "{{.ID}} {{.Image}}" | awk '$$2 == "$(IMAGE)" {print $$1}'); \
+	echo "removing all containers with IMAGE=$(IMAGE)"; \
+	for cid in $$ids; do \
+		echo "Removing container: $$cid,"; \
+		podman rm -f $$cid; \
+	done
diff --git a/milvus/build/embedEtcd.yaml b/milvus/build/embedEtcd.yaml
@@ -0,0 +1,5 @@
+listen-client-urls: http://0.0.0.0:2379
+advertise-client-urls: http://0.0.0.0:2379
+quota-backend-bytes: 4294967296
+auto-compaction-mode: revision
+auto-compaction-retention: '1000'
diff --git a/milvus/build/merlinite-qq.sh b/milvus/build/merlinite-qq.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+## EXPECTED INPUT IS STRING ECAPSULATED
+input="$1"
+echo "input: $input"
+request_body='{"model":"ibm/merlinite-7b","logprobs":false,"messages":[{"role": "system","content": "You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."},{"role":"user","content": "'$input'"}],"stream":false}'
+echo $request_body
+curl -X 'POST' 'https://merlinite-7b-vllm-openai.apps.fmaas-backend.fmaas.res.ibm.com/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -k -d $request_body
diff --git a/milvus/build/volumes/Containerfile b/milvus/build/volumes/Containerfile
@@ -0,0 +1,2 @@
+# Milvus image with the embedded-etcd configuration file baked in.
+FROM docker.io/milvusdb/milvus:master-20240426-bed6363f
+# COPY is the preferred instruction for plain local files; ADD additionally
+# handles URLs and archive extraction, neither of which is needed here.
+COPY embedEtcd.yaml /milvus/configs/embedEtcd.yaml
diff --git a/milvus/build/volumes/Makefile b/milvus/build/volumes/Makefile
@@ -0,0 +1,55 @@
+# Image coordinates. Every `?=` knob can be overridden from the environment or
+# the command line, e.g. `make build REGISTRY_ORG=myorg`.
+REGISTRY ?= quay.io
+REGISTRY_ORG ?= ai-lab
+COMPONENT := vector_dbs
+
+IMAGE ?= $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/milvus:latest
+
+# Platform passed to `podman build`; defaults to the host architecture.
+ARCH ?= $(shell uname -m)
+PLATFORM ?= linux/$(ARCH)
+
+# Ports published 1:1 by the `run` target. REST_PORT is also the port probed by
+# the container health check.
+# NOTE(review): `gRCP_PORT` looks like a typo for gRPC; the name is kept so that
+# existing overrides and references keep working.
+gRCP_PORT := 19530
+REST_PORT := 9091
+CLIENT_PORT := 2379
+
+# Host directory bind-mounted at /var/lib/milvus by `run`; it is resolved
+# relative to the directory make runs in.
+# $(CURDIR) is make's built-in absolute working directory, so no shell is forked.
+LIB_MILVUS_DIR_MOUNTPATH := $(CURDIR)/volumes/milvus
+
+# Build $(IMAGE) for $(PLATFORM) from the Containerfile in this directory.
+.PHONY: build
+build:
+	podman build --platform $(PLATFORM) --file Containerfile --tag $(IMAGE) .
+
+# Start a `milvus run standalone` container named milvus-standalone with the
+# host data directory bind-mounted and a curl-based health check.
+# NOTE(review): this runs in the foreground with -it while stdout is discarded;
+# milvus/build/Makefile uses -d for the same target - confirm which is intended.
+.PHONY: run
+run:
+	mkdir -p "$(LIB_MILVUS_DIR_MOUNTPATH)"
+	podman run -it \
+		--name milvus-standalone \
+		--security-opt seccomp:unconfined \
+		-e ETCD_USE_EMBED=true \
+		-e ETCD_CONFIG_PATH=/milvus/configs/embedEtcd.yaml \
+		-e COMMON_STORAGETYPE=local \
+		-v $(LIB_MILVUS_DIR_MOUNTPATH):/var/lib/milvus \
+		-p $(gRCP_PORT):$(gRCP_PORT) \
+		-p $(REST_PORT):$(REST_PORT) \
+		-p $(CLIENT_PORT):$(CLIENT_PORT) \
+		--health-cmd="curl -f http://localhost:$(REST_PORT)/healthz" \
+		--health-interval=30s \
+		--health-start-period=90s \
+		--health-timeout=20s \
+		--health-retries=3 \
+		$(IMAGE) \
+		milvus run standalone 1> /dev/null
+
+# Stop the container; the leading `-` lets make carry on if podman fails
+# (for example because the container is not running).
+.PHONY: stop
+stop:
+	-podman stop milvus-standalone
+
+# Force-remove the container; a failure from podman is ignored as well.
+.PHONY: delete
+delete:
+	-podman rm --force milvus-standalone
+
+# Force-remove every container whose image column is exactly $(IMAGE).
+# `podman ps -a` is required: without -a only running containers are listed,
+# so stopped ones would survive despite the "removing all containers" message.
+.PHONY: podman-clean
+podman-clean:
+	@container_ids=$$(podman ps -a --format "{{.ID}} {{.Image}}" | awk '$$2 == "$(IMAGE)" {print $$1}'); \
+	echo "removing all containers with IMAGE=$(IMAGE)"; \
+	for id in $$container_ids; do \
+		echo "Removing container: $$id,"; \
+		podman rm -f $$id; \
+	done
diff --git a/milvus/build/volumes/embedEtcd.yaml b/milvus/build/volumes/embedEtcd.yaml
@@ -0,0 +1,5 @@
+listen-client-urls: http://0.0.0.0:2379
+advertise-client-urls: http://0.0.0.0:2379
+quota-backend-bytes: 4294967296
+auto-compaction-mode: revision
+auto-compaction-retention: '1000'
diff --git a/milvus/build/volumes/milvus/.gitkeep b/milvus/build/volumes/milvus/.gitkeep
diff --git a/milvus/seed/.env.example b/milvus/seed/.env.example
@@ -0,0 +1,3 @@
+MODEL_NAME=
+MODEL_ENDPOINT=
+MODEL_TOKEN=
diff --git a/milvus/seed/README.md b/milvus/seed/README.md
diff --git a/milvus/seed/client.py b/milvus/seed/client.py
@@ -0,0 +1,58 @@
+import httpx
+import os
+import ssl
+
+# manage ENV
+model_endpoint=os.getenv('MODEL_ENDPOINT')
+if model_endpoint == "":
+    model_endpoint = "http://localhost:8001"
+
+model_name=os.getenv('MODEL_NAME')
+if model_name == "":
+    model_name = "ibm/merlinite-7b"
+
+model_token=os.getenv('MODEL_TOKEN')
+
+# HTTPS client
+client_key_path = "/home/fedora/client-tls-key.pem2"
+client_crt_path = "/home/fedora/client-tls-crt.pem2"
+server_ca_crt   = "/home/fedora/server-ca-crt.pem2"
+
+ssl_context = ssl.create_default_context(cafile=server_ca_crt)
+ssl_context.load_cert_chain(certfile=client_crt_path, keyfile=client_key_path)
+
+client = httpx.Client(verify=ssl_context)
+
+
+def get_openai_response(prompt, **kwargs):
+    url = model_endpoint
+    headers = {
+        "Authorization": f"Bearer {model_token}",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": model_name,
+        "max_tokens": 4096,
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are an AI language model developed by IBM Research. You are a cautious assistant that carefully follows instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
+            },
+            {
+                "role":"user",
+                "content": prompt
+            }
+        ],
+        "logprobs":False,
+        "stream":False
+    }
+
+    response = client.post(url, json=data, headers=headers)
+    response.raise_for_status()
+    return response.json()
+
+question = """ Question: I am training for an upcoming marathon but I am completely out of shape! Can you help me to implement a plan to prepare me for running a marathon in 12 weeks?
+
+Answer: Let's think step by step. """
+
+# get_openai_response(question)
diff --git a/milvus/seed/new-seed.py b/milvus/seed/new-seed.py
@@ -0,0 +1,31 @@
+import os
+from pymilvus import MilvusClient, DataType
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings
+from tika import parser # pip install tika
+
+def log_step(step_num, step_name) -> None:
+    print("-----------------------------------------------")
+    print(f"{step_num}. {step_name}")
+    print("-----------------------------------------------")
+
+model_name = "ibm/merlinite-7b"
+model_kwargs = {"device": "cpu"}
+encode_kwargs = {"normalize_embeddings": True}
+
+log_step(0, "Generate embeddings")
+embeddings = HuggingFaceBgeEmbeddings(
+    model_name=model_name,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs,
+    query_instruction = "search_query:",
+    embed_instruction = "search_document:"
+)
+
+
+# data_url = "https://orkerhulen.dk/onewebmedia/DnD%205e%20Players%20Handbook%20%28BnW%20OCR%29.pdf"
+# loader = WebBaseLoader(data_url)
+# data = loader.load()
+raw = parser.from_file("data/DnD-5e-Handbook.pdf")
+print(raw['content'])
diff --git a/milvus/seed/requirements-lock.txt b/milvus/seed/requirements-lock.txt
@@ -0,0 +1,78 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==4.3.0
+attrs==23.2.0
+beautifulsoup4==4.12.3
+bs4==0.0.2
+certifi==2024.2.2
+charset-normalizer==3.3.2
+dataclasses-json==0.6.6
+distro==1.9.0
+environs==9.5.0
+filelock==3.14.0
+frozenlist==1.4.1
+fsspec==2024.5.0
+grpcio==1.63.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+huggingface-hub==0.23.1
+idna==3.7
+Jinja2==3.1.4
+joblib==1.4.2
+jsonpatch==1.33
+jsonpointer==2.4
+langchain==0.2.1
+langchain-community==0.2.1
+langchain-core==0.2.1
+langchain-experimental==0.0.59
+langchain-openai==0.1.7
+langchain-text-splitters==0.2.0
+langsmith==0.1.63
+MarkupSafe==2.1.5
+marshmallow==3.21.2
+milvus-lite==2.4.5
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
+networkx==3.3
+numpy==1.26.4
+openai==1.30.3
+orjson==3.10.3
+packaging==23.2
+pandas==2.2.2
+pillow==10.3.0
+protobuf==5.27.0
+pydantic==2.7.1
+pydantic_core==2.18.2
+pymilvus==2.4.3
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+PyYAML==6.0.1
+regex==2024.5.15
+requests==2.32.2
+safetensors==0.4.3
+scikit-learn==1.5.0
+scipy==1.13.1
+sentence-transformers==2.7.0
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.30
+sympy==1.12
+tenacity==8.3.0
+threadpoolctl==3.5.0
+tika==2.6.0
+tiktoken==0.7.0
+tokenizers==0.19.1
+torch==2.3.0
+tqdm==4.66.4
+transformers==4.41.1
+typing-inspect==0.9.0
+typing_extensions==4.12.0
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.1
+yarl==1.9.4
diff --git a/milvus/seed/requirements.txt b/milvus/seed/requirements.txt
@@ -0,0 +1,9 @@
+pymilvus==2.4.3
+langchain==0.2.1
+langchain-community==0.2.1
+langchain-core==0.2.1
+langchain-openai==0.1.7
+langchain-experimental==0.0.59
+tika==2.6.0
+sentence-transformers==2.7.0
+beautifulsoup4==4.12.3
diff --git a/milvus/seed/seed.py b/milvus/seed/seed.py
@@ -0,0 +1,34 @@
+import os
+from pymilvus import MilvusClient, DataType
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings
+from tika import parser # pip install tika
+
+def log_step(step_num, step_name) -> None:
+    print("-----------------------------------------------")
+    print(f"{step_num}. {step_name}")
+    print("-----------------------------------------------")
+
+model_name = "ibm/merlinite-7b"
+model_kwargs = {"device": "gpu"}
+encode_kwargs = {"normalize_embeddings": True}
+
+log_step(0, "Generate embeddings")
+embeddings = HuggingFaceBgeEmbeddings(
+    model_name=model_name,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs,
+    query_instruction = "search_query:",
+    embed_instruction = "search_document:"
+)
+
+log_step(1, "Init text splitter")
+text_splitter = SemanticChunker(embeddings=embeddings)
+log_step(2, "Read Raw data from PDF")
+raw = parser.from_file("data/DnD-5e-Handbook.pdf")
+log_step(3, "Text splitting")
+print(len(raw['content']))
+docs = text_splitter.create_documents([raw['content']])
+# log_step(4, "Log result")
+# print(len(docs))
diff --git a/ui/compose.ui b/ui/compose.ui
@@ -93,6 +93,7 @@ services:
       COMMON_STORAGETYPE: "local"
     volumes:
       - /home/fedora/milvus-volume:/var/lib/milvus
+      # The container-side mount target must be an absolute path.
+      - /home/fedora/instruct-lab-bot/milvus/seed:/data/milvus/seed
     ports:
       - 19530:19530
       - 9091:9091
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		FROM docker.io/milvusdb/milvus:master-20240426-bed6363f
		ADD embedEtcd.yaml /milvus/configs/embedEtcd.yaml