diff --git a/.gitattributes b/.gitattributes
index 3d3699c..c0e9441 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1 @@
-holmes/.git_archival.json export-subst
\ No newline at end of file
+holmes/.git_archival.json export-subst
diff --git a/.github/workflows/build-binaries-and-brew.yaml b/.github/workflows/build-binaries-and-brew.yaml
index 653f581..f238f2e 100644
--- a/.github/workflows/build-binaries-and-brew.yaml
+++ b/.github/workflows/build-binaries-and-brew.yaml
@@ -22,7 +22,7 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: '3.11'
-
+
- name: Install dependencies
if: matrix.os != 'windows-latest'
run: |
@@ -43,7 +43,7 @@ jobs:
if: matrix.os == 'ubuntu-20.04'
run: |
sudo apt-get install -y binutils
-
+
- name: Update package version (Linux)
if: matrix.os == 'ubuntu-20.04'
run: sed -i 's/__version__ = .*/__version__ = "${{ github.ref_name }}"/g' holmes/__init__.py
@@ -67,7 +67,7 @@ jobs:
# regarding the tiktoken part of the command, see https://github.com/openai/tiktoken/issues/80
# regarding the litellm part of the command, see https://github.com/pyinstaller/pyinstaller/issues/8620#issuecomment-2186540504
run: |
- pyinstaller holmes.py --add-data 'holmes/plugins/runbooks/*:holmes/plugins/runbooks' --add-data 'holmes/plugins/prompts/*:holmes/plugins/prompts' --add-data 'holmes/plugins/toolsets/*:holmes/plugins/toolsets' --hidden-import=tiktoken_ext.openai_public --hidden-import=tiktoken_ext --hiddenimport litellm.llms.tokenizers --hiddenimport litellm.litellm_core_utils.tokenizers --collect-data litellm
+ pyinstaller holmes.py --add-data 'holmes/plugins/runbooks/*:holmes/plugins/runbooks' --add-data 'holmes/plugins/prompts/*:holmes/plugins/prompts' --add-data 'holmes/plugins/toolsets/*:holmes/plugins/toolsets' --hidden-import=tiktoken_ext.openai_public --hidden-import=tiktoken_ext --hiddenimport litellm.llms.tokenizers --hiddenimport litellm.litellm_core_utils.tokenizers --collect-data litellm
ls dist
- name: Zip the application (Unix)
@@ -91,7 +91,7 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
- upload_url: ${{ github.event.release.upload_url }}
+ upload_url: ${{ github.event.release.upload_url }}
asset_path: ./holmes-${{ matrix.os }}-${{ github.ref_name }}.zip
asset_name: holmes-${{ matrix.os }}-${{ github.ref_name }}.zip
asset_content_type: application/octet-stream
@@ -105,7 +105,7 @@ jobs:
check-latest:
needs: build
runs-on: ubuntu-20.04
- outputs:
+ outputs:
IS_LATEST: ${{ steps.check-latest.outputs.release == github.ref_name }}
steps:
- id: check-latest
diff --git a/.github/workflows/build-docker-images.yaml b/.github/workflows/build-docker-images.yaml
index ba85866..0bfcbf2 100644
--- a/.github/workflows/build-docker-images.yaml
+++ b/.github/workflows/build-docker-images.yaml
@@ -77,7 +77,7 @@ jobs:
# Note: this ignores the "Set as latest release" checkbox in the GitHub UI
# it isn't possible to check whether that was set or not
# so if you do not want to override the "latest" tag, you should mark the release as a prerelease or a draft
- # for prereleases and drafts we don't tag latest
+ # for prereleases and drafts we don't tag latest
- name: Tag and push Docker image as latest if applicable
if: ${{ github.event.release.prerelease == false && github.event.release.draft == false }}
run: |
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2eaf552..68b44a6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,4 +6,16 @@ repos:
- id: poetry-lock
pass_filenames: false
args:
- - --no-update
\ No newline at end of file
+ - --no-update
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.7.2
+ hooks:
+ - id: ruff
+ entry: ruff check --fix
+ - id: ruff-format
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v5.0.0
+ hooks:
+ - id: detect-private-key
+ - id: end-of-file-fixer
+ - id: trailing-whitespace
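+# Run all hooks locally with `pre-commit run -a` (see also the `check` target in the Makefile, which wraps this via poetry)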
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6d95164..2b36acb 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,7 +15,7 @@ Please make sure to read and observe our [Code of Conduct](https://github.com/ro
## Reporting bugs
-We encourage those interested to contribute code and also appreciate when issues are reported.
+We encourage those interested to contribute code and also appreciate when issues are reported.
- Create a new issue and label it as `bug`
- Clearly state how to reproduce the bug:
@@ -23,7 +23,7 @@ We encourage those interested to contribute code and also appreciate when issues
- Which steps are required to reproduce
- As LLM answers may differ between runs - does it always reproduce, or only occasionally?
-
+
## Contributing Code
- Fork the repository and clone it locally.
diff --git a/Dockerfile b/Dockerfile
index f33f84b..ff138d5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -26,22 +26,34 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.29/deb/Release.key -o Release.key
# Set the architecture-specific kube lineage URLs
-ARG ARM_URL=https://github.com/Avi-Robusta/kube-lineage/releases/download/v2.2.1/kube-lineage-macos-latest-v2.2.1
-ARG AMD_URL=https://github.com/Avi-Robusta/kube-lineage/releases/download/v2.2.1/kube-lineage-ubuntu-latest-v2.2.1
+ARG KUBE_LINEAGE_ARM_URL=https://github.com/Avi-Robusta/kube-lineage/releases/download/v2.2.1/kube-lineage-macos-latest-v2.2.1
+ARG KUBE_LINEAGE_AMD_URL=https://github.com/Avi-Robusta/kube-lineage/releases/download/v2.2.1/kube-lineage-ubuntu-latest-v2.2.1
# Define a build argument to identify the platform
ARG TARGETPLATFORM
# Conditional download based on the platform
RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
- curl -L -o kube-lineage $ARM_URL; \
+ curl -L -o kube-lineage $KUBE_LINEAGE_ARM_URL; \
elif [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
- curl -L -o kube-lineage $AMD_URL; \
+ curl -L -o kube-lineage $KUBE_LINEAGE_AMD_URL; \
else \
echo "Unsupported platform: $TARGETPLATFORM"; exit 1; \
fi
RUN chmod 777 kube-lineage
RUN ./kube-lineage --version
-RUN curl -sSL -o argocd-linux-amd64 https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-amd64
+# Set the architecture-specific argocd URLs
+ARG ARGOCD_ARM_URL=https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-arm64
+ARG ARGOCD_AMD_URL=https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-amd64
+# Conditional download based on the platform
+RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ curl -L -o argocd $ARGOCD_ARM_URL; \
+ elif [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+ curl -L -o argocd $ARGOCD_AMD_URL; \
+ else \
+ echo "Unsupported platform: $TARGETPLATFORM"; exit 1; \
+ fi
+RUN chmod 777 argocd
+RUN ./argocd --help
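+# (illustrative) TARGETPLATFORM is populated automatically by Docker BuildKit, e.g.:
+#   docker buildx build --platform linux/amd64,linux/arm64 -t holmes .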
# Install Helm
RUN curl https://baltocdn.com/helm/signing.asc | gpg --dearmor -o /usr/share/keyrings/helm.gpg \
@@ -100,8 +112,7 @@ COPY --from=builder /app/kube-lineage /usr/local/bin
RUN kube-lineage --version
# Set up ArgoCD
-COPY --from=builder /app/argocd-linux-amd64 /usr/local/bin/argocd
-RUN chmod 555 /usr/local/bin/argocd
+COPY --from=builder /app/argocd /usr/local/bin/argocd
RUN argocd --help
# Set up Helm
diff --git a/Dockerfile.dev b/Dockerfile.dev
index 438e275..be62433 100644
--- a/Dockerfile.dev
+++ b/Dockerfile.dev
@@ -59,7 +59,7 @@ ARG PRIVATE_PACKAGE_REGISTRY="none"
RUN if [ "${PRIVATE_PACKAGE_REGISTRY}" != "none" ]; then \
pip config set global.index-url "${PRIVATE_PACKAGE_REGISTRY}"; \
fi \
- && pip install poetry
+ && pip install poetry
ARG POETRY_REQUESTS_TIMEOUT
RUN poetry config virtualenvs.create false
COPY pyproject.toml poetry.lock /app/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3525541
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+
+
+check:
+ poetry run pre-commit run -a
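+
+# (illustrative) run a single hook instead of all of them, e.g. only ruff:
+#   poetry run pre-commit run ruff -a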
diff --git a/README.md b/README.md
index 0f3618a..edb1cde 100644
--- a/README.md
+++ b/README.md
@@ -631,31 +631,31 @@ Using Grafana Loki
HolmesGPT can consult logs from [Loki](https://grafana.com/oss/loki/) by proxying through a [Grafana](https://grafana.com/oss/grafana/) instance.
-There are 2 parts to configuring access to Grafana Loki: Access/Authentication and search terms.
+To configure the Loki toolset, add the following to your Holmes configuration:
-For access and authentication, add the following environment variables:
-
-* `GRAFANA_URL` - e.g. https://my-org.grafana.net
-* `GRAFANA_API_KEY` - e.g. glsa_bsm6ZS_sdfs25f
+```yaml
+toolsets:
+ grafana/loki:
+ enabled: true
+ config:
+ api_key: "{{ env.GRAFANA_API_KEY }}"
+ url: "http://loki-url"
+```
You can optionally tweak the search terms used by the toolset.
-This is done by appending the following to your Holmes configuration file:
+This is done by appending the following to your Holmes `grafana/loki` configuration:
```yaml
-grafana:
- url: https://my-org.grafana.net #
- api_key: glsa_bsm6ZS_sdfs25f
- loki:
- pod_name_search_key: "pod"
- namespace_search_key: "namespace"
- node_name_search_key: "node"
+pod_name_search_key: "pod"
+namespace_search_key: "namespace"
+node_name_search_key: "node"
```
> You only need to tweak the configuration if your Loki log settings for pod, namespace and node differ from the above defaults.
-The Loki toolset is configured the using the same Grafana settings as the Grafana Tempo toolset.
+
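+For example, if your Loki labels differ from the defaults, the overrides can sit alongside the connection settings (illustrative values; this assumes the search keys are nested under the toolset's `config` block):
+
+```yaml
+toolsets:
+  grafana/loki:
+    enabled: true
+    config:
+      api_key: "{{ env.GRAFANA_API_KEY }}"
+      url: "http://loki-url"
+      pod_name_search_key: "pod_name"
+      namespace_search_key: "k8s_namespace"
+      node_name_search_key: "k8s_node"
+```
+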
Using Grafana Tempo
@@ -664,8 +664,6 @@ HolmesGPT can fetch trace information from Grafana Tempo to debug performance re
Tempo is configured using the same Grafana settings as the Grafana Loki toolset.
-grafana:
- url: https://my-org.grafana.net #
@@ -875,9 +873,9 @@ Configure Slack to send notifications to specific channels. Provide your Slack t
OpenSearch Integration
The OpenSearch toolset (`opensearch`) allows Holmes to consult an opensearch cluster for its health, settings and shards information.
-The toolset supports multiple opensearch or elasticsearch clusters that are configured by editing Holmes' configuration file (or in cluster to the configuration secret):
+The toolset supports multiple OpenSearch or Elasticsearch clusters, which are configured by editing Holmes' configuration file:
-```
+```
opensearch_clusters:
- hosts:
- https://my_elasticsearch.us-central1.gcp.cloud.es.io:443
diff --git a/examples/custom_llm.py b/examples/custom_llm.py
index f3ec0c0..61d04fd 100644
--- a/examples/custom_llm.py
+++ b/examples/custom_llm.py
@@ -1,17 +1,14 @@
-
from typing import Any, Dict, List, Optional, Type, Union
-from holmes.config import Config
from holmes.core.llm import LLM
from litellm.types.utils import ModelResponse
from holmes.core.tool_calling_llm import ToolCallingLLM
from holmes.core.tools import Tool, ToolExecutor
from holmes.plugins.toolsets import load_builtin_toolsets
-from rich.console import Console
from pydantic import BaseModel
from holmes.plugins.prompts import load_and_render_prompt
-import sys
-class MyCustomLLM(LLM):
+
+class MyCustomLLM(LLM):
def get_context_window_size(self) -> int:
return 128000
@@ -21,36 +18,41 @@ def get_maximum_output_token(self) -> int:
def count_tokens_for_message(self, messages: list[dict]) -> int:
return 1
- def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]] = [], tool_choice: Optional[Union[str, dict]] = None, response_format: Optional[Union[dict, Type[BaseModel]]] = None, temperature:Optional[float] = None, drop_params: Optional[bool] = None) -> ModelResponse:
- return ModelResponse(choices=[{
- "finish_reason": "stop",
- "index": 0,
- "message": {
- "role": "assistant",
- "content": "There are no issues with your cluster"
- }
- }],
- usage={
- "prompt_tokens": 0, # Integer
- "completion_tokens": 0,
- "total_tokens": 0
- }
- )
+ def completion(
+ self,
+ messages: List[Dict[str, Any]],
+ tools: Optional[List[Tool]] = [],
+ tool_choice: Optional[Union[str, dict]] = None,
+ response_format: Optional[Union[dict, Type[BaseModel]]] = None,
+ temperature: Optional[float] = None,
+ drop_params: Optional[bool] = None,
+ ) -> ModelResponse:
+ return ModelResponse(
+ choices=[
+ {
+ "finish_reason": "stop",
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "There are no issues with your cluster",
+ },
+ }
+ ],
+ usage={
+ "prompt_tokens": 0, # Integer
+ "completion_tokens": 0,
+ "total_tokens": 0,
+ },
+ )
def ask_holmes():
- console = Console()
-
prompt = "what issues do I have in my cluster"
system_prompt = load_and_render_prompt("builtin://generic_ask.jinja2")
tool_executor = ToolExecutor(load_builtin_toolsets())
- ai = ToolCallingLLM(
- tool_executor,
- max_steps=10,
- llm=MyCustomLLM()
- )
+ ai = ToolCallingLLM(tool_executor, max_steps=10, llm=MyCustomLLM())
response = ai.prompt_call(system_prompt, prompt)
diff --git a/examples/custom_runbooks.yaml b/examples/custom_runbooks.yaml
index b5d76e6..57e2e39 100644
--- a/examples/custom_runbooks.yaml
+++ b/examples/custom_runbooks.yaml
@@ -4,4 +4,4 @@ runbooks:
instructions: >
Analyze pod logs for errors and also read the monogodb logs
Correlate between the two logs and try to find the root cause of the issue.
- Based on the logs, report the session ids of impacted transactions
\ No newline at end of file
+ Based on the logs, report the session ids of impacted transactions
diff --git a/examples/custom_toolset.yaml b/examples/custom_toolset.yaml
index 1f21411..a5b516f 100644
--- a/examples/custom_toolset.yaml
+++ b/examples/custom_toolset.yaml
@@ -11,7 +11,7 @@ toolsets:
docs_url: "https://kubernetes.io/docs/home/"
# Icon URL. Used for display in the UI
icon_url: "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRPKA-U9m5BxYQDF1O7atMfj9EMMXEoGu4t0Q&s"
- # Tags for categorizing toolsets, 'core' will be used for all Holmes features (both cli's commands and chats in UI).
+    # Tags for categorizing toolsets. 'core' will be used for all Holmes features (both CLI commands and chats in the UI).
# The 'cluster' tag is used for UI functionality, while 'cli' is for command-line-specific tools
tags:
- core
@@ -24,7 +24,7 @@ toolsets:
- name: "switch_cluster"
# The LLM looks at this description when deciding what tools are relevant for each task
description: "Used to switch between multiple kubernetes contexts(clusters)"
-
+
# A templated bash command using Jinja2 templates
# The LLM can only control parameters that you expose as template variables like {{ this_variable }}
command: "kubectl config use-context {{ cluster_name }}"
diff --git a/helm/holmes/Chart.yaml b/helm/holmes/Chart.yaml
index 96bdcef..cb3e8e1 100644
--- a/helm/holmes/Chart.yaml
+++ b/helm/holmes/Chart.yaml
@@ -7,4 +7,4 @@ type: application
# we use 0.0.1 as a placeholder for the version because Helm won't allow `0.0.0` and we want to be able to run
# `helm install` on development checkouts without updating this file. The version doesn't matter in that case anyway.
version: 0.0.1
-appVersion: 0.0.0
\ No newline at end of file
+appVersion: 0.0.0
diff --git a/helm/holmes/templates/holmesgpt-service-account.yaml b/helm/holmes/templates/holmesgpt-service-account.yaml
index baee4e9..64c3f55 100644
--- a/helm/holmes/templates/holmesgpt-service-account.yaml
+++ b/helm/holmes/templates/holmesgpt-service-account.yaml
@@ -229,4 +229,4 @@ subjects:
- kind: ServiceAccount
name: {{ .Release.Name }}-holmes-service-account
namespace: {{ .Release.Namespace }}
-{{- end }}
\ No newline at end of file
+{{- end }}
diff --git a/holmes/.git_archival.json b/holmes/.git_archival.json
index 8da9c6a..30e70ca 100644
--- a/holmes/.git_archival.json
+++ b/holmes/.git_archival.json
@@ -5,4 +5,3 @@
"refs": "$Format:%D$",
"describe": "$Format:%(describe:tags=true,match=v[0-9]*)$"
}
-
diff --git a/holmes/__init__.py b/holmes/__init__.py
index 7ce3586..dc81558 100644
--- a/holmes/__init__.py
+++ b/holmes/__init__.py
@@ -4,7 +4,7 @@
import sys
# For relative imports to work in Python 3.6 - see https://stackoverflow.com/a/49375740
-this_path = os.path.dirname(os.path.realpath(__file__))
+this_path = os.path.dirname(os.path.realpath(__file__))
sys.path.append(this_path)
# This is patched by github actions during release
@@ -19,28 +19,50 @@ def get_version() -> str:
# we are running from an unreleased dev version
try:
# Get the latest git tag
- tag = subprocess.check_output(["git", "describe", "--tags"], stderr=subprocess.STDOUT, cwd=this_path).decode().strip()
+ tag = (
+ subprocess.check_output(
+ ["git", "describe", "--tags"], stderr=subprocess.STDOUT, cwd=this_path
+ )
+ .decode()
+ .strip()
+ )
# Get the current branch name
- branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], stderr=subprocess.STDOUT, cwd=this_path).decode().strip()
+ branch = (
+ subprocess.check_output(
+ ["git", "rev-parse", "--abbrev-ref", "HEAD"],
+ stderr=subprocess.STDOUT,
+ cwd=this_path,
+ )
+ .decode()
+ .strip()
+ )
# Check if there are uncommitted changes
- status = subprocess.check_output(["git", "status", "--porcelain"], stderr=subprocess.STDOUT, cwd=this_path).decode().strip()
+ status = (
+ subprocess.check_output(
+ ["git", "status", "--porcelain"],
+ stderr=subprocess.STDOUT,
+ cwd=this_path,
+ )
+ .decode()
+ .strip()
+ )
dirty = "-dirty" if status else ""
return f"{tag}-{branch}{dirty}"
-
+
except Exception:
pass
# we are running without git history, but we still might have git archival data (e.g. if we were pip installed)
- archival_file_path = os.path.join(this_path, '.git_archival.json')
+ archival_file_path = os.path.join(this_path, ".git_archival.json")
if os.path.exists(archival_file_path):
try:
- with open(archival_file_path, 'r') as f:
+ with open(archival_file_path, "r") as f:
archival_data = json.load(f)
return f"{archival_data['refs']}-{archival_data['hash-short']}"
except Exception:
pass
- return f"dev-version"
+ return "dev-version"
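+
+
+# (illustrative) values get_version() can return:
+#   - a release tag (e.g. "0.10.1") when CI patched __version__ during the release workflow
+#   - "{tag}-{branch}" with an optional "-dirty" suffix when running from a local git checkout
+#   - "{refs}-{hash-short}" when running from a git archive (e.g. a pip install)
+#   - "dev-version" as the final fallback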
diff --git a/holmes/common/env_vars.py b/holmes/common/env_vars.py
index 4414b79..a8e00d6 100644
--- a/holmes/common/env_vars.py
+++ b/holmes/common/env_vars.py
@@ -7,10 +7,14 @@ def load_bool(env_var, default: bool):
return json.loads(s.lower())
-ENABLED_BY_DEFAULT_TOOLSETS = os.environ.get('ENABLED_BY_DEFAULT_TOOLSETS', 'kubernetes/core,kubernetes/logs,robusta,internet')
-HOLMES_HOST = os.environ.get('HOLMES_HOST', '0.0.0.0')
-HOLMES_PORT = int(os.environ.get('HOLMES_PORT', 5050))
-ROBUSTA_CONFIG_PATH = os.environ.get('ROBUSTA_CONFIG_PATH', "/etc/robusta/config/active_playbooks.yaml")
+ENABLED_BY_DEFAULT_TOOLSETS = os.environ.get(
+ "ENABLED_BY_DEFAULT_TOOLSETS", "kubernetes/core,kubernetes/logs,robusta,internet"
+)
+HOLMES_HOST = os.environ.get("HOLMES_HOST", "0.0.0.0")
+HOLMES_PORT = int(os.environ.get("HOLMES_PORT", 5050))
+ROBUSTA_CONFIG_PATH = os.environ.get(
+ "ROBUSTA_CONFIG_PATH", "/etc/robusta/config/active_playbooks.yaml"
+)
ROBUSTA_ACCOUNT_ID = os.environ.get("ROBUSTA_ACCOUNT_ID", "")
STORE_URL = os.environ.get("STORE_URL", "")
diff --git a/holmes/config.py b/holmes/config.py
index ca6edd2..3fbc444 100644
--- a/holmes/config.py
+++ b/holmes/config.py
@@ -1,4 +1,3 @@
-from functools import lru_cache
import logging
import os
import yaml
@@ -6,10 +5,9 @@
from holmes.core.llm import LLM, DefaultLLM
from typing import Any, Dict, List, Optional
-from typing import List, Optional
-from pydantic import FilePath, SecretStr, Field
+from pydantic import FilePath, SecretStr
from pydash.arrays import concat
@@ -496,7 +494,7 @@ def merge_and_override_bultin_toolsets_with_toolsets_config(
@classmethod
def load_from_file(cls, config_file: Optional[str], **kwargs) -> "Config":
if config_file is not None:
- logging.debug(f"Loading config from file %s", config_file)
+ logging.debug("Loading config from file %s", config_file)
config_from_file = load_model_from_file(cls, config_file)
elif os.path.exists(DEFAULT_CONFIG_LOCATION):
logging.debug(
diff --git a/holmes/core/conversations.py b/holmes/core/conversations.py
index 2481b4c..6ba5bb5 100644
--- a/holmes/core/conversations.py
+++ b/holmes/core/conversations.py
@@ -6,6 +6,7 @@
ConversationInvestigationResult,
ToolCallConversationResult,
IssueChatRequest,
+ WorkloadHealthChatRequest,
)
from holmes.plugins.prompts import load_and_render_prompt
from holmes.core.tool_calling_llm import ToolCallingLLM
@@ -14,6 +15,7 @@
DEFAULT_TOOL_SIZE = 10000
+
def calculate_tool_size(
ai: ToolCallingLLM, messages_without_tools: list[dict], number_of_tools: int
) -> int:
@@ -136,14 +138,30 @@ def handle_issue_conversation(
return system_prompt
-def build_issue_chat_messages(issue_chat_request: IssueChatRequest, ai: ToolCallingLLM,
- global_instructions: Optional[Instructions] = None):
+def build_issue_chat_messages(
+ issue_chat_request: IssueChatRequest,
+ ai: ToolCallingLLM,
+ global_instructions: Optional[Instructions] = None,
+):
"""
This function generates a list of messages for an issue conversation and ensures that the message sequence adheres to the model's context window limitations
by truncating tool outputs as necessary before sending them to the LLM.
We always expect conversation_history to be passed in the OpenAI format, which is supported by litellm and passed back by us.
That's why we assume that the first message in the conversation is the system message and truncate tools for it.
+
+ System prompt handling:
+ 1. For new conversations (empty conversation_history):
+ - Creates a new system prompt using generic_ask_for_issue_conversation.jinja2 template
+ - Includes investigation analysis, tools (if any), and issue type information
+ - If there are tools, calculates appropriate tool size and truncates tool outputs
+
+ 2. For existing conversations:
+ - Preserves the conversation history
+ - Updates the first message (system prompt) with recalculated content
+ - Truncates tool outputs if necessary to fit context window
+ - Maintains the original conversation flow while ensuring context limits
+
Example structure of conversation history:
conversation_history = [
# System prompt
@@ -180,11 +198,12 @@ def build_issue_chat_messages(issue_chat_request: IssueChatRequest, ai: ToolCall
tools_for_investigation = issue_chat_request.investigation_result.tools
if not conversation_history or len(conversation_history) == 0:
- user_prompt = add_global_instructions_to_user_prompt(user_prompt, global_instructions)
+ user_prompt = add_global_instructions_to_user_prompt(
+ user_prompt, global_instructions
+ )
number_of_tools_for_investigation = len(tools_for_investigation)
if number_of_tools_for_investigation == 0:
-
system_prompt = load_and_render_prompt(
template_path,
{
@@ -255,7 +274,9 @@ def build_issue_chat_messages(issue_chat_request: IssueChatRequest, ai: ToolCall
},
]
- user_prompt = add_global_instructions_to_user_prompt(user_prompt, global_instructions)
+ user_prompt = add_global_instructions_to_user_prompt(
+ user_prompt, global_instructions
+ )
conversation_history.append(
{
@@ -310,9 +331,58 @@ def build_issue_chat_messages(issue_chat_request: IssueChatRequest, ai: ToolCall
def build_chat_messages(
- ask: str, conversation_history: Optional[List[Dict[str, str]]], ai: ToolCallingLLM,
- global_instructions: Optional[Instructions] = None
+ ask: str,
+ conversation_history: Optional[List[Dict[str, str]]],
+ ai: ToolCallingLLM,
+ global_instructions: Optional[Instructions] = None,
) -> List[dict]:
+ """
+    This function generates a list of messages for a general chat conversation and ensures that the message sequence adheres to the model's context window limitations
+    by truncating tool outputs as necessary before sending them to the LLM.
+
+    We always expect conversation_history to be passed in the OpenAI format, which is supported by litellm and passed back by us.
+    That's why we assume that the first message in the conversation is the system message and truncate tools for it.
+
+ System prompt handling:
+ 1. For new conversations (empty conversation_history):
+ - Creates a new system prompt using generic_ask_conversation.jinja2 template
+ - Uses an empty template context (no specific analysis or tools required)
+ - Adds global instructions to the user prompt if provided
+
+ 2. For existing conversations:
+ - Preserves the conversation history as is
+ - No need to update system prompt as it doesn't contain tool-specific content
+ - Only truncates tool messages if they exist in the conversation
+ - Maintains the original conversation flow while ensuring context limits
+
+ Example structure of conversation history:
+ conversation_history = [
+ # System prompt for general chat
+ {"role": "system", "content": "...."},
+ # User message with a general question
+ {"role": "user", "content": "Can you analyze the logs from my application?"},
+ # Assistant initiates a tool call
+ {
+ "role": "assistant",
+ "content": None,
+ "tool_call": {
+ "name": "fetch_application_logs",
+ "arguments": "{\"service\": \"backend\", \"time_range\": \"last_hour\"}"
+ }
+ },
+ # Tool/Function response
+ {
+ "role": "tool",
+ "name": "fetch_application_logs",
+ "content": "{\"log_entries\": [\"Error in processing request\", \"Connection timeout\"]}"
+ },
+ # Assistant's final response to the user
+ {
+ "role": "assistant",
+ "content": "I've analyzed your application logs and found some issues: there are error messages related to request processing and connection timeouts."
+ },
+ ]
+ """
template_path = "builtin://generic_ask_conversation.jinja2"
if not conversation_history or len(conversation_history) == 0:
@@ -330,9 +400,9 @@ def build_chat_messages(
},
]
return messages
-
+
ask = add_global_instructions_to_user_prompt(ask, global_instructions)
-
+
conversation_history.append(
{
"role": "user",
@@ -354,3 +424,197 @@ def build_chat_messages(
)
truncate_tool_messages(conversation_history, tool_size)
return conversation_history
+
+
+def build_workload_health_chat_messages(
+ workload_health_chat_request: WorkloadHealthChatRequest,
+ ai: ToolCallingLLM,
+ global_instructions: Optional[Instructions] = None,
+):
+ """
+    This function generates a list of messages for a workload health conversation and ensures that the message sequence adheres to the model's context window limitations
+    by truncating tool outputs as necessary before sending them to the LLM.
+
+    We always expect conversation_history to be passed in the OpenAI format, which is supported by litellm and passed back by us.
+    That's why we assume that the first message in the conversation is the system message and truncate tools for it.
+
+ System prompt handling:
+ 1. For new conversations (empty conversation_history):
+ - Creates a new system prompt using kubernetes_workload_chat.jinja2 template
+ - Includes workload analysis, tools (if any), and resource information
+ - If there are tools, calculates appropriate tool size and truncates tool outputs
+
+ 2. For existing conversations:
+ - Preserves the conversation history
+ - Updates the first message (system prompt) with recalculated content
+ - Truncates tool outputs if necessary to fit context window
+ - Maintains the original conversation flow while ensuring context limits
+
+ Example structure of conversation history:
+ conversation_history = [
+ # System prompt with workload analysis
+ {"role": "system", "content": "...."},
+ # User message asking about workload health
+ {"role": "user", "content": "What's the current health status of my deployment?"},
+ # Assistant initiates a tool call
+ {
+ "role": "assistant",
+ "content": None,
+ "tool_call": {
+ "name": "check_workload_metrics",
+ "arguments": "{\"namespace\": \"default\", \"workload\": \"my-deployment\"}"
+ }
+ },
+ # Tool/Function response
+ {
+ "role": "tool",
+ "name": "check_workload_metrics",
+ "content": "{\"cpu_usage\": \"45%\", \"memory_usage\": \"60%\", \"status\": \"Running\"}"
+ },
+ # Assistant's final response to the user
+ {
+ "role": "assistant",
+ "content": "Your deployment is running normally with CPU usage at 45% and memory usage at 60%."
+ },
+ ]
+ """
+
+ template_path = "builtin://kubernetes_workload_chat.jinja2"
+
+ conversation_history = workload_health_chat_request.conversation_history
+ user_prompt = workload_health_chat_request.ask
+ workload_analysis = workload_health_chat_request.workload_health_result.analysis
+ tools_for_workload = workload_health_chat_request.workload_health_result.tools
+ resource = workload_health_chat_request.resource
+
+ if not conversation_history or len(conversation_history) == 0:
+ user_prompt = add_global_instructions_to_user_prompt(
+ user_prompt, global_instructions
+ )
+
+ number_of_tools_for_workload = len(tools_for_workload)
+ if number_of_tools_for_workload == 0:
+ system_prompt = load_and_render_prompt(
+ template_path,
+ {
+ "workload_analysis": workload_analysis,
+ "tools_called_for_workload": tools_for_workload,
+ "resource": resource,
+ },
+ )
+ messages = [
+ {
+ "role": "system",
+ "content": system_prompt,
+ },
+ {
+ "role": "user",
+ "content": user_prompt,
+ },
+ ]
+ return messages
+
+ template_context_without_tools = {
+ "workload_analysis": workload_analysis,
+ "tools_called_for_workload": None,
+ "resource": resource,
+ }
+ system_prompt_without_tools = load_and_render_prompt(
+ template_path, template_context_without_tools
+ )
+ messages_without_tools = [
+ {
+ "role": "system",
+ "content": system_prompt_without_tools,
+ },
+ {
+ "role": "user",
+ "content": user_prompt,
+ },
+ ]
+ tool_size = calculate_tool_size(
+ ai, messages_without_tools, number_of_tools_for_workload
+ )
+
+ truncated_workload_result_tool_calls = [
+ ToolCallConversationResult(
+ name=tool.name,
+ description=tool.description,
+ output=tool.output[:tool_size],
+ )
+ for tool in tools_for_workload
+ ]
+
+ truncated_template_context = {
+ "workload_analysis": workload_analysis,
+ "tools_called_for_workload": truncated_workload_result_tool_calls,
+ "resource": resource,
+ }
+ system_prompt_with_truncated_tools = load_and_render_prompt(
+ template_path, truncated_template_context
+ )
+ return [
+ {
+ "role": "system",
+ "content": system_prompt_with_truncated_tools,
+ },
+ {
+ "role": "user",
+ "content": user_prompt,
+ },
+ ]
+
+ user_prompt = add_global_instructions_to_user_prompt(
+ user_prompt, global_instructions
+ )
+
+ conversation_history.append(
+ {
+ "role": "user",
+ "content": user_prompt,
+ }
+ )
+ number_of_tools = len(tools_for_workload) + len(
+ [message for message in conversation_history if message.get("role") == "tool"]
+ )
+
+ if number_of_tools == 0:
+ return conversation_history
+
+ conversation_history_without_tools = [
+ message for message in conversation_history if message.get("role") != "tool"
+ ]
+ template_context_without_tools = {
+ "workload_analysis": workload_analysis,
+ "tools_called_for_workload": None,
+ "resource": resource,
+ }
+ system_prompt_without_tools = load_and_render_prompt(
+ template_path, template_context_without_tools
+ )
+ conversation_history_without_tools[0]["content"] = system_prompt_without_tools
+
+ tool_size = calculate_tool_size(
+ ai, conversation_history_without_tools, number_of_tools
+ )
+
+ truncated_workload_result_tool_calls = [
+ ToolCallConversationResult(
+ name=tool.name, description=tool.description, output=tool.output[:tool_size]
+ )
+ for tool in tools_for_workload
+ ]
+
+ template_context = {
+ "workload_analysis": workload_analysis,
+ "tools_called_for_workload": truncated_workload_result_tool_calls,
+ "resource": resource,
+ }
+ system_prompt_with_truncated_tools = load_and_render_prompt(
+ template_path, template_context
+ )
+ conversation_history[0]["content"] = system_prompt_with_truncated_tools
+
+ truncate_tool_messages(conversation_history, tool_size)
+
+ return conversation_history
diff --git a/holmes/core/investigation.py b/holmes/core/investigation.py
index c7d7879..7dcb4aa 100644
--- a/holmes/core/investigation.py
+++ b/holmes/core/investigation.py
@@ -1,18 +1,17 @@
-
-from typing import Optional
-from rich.console import Console
from holmes.common.env_vars import HOLMES_POST_PROCESSING_PROMPT
from holmes.config import Config
+from holmes.core.investigation_structured_output import process_response_into_sections
from holmes.core.issue import Issue
from holmes.core.models import InvestigateRequest, InvestigationResult
from holmes.core.supabase_dal import SupabaseDal
from holmes.utils.robusta import load_robusta_api_key
-def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config):
+
+def investigate_issues(
+ investigate_request: InvestigateRequest, dal: SupabaseDal, config: Config
+):
load_robusta_api_key(dal=dal, config=config)
- context = dal.get_issue_data(
- investigate_request.context.get("robusta_issue_id")
- )
+ context = dal.get_issue_data(investigate_request.context.get("robusta_issue_id"))
resource_instructions = dal.get_resource_instructions(
"alert", investigate_request.context.get("issue_type")
@@ -36,13 +35,15 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
issue,
prompt=investigate_request.prompt_template,
post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT,
- sections=investigate_request.sections,
instructions=resource_instructions,
- global_instructions=global_instructions
+ global_instructions=global_instructions,
)
+
+ (text_response, sections) = process_response_into_sections(investigation.result)
+
return InvestigationResult(
- analysis=investigation.result,
- sections=investigation.sections,
+ analysis=text_response,
+ sections=sections,
tool_calls=investigation.tool_calls or [],
instructions=investigation.instructions,
)
diff --git a/holmes/core/investigation_structured_output.py b/holmes/core/investigation_structured_output.py
index 4038443..9c24465 100644
--- a/holmes/core/investigation_structured_output.py
+++ b/holmes/core/investigation_structured_output.py
@@ -1,24 +1,33 @@
-from typing import Any, Dict
+from typing import Any, Dict, Optional, Tuple, Union
+import json
-DEFAULT_SECTIONS = {
- "Alert Explanation": "1-2 sentences explaining the alert itself - note don't say \"The alert indicates a warning event related to a Kubernetes pod doing blah\" rather just say \"The pod XYZ did blah\" because that is what the user actually cares about",
- "Possible Root causes": "What conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains",
- "Next Steps": "what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)",
+from pydantic import RootModel
+
+InputSectionsDataType = Dict[str, str]
+
+OutputSectionsDataType = Optional[Dict[str, Union[str, None]]]
+
+SectionsData = RootModel[OutputSectionsDataType]
+
+DEFAULT_SECTIONS: InputSectionsDataType = {
+ "Alert Explanation": '1-2 sentences explaining the alert itself - note don\'t say "The alert indicates a warning event related to a Kubernetes pod doing blah" rather just say "The pod XYZ did blah" because that is what the user actually cares about',
+ "Investigation": "What you checked and found",
+ "Conclusions and Possible Root causes": "What conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains. Don't say root cause but 'possible root causes'. Be clear to distinguish between what you know for certain and what is a possible explanation",
+ "Next Steps": "What you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)",
"Related logs": "Truncate and share the most relevant logs, especially if these explain the root cause. For example: \nLogs from pod robusta-holmes:\n```\n```\n. Always embed the surroundding +/- 5 log lines to any relevant logs. ",
"App or Infra?": "Explain whether the issue is more likely an infrastructure or an application level issue and why you think that.",
- "External links": "Provide links to external sources. Where to look when investigating this issue. For example provide links to relevant runbooks, etc. Add a short sentence describing each link."
+ "External links": "Provide links to external sources. Where to look when investigating this issue. For example provide links to relevant runbooks, etc. Add a short sentence describing each link.",
}
-def get_output_format_for_investigation(sections: Dict[str, str]) -> Dict[str, Any]:
+def get_output_format_for_investigation(
+ sections: InputSectionsDataType,
+) -> Dict[str, Any]:
properties = {}
required_fields = []
for title, description in sections.items():
- properties[title] = {
- "type": ["string", "null"],
- "description": description
- }
+ properties[title] = {"type": ["string", "null"], "description": description}
required_fields.append(title)
schema = {
@@ -26,19 +35,46 @@ def get_output_format_for_investigation(sections: Dict[str, str]) -> Dict[str, A
"type": "object",
"required": required_fields,
"properties": properties,
- "additionalProperties": False
+ "additionalProperties": False,
}
- output_format = { "type": "json_schema", "json_schema": { "name": "InvestigationResult", "schema": schema, "strict": False} }
+ output_format = {
+ "type": "json_schema",
+ "json_schema": {
+ "name": "InvestigationResult",
+ "schema": schema,
+ "strict": False,
+ },
+ }
return output_format
-def combine_sections(sections: Any) -> str:
- if isinstance(sections, dict):
- content = ''
- for section_title, section_content in sections.items():
- if section_content:
- # content = content + f'\n# {" ".join(section_title.split("_")).title()}\n{section_content}'
- content = content + f'\n# {section_title}\n{section_content}\n'
- return content
- return f"{sections}"
+
+def combine_sections(sections: Dict) -> str:
+ content = ""
+ for section_title, section_content in sections.items():
+ if section_content:
+ content = content + f"\n# {section_title}\n{section_content}\n"
+ return content
+
+
+def process_response_into_sections(response: Any) -> Tuple[str, OutputSectionsDataType]:
+ if isinstance(response, dict):
+        # Even if the result is already structured, run it through the code below to validate the JSON
+ response = json.dumps(response)
+
+ if not isinstance(response, str):
+        # if it's not a string, convert it to a string so it can be parsed below
+ response = str(response)
+
+ try:
+ parsed_json = json.loads(response)
+        # TODO: forcing dict values into strings would make this more resilient, as SectionsData only accepts None/str values
+ sections = SectionsData(root=parsed_json).root
+ if sections:
+ combined = combine_sections(sections)
+ return (combined, sections)
+ except Exception:
+ pass
+
+ return (response, None)
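+
+
+# (illustrative) round trip through the helpers above:
+#   process_response_into_sections('{"Alert Explanation": "Pod foo is crashlooping", "Next Steps": null}')
+#   -> ("\n# Alert Explanation\nPod foo is crashlooping\n",
+#       {"Alert Explanation": "Pod foo is crashlooping", "Next Steps": None})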
diff --git a/holmes/core/issue.py b/holmes/core/issue.py
index f811cd0..71dae44 100644
--- a/holmes/core/issue.py
+++ b/holmes/core/issue.py
@@ -1,29 +1,28 @@
-from datetime import datetime
from strenum import StrEnum
from typing import Optional
from pydantic import BaseModel, ConfigDict
-class IssueStatus (StrEnum):
+class IssueStatus(StrEnum):
OPEN = "open"
CLOSED = "closed"
# TODO: look at finding in Robusta
class Issue(BaseModel):
- model_config = ConfigDict(extra='forbid', validate_default=True)
+ model_config = ConfigDict(extra="forbid", validate_default=True)
# Identifier for the issue - source + issue_id should be unique
- id: str
+ id: str
- # Name of the issue - not necessarily unique
- name: str
+ # Name of the issue - not necessarily unique
+ name: str
# Source of the issue - e.g. jira
source_type: str
- # Identifier for the instance of the source - e.g. Jira project key
+ # Identifier for the instance of the source - e.g. Jira project key
source_instance_id: str
# Link to the issue, when available
@@ -31,12 +30,12 @@ class Issue(BaseModel):
# Raw object from the source - e.g. a dict from the source's API
raw: dict = None
-
+
# these fields are all optional and used for visual presentation of the issue
# there may not be a 1:1 mapping between source fields and these fields, which is OK
# e.g. jira issues can have arbitrary statuses like 'closed' and 'resolved' whereas for presentation sake
# we want to classify as open/closed so we can color the issue red/green
- # if these fields are not present, an LLM may be used to guess them
+ # if these fields are not present, an LLM may be used to guess them
presentation_status: Optional[IssueStatus] = None
# Markdown with key metadata about the issue. Suggested format is several lines each styled as "*X*: Y" and separated by \n
@@ -53,4 +52,3 @@ class Issue(BaseModel):
# created_at: Optional[datetime] = None # Timestamp of when the issue was created
# updated_at: Optional[datetime] = None # Timestamp of when the issue was last updated
# metadata: Optional[dict] = None # All additional metadata from the source (can be hierarchical - e.g. dicts in dicts)
-
\ No newline at end of file
diff --git a/holmes/core/llm.py b/holmes/core/llm.py
index a790222..e0f9f4d 100644
--- a/holmes/core/llm.py
+++ b/holmes/core/llm.py
@@ -1,10 +1,8 @@
-
import logging
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Type, Union
from litellm.types.utils import ModelResponse
-from pydantic.types import SecretStr
from holmes.core.tools import Tool
from pydantic import BaseModel
@@ -19,11 +17,12 @@ def environ_get_safe_int(env_var, default="0"):
except ValueError:
return int(default)
+
OVERRIDE_MAX_OUTPUT_TOKEN = environ_get_safe_int("OVERRIDE_MAX_OUTPUT_TOKEN")
OVERRIDE_MAX_CONTENT_SIZE = environ_get_safe_int("OVERRIDE_MAX_CONTENT_SIZE")
-class LLM:
+class LLM:
@abstractmethod
def get_context_window_size(self) -> int:
pass
@@ -37,21 +36,24 @@ def count_tokens_for_message(self, messages: list[dict]) -> int:
pass
@abstractmethod
- def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]] = [], tool_choice: Optional[Union[str, dict]] = None, response_format: Optional[Union[dict, Type[BaseModel]]] = None, temperature:Optional[float] = None, drop_params: Optional[bool] = None) -> ModelResponse:
+ def completion(
+ self,
+ messages: List[Dict[str, Any]],
+ tools: Optional[List[Tool]] = [],
+ tool_choice: Optional[Union[str, dict]] = None,
+ response_format: Optional[Union[dict, Type[BaseModel]]] = None,
+ temperature: Optional[float] = None,
+ drop_params: Optional[bool] = None,
+ ) -> ModelResponse:
pass
class DefaultLLM(LLM):
-
model: str
api_key: Optional[str]
base_url: Optional[str]
- def __init__(
- self,
- model: str,
- api_key: Optional[str] = None
- ):
+ def __init__(self, model: str, api_key: Optional[str] = None):
self.model = model
self.api_key = api_key
self.base_url = None
@@ -61,7 +63,7 @@ def __init__(
self.check_llm(self.model, self.api_key)
- def check_llm(self, model:str, api_key:Optional[str]):
+ def check_llm(self, model: str, api_key: Optional[str]):
logging.debug(f"Checking LiteLLM model {model}")
# TODO: this WAS a hack to get around the fact that we can't pass in an api key to litellm.validate_environment
# so without this hack it always complains that the environment variable for the api key is missing
@@ -78,26 +80,28 @@ def check_llm(self, model:str, api_key:Optional[str]):
# Required variables for WatsonX:
# - WATSONX_URL: Base URL of your WatsonX instance (required)
# - WATSONX_APIKEY or WATSONX_TOKEN: IBM Cloud API key or IAM auth token (one is required)
- model_requirements = {'missing_keys': [], 'keys_in_environment': True}
+ model_requirements = {"missing_keys": [], "keys_in_environment": True}
if api_key:
os.environ["WATSONX_APIKEY"] = api_key
- if not "WATSONX_URL" in os.environ:
- model_requirements['missing_keys'].append("WATSONX_URL")
- model_requirements['keys_in_environment'] = False
- if not "WATSONX_APIKEY" in os.environ and not "WATSONX_TOKEN" in os.environ:
- model_requirements['missing_keys'].extend(["WATSONX_APIKEY", "WATSONX_TOKEN"])
- model_requirements['keys_in_environment'] = False
+ if "WATSONX_URL" not in os.environ:
+ model_requirements["missing_keys"].append("WATSONX_URL")
+ model_requirements["keys_in_environment"] = False
+ if "WATSONX_APIKEY" not in os.environ and "WATSONX_TOKEN" not in os.environ:
+ model_requirements["missing_keys"].extend(
+ ["WATSONX_APIKEY", "WATSONX_TOKEN"]
+ )
+ model_requirements["keys_in_environment"] = False
# WATSONX_PROJECT_ID is required because we don't let user pass it to completion call directly
- if not "WATSONX_PROJECT_ID" in os.environ:
- model_requirements['missing_keys'].append("WATSONX_PROJECT_ID")
- model_requirements['keys_in_environment'] = False
+ if "WATSONX_PROJECT_ID" not in os.environ:
+ model_requirements["missing_keys"].append("WATSONX_PROJECT_ID")
+ model_requirements["keys_in_environment"] = False
# https://docs.litellm.ai/docs/providers/watsonx#usage---models-in-deployment-spaces
# using custom watsonx deployments might require to set WATSONX_DEPLOYMENT_SPACE_ID env
if "watsonx/deployment/" in self.model:
logging.warning(
- "Custom WatsonX deployment detected. You may need to set the WATSONX_DEPLOYMENT_SPACE_ID "
- "environment variable for proper functionality. For more information, refer to the documentation: "
- "https://docs.litellm.ai/docs/providers/watsonx#usage---models-in-deployment-spaces"
+ "Custom WatsonX deployment detected. You may need to set the WATSONX_DEPLOYMENT_SPACE_ID "
+ "environment variable for proper functionality. For more information, refer to the documentation: "
+ "https://docs.litellm.ai/docs/providers/watsonx#usage---models-in-deployment-spaces"
)
else:
#
@@ -107,8 +111,9 @@ def check_llm(self, model:str, api_key:Optional[str]):
model_requirements = litellm.validate_environment(model=model)
if not model_requirements["keys_in_environment"]:
- raise Exception(f"model {model} requires the following environment variables: {model_requirements['missing_keys']}")
-
+ raise Exception(
+ f"model {model} requires the following environment variables: {model_requirements['missing_keys']}"
+ )
def _strip_model_prefix(self) -> str:
"""
@@ -117,36 +122,49 @@ def _strip_model_prefix(self) -> str:
https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json
"""
model_name = self.model
- if model_name.startswith('openai/'):
- model_name = model_name[len('openai/'):] # Strip the 'openai/' prefix
- elif model_name.startswith('bedrock/'):
- model_name = model_name[len('bedrock/'):] # Strip the 'bedrock/' prefix
- elif model_name.startswith('vertex_ai/'):
- model_name = model_name[len('vertex_ai/'):] # Strip the 'vertex_ai/' prefix
+ if model_name.startswith("openai/"):
+ model_name = model_name[len("openai/") :] # Strip the 'openai/' prefix
+ elif model_name.startswith("bedrock/"):
+ model_name = model_name[len("bedrock/") :] # Strip the 'bedrock/' prefix
+ elif model_name.startswith("vertex_ai/"):
+ model_name = model_name[
+ len("vertex_ai/") :
+ ] # Strip the 'vertex_ai/' prefix
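+        # (illustrative) e.g. "openai/gpt-4o" -> "gpt-4o"; prefixes other than openai/, bedrock/ and vertex_ai/ are left unchanged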
return model_name
-
# this unfortunately does not seem to work for azure if the deployment name is not a well-known model name
- #if not litellm.supports_function_calling(model=model):
+ # if not litellm.supports_function_calling(model=model):
# raise Exception(f"model {model} does not support function calling. You must use HolmesGPT with a model that supports function calling.")
+
def get_context_window_size(self) -> int:
if OVERRIDE_MAX_CONTENT_SIZE:
- logging.debug(f"Using override OVERRIDE_MAX_CONTENT_SIZE {OVERRIDE_MAX_CONTENT_SIZE}")
+ logging.debug(
+ f"Using override OVERRIDE_MAX_CONTENT_SIZE {OVERRIDE_MAX_CONTENT_SIZE}"
+ )
return OVERRIDE_MAX_CONTENT_SIZE
model_name = os.environ.get("MODEL_TYPE", self._strip_model_prefix())
try:
- return litellm.model_cost[model_name]['max_input_tokens']
+ return litellm.model_cost[model_name]["max_input_tokens"]
except Exception:
- logging.warning(f"Couldn't find model's name {model_name} in litellm's model list, fallback to 128k tokens for max_input_tokens")
+ logging.warning(
+                f"Couldn't find model {model_name} in litellm's model list, falling back to 128k tokens for max_input_tokens"
+ )
return 128000
def count_tokens_for_message(self, messages: list[dict]) -> int:
- return litellm.token_counter(model=self.model,
- messages=messages)
+ return litellm.token_counter(model=self.model, messages=messages)
- def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]] = [], tool_choice: Optional[Union[str, dict]] = None, response_format: Optional[Union[dict, Type[BaseModel]]] = None, temperature:Optional[float] = None, drop_params: Optional[bool] = None) -> ModelResponse:
+ def completion(
+ self,
+ messages: List[Dict[str, Any]],
+ tools: Optional[List[Tool]] = [],
+ tool_choice: Optional[Union[str, dict]] = None,
+ response_format: Optional[Union[dict, Type[BaseModel]]] = None,
+ temperature: Optional[float] = None,
+ drop_params: Optional[bool] = None,
+ ) -> ModelResponse:
result = litellm.completion(
model=self.model,
api_key=self.api_key,
@@ -156,7 +174,7 @@ def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]]
base_url=self.base_url,
temperature=temperature,
response_format=response_format,
- drop_params=drop_params
+ drop_params=drop_params,
)
if isinstance(result, ModelResponse):
@@ -166,12 +184,16 @@ def completion(self, messages: List[Dict[str, Any]], tools: Optional[List[Tool]]
def get_maximum_output_token(self) -> int:
if OVERRIDE_MAX_OUTPUT_TOKEN:
- logging.debug(f"Using OVERRIDE_MAX_OUTPUT_TOKEN {OVERRIDE_MAX_OUTPUT_TOKEN}")
+ logging.debug(
+ f"Using OVERRIDE_MAX_OUTPUT_TOKEN {OVERRIDE_MAX_OUTPUT_TOKEN}"
+ )
return OVERRIDE_MAX_OUTPUT_TOKEN
model_name = os.environ.get("MODEL_TYPE", self._strip_model_prefix())
try:
- return litellm.model_cost[model_name]['max_output_tokens']
+ return litellm.model_cost[model_name]["max_output_tokens"]
except Exception:
- logging.warning(f"Couldn't find model's name {model_name} in litellm's model list, fallback to 4096 tokens for max_output_tokens")
+ logging.warning(
+                f"Couldn't find model {model_name} in litellm's model list, falling back to 4096 tokens for max_output_tokens"
+ )
return 4096
diff --git a/holmes/core/models.py b/holmes/core/models.py
index 71683cf..0520647 100644
--- a/holmes/core/models.py
+++ b/holmes/core/models.py
@@ -1,3 +1,4 @@
+from holmes.core.investigation_structured_output import InputSectionsDataType
from holmes.core.tool_calling_llm import ToolCallResult
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, model_validator
@@ -21,7 +22,7 @@ class InvestigateRequest(BaseModel):
include_tool_calls: bool = False
include_tool_call_results: bool = False
prompt_template: str = "builtin://generic_investigation.jinja2"
- sections: Optional[Dict[str, str]] = None
+ sections: Optional[InputSectionsDataType] = None
# TODO in the future
# response_handler: ...
@@ -93,10 +94,16 @@ class ChatRequestBaseModel(BaseModel):
@model_validator(mode="before")
def check_first_item_role(cls, values):
conversation_history = values.get("conversation_history")
- if conversation_history and isinstance(conversation_history, list) and len(conversation_history)>0:
+ if (
+ conversation_history
+ and isinstance(conversation_history, list)
+ and len(conversation_history) > 0
+ ):
first_item = conversation_history[0]
if not first_item.get("role") == "system":
- raise ValueError("The first item in conversation_history must contain 'role': 'system'")
+ raise ValueError(
+ "The first item in conversation_history must contain 'role': 'system'"
+ )
return values
@@ -126,3 +133,14 @@ class ChatResponse(BaseModel):
analysis: str
conversation_history: list[dict]
tool_calls: Optional[List[ToolCallResult]] = []
+
+
+class WorkloadHealthInvestigationResult(BaseModel):
+ analysis: Optional[str] = None
+ tools: Optional[List[ToolCallConversationResult]] = []
+
+
+class WorkloadHealthChatRequest(ChatRequestBaseModel):
+ ask: str
+ workload_health_result: WorkloadHealthInvestigationResult
+ resource: dict
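+
+
+# (illustrative) example WorkloadHealthChatRequest payload:
+# {
+#   "ask": "Why is my deployment unhealthy?",
+#   "workload_health_result": {"analysis": "...", "tools": []},
+#   "resource": {"kind": "Deployment", "name": "my-app", "namespace": "default"},
+#   "conversation_history": [{"role": "system", "content": "..."}]
+# }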
diff --git a/holmes/core/performance_timing.py b/holmes/core/performance_timing.py
index 115c5e6..3c5b07b 100644
--- a/holmes/core/performance_timing.py
+++ b/holmes/core/performance_timing.py
@@ -7,6 +7,7 @@
LOG_PERFORMANCE,
)
+
class PerformanceTiming:
def __init__(self, name):
if not LOG_PERFORMANCE:
@@ -40,10 +41,13 @@ def end(self):
self.ended = True
current_time = time.time()
time_since_start = int((current_time - self.start_time) * 1000)
- message = f'{self.name}(TOTAL) {time_since_start}ms'
+ message = f"{self.name}(TOTAL) {time_since_start}ms"
logging.info(message)
for label, time_since_last, time_since_start in self.timings:
- logging.info(f'\t{self.name}({label}) +{time_since_last}ms {time_since_start}ms')
+ logging.info(
+ f"\t{self.name}({label}) +{time_since_last}ms {time_since_start}ms"
+ )
+
def log_function_timing(func):
@wraps(func)
@@ -54,4 +58,5 @@ def function_timing_wrapper(*args, **kwargs):
total_time = int((end_time - start_time) * 1000)
logging.info(f'Function "{func.__name__}()" took {total_time}ms')
return result
+
return function_timing_wrapper
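+
+
+# (illustrative) usage of the timing decorator:
+# @log_function_timing
+# def handle_request():
+#     ...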
diff --git a/holmes/core/runbooks.py b/holmes/core/runbooks.py
index ea277e7..0f2b60d 100644
--- a/holmes/core/runbooks.py
+++ b/holmes/core/runbooks.py
@@ -13,10 +13,14 @@ def get_instructions_for_issue(self, issue: Issue) -> List[str]:
for runbook in self.runbooks:
if runbook.match.issue_id and not runbook.match.issue_id.match(issue.id):
continue
- if runbook.match.issue_name and not runbook.match.issue_name.match(issue.name):
+ if runbook.match.issue_name and not runbook.match.issue_name.match(
+ issue.name
+ ):
continue
- if runbook.match.source and not runbook.match.source.match(issue.source_type):
+ if runbook.match.source and not runbook.match.source.match(
+ issue.source_type
+ ):
continue
instructions.append(runbook.instructions)
-
+
return instructions
diff --git a/holmes/core/supabase_dal.py b/holmes/core/supabase_dal.py
index e62a0e2..323421e 100644
--- a/holmes/core/supabase_dal.py
+++ b/holmes/core/supabase_dal.py
@@ -52,13 +52,16 @@ class RobustaToken(BaseModel):
class SupabaseDal:
-
def __init__(self):
self.enabled = self.__init_config()
if not self.enabled:
- logging.info("Not connecting to Robusta platform - robusta token not provided - using ROBUSTA_AI will not be possible")
+ logging.info(
+ "Not connecting to Robusta platform - robusta token not provided - using ROBUSTA_AI will not be possible"
+ )
return
- logging.info(f"Initializing Robusta platform connection for account {self.account_id}")
+ logging.info(
+ f"Initializing Robusta platform connection for account {self.account_id}"
+ )
options = ClientOptions(postgrest_client_timeout=SUPABASE_TIMEOUT_SECONDS)
self.client = create_client(self.url, self.api_key, options)
self.user_id = self.sign_in()
@@ -69,6 +72,7 @@ def __init__(self):
def patch_postgrest_execute(self):
logging.info("Patching postgres execute")
+
# This is somewhat hacky.
def execute_with_retry(_self):
try:
@@ -77,7 +81,9 @@ def execute_with_retry(_self):
message = exc.message or ""
if exc.code == "PGRST301" or "expired" in message.lower():
# JWT expired. Sign in again and retry the query
- logging.error("JWT token expired/invalid, signing in to Supabase again")
+ logging.error(
+ "JWT token expired/invalid, signing in to Supabase again"
+ )
self.sign_in()
# update the session to the new one, after re-sign in
_self.session = self.client.postgrest.session
@@ -99,11 +105,11 @@ def __load_robusta_config() -> Optional[RobustaToken]:
return RobustaToken(**json.loads(decoded))
except binascii.Error:
raise Exception(
- f"binascii.Error encountered. The Robusta UI token is not a valid base64."
+ "binascii.Error encountered. The Robusta UI token is not a valid base64."
)
except json.JSONDecodeError:
raise Exception(
- f"json.JSONDecodeError encountered. The Robusta UI token could not be parsed as JSON after being base64 decoded."
+ "json.JSONDecodeError encountered. The Robusta UI token could not be parsed as JSON after being base64 decoded."
)
if not os.path.exists(config_file_path):
@@ -119,28 +125,28 @@ def __load_robusta_config() -> Optional[RobustaToken]:
token = conf["robusta_sink"].get("token")
if not token:
raise Exception(
- f"No robusta token provided to Holmes. "
- f"Please set a valid Robusta UI token. "
- f"See https://docs.robusta.dev/master/configuration/ai-analysis.html#choosing-and-configuring-an-ai-provider for instructions."
+ "No robusta token provided to Holmes. "
+ "Please set a valid Robusta UI token. "
+ "See https://docs.robusta.dev/master/configuration/ai-analysis.html#choosing-and-configuring-an-ai-provider for instructions."
)
if "{{" in token:
raise ValueError(
- f"The robusta token configured for Holmes appears to be a templating placeholder (e.g. `{{ env.UI_SINK_TOKEN }}`). "
- f"Ensure your Helm chart or environment variables are set correctly. "
- f"If you store the token in a secret, you must also pass "
- f"the environment variable ROBUSTA_UI_TOKEN to Holmes. "
- f"See https://docs.robusta.dev/master/configuration/ai-analysis.html#configuring-holmesgpt-access-to-saas-data for instructions."
+                "The robusta token configured for Holmes appears to be a templating placeholder (e.g. `{{ env.UI_SINK_TOKEN }}`). "
+ "Ensure your Helm chart or environment variables are set correctly. "
+ "If you store the token in a secret, you must also pass "
+ "the environment variable ROBUSTA_UI_TOKEN to Holmes. "
+ "See https://docs.robusta.dev/master/configuration/ai-analysis.html#configuring-holmesgpt-access-to-saas-data for instructions."
)
try:
decoded = base64.b64decode(token)
return RobustaToken(**json.loads(decoded))
except binascii.Error:
raise Exception(
- f"binascii.Error encountered. The robusta token provided to Holmes is not a valid base64."
+ "binascii.Error encountered. The robusta token provided to Holmes is not a valid base64."
)
except json.JSONDecodeError:
raise Exception(
- f"json.JSONDecodeError encountered. The Robusta token provided to Holmes could not be parsed as JSON after being base64 decoded."
+ "json.JSONDecodeError encountered. The Robusta token provided to Holmes could not be parsed as JSON after being base64 decoded."
)
return None
@@ -166,8 +172,12 @@ def __init_config(self) -> bool:
def sign_in(self) -> str:
logging.info("Supabase DAL login")
- res = self.client.auth.sign_in_with_password({"email": self.email, "password": self.password})
- self.client.auth.set_session(res.session.access_token, res.session.refresh_token)
+ res = self.client.auth.sign_in_with_password(
+ {"email": self.email, "password": self.password}
+ )
+ self.client.auth.set_session(
+ res.session.access_token, res.session.refresh_token
+ )
self.client.postgrest.auth(res.session.access_token)
return res.user.id
@@ -181,11 +191,10 @@ def get_issue_data(self, issue_id: Optional[str]) -> Optional[Dict]:
issue_data = None
try:
issue_response = (
- self.client
- .table(ISSUES_TABLE)
- .select("*")
- .filter("id", "eq", issue_id)
- .execute()
+ self.client.table(ISSUES_TABLE)
+ .select("*")
+ .filter("id", "eq", issue_id)
+ .execute()
)
if len(issue_response.data):
issue_data = issue_response.data[0]
@@ -196,25 +205,29 @@ def get_issue_data(self, issue_id: Optional[str]) -> Optional[Dict]:
if not issue_data:
return None
evidence = (
- self.client
- .table(EVIDENCE_TABLE)
+ self.client.table(EVIDENCE_TABLE)
.select("*")
.filter("issue_id", "eq", issue_id)
.execute()
)
enrichment_blacklist = {"text_file", "graph", "ai_analysis", "holmes"}
- data = [enrich for enrich in evidence.data if enrich.get("enrichment_type") not in enrichment_blacklist]
+ data = [
+ enrich
+ for enrich in evidence.data
+ if enrich.get("enrichment_type") not in enrichment_blacklist
+ ]
issue_data["evidence"] = data
return issue_data
- def get_resource_instructions(self, type: str, name: Optional[str]) -> Optional[ResourceInstructions]:
+ def get_resource_instructions(
+ self, type: str, name: Optional[str]
+ ) -> Optional[ResourceInstructions]:
if not self.enabled or not name:
return None
res = (
- self.client
- .table(RUNBOOKS_TABLE)
+ self.client.table(RUNBOOKS_TABLE)
.select("runbook")
.eq("account_id", self.account_id)
.eq("subject_type", type)
@@ -232,7 +245,9 @@ def get_resource_instructions(self, type: str, name: Optional[str]) -> Optional[
if url:
documents.append(ResourceInstructionDocument(url=url))
else:
- logging.warning(f"Unsupported runbook for subject_type={type} / subject_name={name}: {document_data}")
+ logging.warning(
+ f"Unsupported runbook for subject_type={type} / subject_name={name}: {document_data}"
+ )
return ResourceInstructions(instructions=instructions, documents=documents)
@@ -241,20 +256,19 @@ def get_resource_instructions(self, type: str, name: Optional[str]) -> Optional[
def get_global_instructions_for_account(self) -> Optional[Instructions]:
try:
res = (
- self.client
- .table(RUNBOOKS_TABLE)
- .select("runbook")
- .eq("account_id", self.account_id)
- .eq("subject_type", "Account")
- .execute()
- )
+ self.client.table(RUNBOOKS_TABLE)
+ .select("runbook")
+ .eq("account_id", self.account_id)
+ .eq("subject_type", "Account")
+ .execute()
+ )
if res.data:
instructions = res.data[0].get("runbook").get("instructions")
return Instructions(instructions=instructions)
except Exception:
logging.exception("Failed to fetch global instructions", exc_info=True)
-
+
return None
def create_session_token(self) -> str:
@@ -265,14 +279,17 @@ def create_session_token(self) -> str:
"user_id": self.user_id,
"token": token,
"type": "HOLMES",
- }, returning=ReturnMethod.minimal # must use this, because the user cannot read this table
+ },
+ returning=ReturnMethod.minimal, # must use this, because the user cannot read this table
).execute()
return token
def get_ai_credentials(self) -> Tuple[str, str]:
if not self.enabled:
- raise Exception("You're trying to use ROBUSTA_AI, but Cannot get credentials for ROBUSTA_AI. Store not initialized.")
-
+ raise Exception(
+ "You're trying to use ROBUSTA_AI, but Cannot get credentials for ROBUSTA_AI. Store not initialized."
+ )
+
with self.lock:
session_token = self.token_cache.get("session_token")
if not session_token:
@@ -296,8 +313,7 @@ def get_workload_issues(self, resource: dict, since_hours: float) -> List[str]:
logging.debug(f"getting issues for workload {svc_key}")
try:
res = (
- self.client
- .table(ISSUES_TABLE)
+ self.client.table(ISSUES_TABLE)
.select("id, creation_date, aggregation_key")
.eq("account_id", self.account_id)
.eq("cluster", cluster)
@@ -317,73 +333,81 @@ def get_workload_issues(self, resource: dict, since_hours: float) -> List[str]:
unique_issues: list[str] = list(issue_dict.values())
res = (
- self.client
- .table(EVIDENCE_TABLE)
+ self.client.table(EVIDENCE_TABLE)
.select("data, enrichment_type")
.in_("issue_id", unique_issues)
.execute()
)
enrichment_blacklist = {"text_file", "graph", "ai_analysis", "holmes"}
- data = [evidence.get("data") for evidence in res.data if evidence.get("enrichment_type") not in enrichment_blacklist]
+ data = [
+ evidence.get("data")
+ for evidence in res.data
+ if evidence.get("enrichment_type") not in enrichment_blacklist
+ ]
return data
- except:
+ except Exception:
logging.exception("failed to fetch workload issues data", exc_info=True)
return []
def upsert_holmes_status(self, holmes_status_data: dict) -> None:
if not self.enabled:
- logging.info("Robusta store not initialized. Skipping upserting holmes status.")
+ logging.info(
+ "Robusta store not initialized. Skipping upserting holmes status."
+ )
return
-
+
updated_at = datetime.now().isoformat()
try:
- res = (
- self.client
- .table(HOLMES_STATUS_TABLE)
- .upsert({
- "account_id": self.account_id,
- "updated_at": updated_at,
- **holmes_status_data,
- },
- on_conflict='account_id, cluster_id')
+ (
+ self.client.table(HOLMES_STATUS_TABLE)
+ .upsert(
+ {
+ "account_id": self.account_id,
+ "updated_at": updated_at,
+ **holmes_status_data,
+ },
+ on_conflict="account_id, cluster_id",
+ )
.execute()
)
except Exception as error:
- logging.error(f"Error happened during upserting holmes status: {error}",
- exc_info=True)
+ logging.error(
+ f"Error happened during upserting holmes status: {error}", exc_info=True
+ )
return None
-
+
def sync_toolsets(self, toolsets: list[dict], cluster_name: str) -> None:
if not toolsets:
logging.warning("No toolsets were provided for synchronization.")
return
-
+
if not self.enabled:
- logging.info("Robusta store not initialized. Skipping sync holmes toolsets.")
+ logging.info(
+ "Robusta store not initialized. Skipping sync holmes toolsets."
+ )
return
-
- provided_toolset_names = [toolset['toolset_name'] for toolset in toolsets]
-
+
+ provided_toolset_names = [toolset["toolset_name"] for toolset in toolsets]
+
try:
self.client.table(HOLMES_TOOLSET).upsert(
- toolsets,
- on_conflict='account_id, cluster_id, toolset_name'
+ toolsets, on_conflict="account_id, cluster_id, toolset_name"
).execute()
logging.info("Toolsets upserted successfully.")
-
- self.client.table(HOLMES_TOOLSET).delete().eq("account_id",
- self.account_id).eq(
- 'cluster_id', cluster_name).not_.in_(
- 'toolset_name', provided_toolset_names
+ self.client.table(HOLMES_TOOLSET).delete().eq(
+ "account_id", self.account_id
+ ).eq("cluster_id", cluster_name).not_.in_(
+ "toolset_name", provided_toolset_names
).execute()
logging.info("Toolsets synchronized successfully.")
except Exception as e:
- logging.exception(f"An error occurred during toolset synchronization: {e}",
- exc_info=True)
+ logging.exception(
+ f"An error occurred during toolset synchronization: {e}", exc_info=True
+ )
diff --git a/holmes/core/tool_calling_llm.py b/holmes/core/tool_calling_llm.py
index b0c1a44..75353ae 100644
--- a/holmes/core/tool_calling_llm.py
+++ b/holmes/core/tool_calling_llm.py
@@ -3,7 +3,11 @@
import logging
import textwrap
from typing import List, Optional, Dict, Type, Union
-from holmes.core.investigation_structured_output import DEFAULT_SECTIONS, get_output_format_for_investigation, combine_sections
+from holmes.core.investigation_structured_output import (
+ DEFAULT_SECTIONS,
+ InputSectionsDataType,
+ get_output_format_for_investigation,
+)
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.plugins.prompts import load_and_render_prompt
@@ -30,11 +34,9 @@ class ToolCallResult(BaseModel):
class LLMResult(BaseModel):
tool_calls: Optional[List[ToolCallResult]] = None
- sections: Optional[Dict[str, Union[str, None]]] = None
result: Optional[str] = None
unprocessed_result: Optional[str] = None
instructions: List[str] = []
-
# TODO: clean up these two
prompt: Optional[str] = None
messages: Optional[List[dict]] = None
@@ -57,12 +59,13 @@ class ResourceInstructionDocument(BaseModel):
class Instructions(BaseModel):
instructions: List[str] = []
+
class ResourceInstructions(BaseModel):
instructions: List[str] = []
documents: List[ResourceInstructionDocument] = []
-class ToolCallingLLM:
+class ToolCallingLLM:
llm: LLM
def __init__(self, tool_executor: ToolExecutor, max_steps: int, llm: LLM):
@@ -91,7 +94,6 @@ def messages_call(
post_process_prompt: Optional[str] = None,
response_format: Optional[Union[dict, Type[BaseModel]]] = None,
) -> LLMResult:
-
return self.call(messages, post_process_prompt, response_format)
def call(
@@ -124,7 +126,6 @@ def call(
)
perf_timing.measure("truncate_messages_to_fit_context")
-
logging.debug(f"sending messages={messages}\n\ntools={tools}")
try:
full_response = self.llm.completion(
@@ -140,9 +141,8 @@ def call(
perf_timing.measure("llm.completion")
# catch a known error that occurs with Azure and replace the error message with something more obvious to the user
except BadRequestError as e:
- if (
- "Unrecognized request arguments supplied: tool_choice, tools"
- in str(e)
+ if "Unrecognized request arguments supplied: tool_choice, tools" in str(
+ e
):
raise Exception(
"The Azure model you chose is not supported. Model version 1106 and higher required."
@@ -159,22 +159,12 @@ def call(
tools_to_call = getattr(response_message, "tool_calls", None)
text_response = response_message.content
- sections:Optional[Dict[str, str]] = None
- if isinstance(text_response, str):
- try:
- parsed_json = json.loads(text_response)
- text_response = parsed_json
- except json.JSONDecodeError:
- pass
- if not isinstance(text_response, str):
- sections = text_response
- text_response = combine_sections(sections)
if not tools_to_call:
# For chatty models post process and summarize the result
# this only works for calls where user prompt is explicitly passed through
if post_process_prompt and user_prompt:
- logging.info(f"Running post processing on investigation.")
+ logging.info("Running post processing on investigation.")
raw_response = text_response
post_processed_response = self._post_processing_call(
prompt=user_prompt,
@@ -185,7 +175,6 @@ def call(
perf_timing.end()
return LLMResult(
result=post_processed_response,
- sections=sections,
unprocessed_result=raw_response,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
@@ -195,7 +184,6 @@ def call(
perf_timing.end()
return LLMResult(
result=text_response,
- sections=sections,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
messages=messages,
@@ -231,7 +219,6 @@ def _invoke_tool(
logging.warning(
f"Failed to parse arguments for tool: {tool_name}. args: {tool_to_call.function.arguments}"
)
-
tool_call_id = tool_to_call.id
tool = self.tool_executor.get_tool_by_name(tool_name)
@@ -358,7 +345,7 @@ def investigate(
console: Optional[Console] = None,
global_instructions: Optional[Instructions] = None,
post_processing_prompt: Optional[str] = None,
- sections: Optional[Dict[str, str]] = None
+ sections: Optional[InputSectionsDataType] = None,
) -> LLMResult:
runbooks = self.runbook_manager.get_instructions_for_issue(issue)
@@ -376,9 +363,11 @@ def investigate(
console.print(
"[bold]No runbooks found for this issue. Using default behaviour. (Add runbooks to guide the investigation.)[/bold]"
)
- system_prompt = load_and_render_prompt(prompt, {"issue": issue, "sections": sections})
+ system_prompt = load_and_render_prompt(
+ prompt, {"issue": issue, "sections": sections}
+ )
- if instructions != None and len(instructions.documents) > 0:
+ if instructions is not None and len(instructions.documents) > 0:
docPrompts = []
for document in instructions.documents:
docPrompts.append(
@@ -393,7 +382,11 @@ def investigate(
user_prompt = f'My instructions to check \n"""{user_prompt}"""'
- if global_instructions and global_instructions.instructions and len(global_instructions.instructions[0]) > 0:
+ if (
+ global_instructions
+ and global_instructions.instructions
+ and len(global_instructions.instructions[0]) > 0
+ ):
user_prompt += f"\n\nGlobal Instructions (use only if relevant): {global_instructions.instructions[0]}\n"
user_prompt = f"{user_prompt}\n This is context from the issue {issue.raw}"
@@ -403,6 +396,11 @@ def investigate(
)
logging.debug("Rendered user prompt:\n%s", textwrap.indent(user_prompt, " "))
- res = self.prompt_call(system_prompt, user_prompt, post_processing_prompt, response_format=get_output_format_for_investigation(sections))
+ res = self.prompt_call(
+ system_prompt,
+ user_prompt,
+ post_processing_prompt,
+ response_format=get_output_format_for_investigation(sections),
+ )
res.instructions = runbooks
return res
diff --git a/holmes/core/tools.py b/holmes/core/tools.py
index f0515d6..636d397 100644
--- a/holmes/core/tools.py
+++ b/holmes/core/tools.py
@@ -326,7 +326,7 @@ def check_prerequisites(self):
and prereq.expected_output not in result.stdout
):
self._status = ToolsetStatusEnum.FAILED
- self._error = f"Prerequisites check gave wrong output"
+ self._error = "Prerequisites check gave wrong output"
return
except subprocess.CalledProcessError as e:
self._status = ToolsetStatusEnum.FAILED
@@ -356,6 +356,9 @@ def check_prerequisites(self):
self._status = ToolsetStatusEnum.ENABLED
+ def get_example_config(self) -> Dict[str, Any]:
+ return {}
+
class YAMLToolset(Toolset):
tools: List[YAMLTool]
diff --git a/holmes/main.py b/holmes/main.py
index 6f9637c..0e524c0 100644
--- a/holmes/main.py
+++ b/holmes/main.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E402
import os
from holmes.utils.cert_utils import add_custom_certificate
@@ -55,6 +56,7 @@ class Verbosity(Enum):
VERBOSE = 2
VERY_VERBOSE = 3
+
def cli_flags_to_verbosity(verbose_flags: List[bool]) -> Verbosity:
if verbose_flags is None or len(verbose_flags) == 0:
return Verbosity.NORMAL
@@ -65,6 +67,7 @@ def cli_flags_to_verbosity(verbose_flags: List[bool]) -> Verbosity:
else:
return Verbosity.VERY_VERBOSE
+
def suppress_noisy_logs():
# disable INFO logs from OpenAI
logging.getLogger("httpx").setLevel(logging.WARNING)
@@ -80,33 +83,44 @@ def suppress_noisy_logs():
# suppress UserWarnings from the slack_sdk module
warnings.filterwarnings("ignore", category=UserWarning, module="slack_sdk.*")
+
def init_logging(verbose_flags: List[bool] = None):
verbosity = cli_flags_to_verbosity(verbose_flags)
if verbosity == Verbosity.VERY_VERBOSE:
- logging.basicConfig(level=logging.DEBUG, format="%(message)s", handlers=[RichHandler(show_level=False, show_time=False)])
+ logging.basicConfig(
+ level=logging.DEBUG,
+ format="%(message)s",
+ handlers=[RichHandler(show_level=False, show_time=False)],
+ )
elif verbosity == Verbosity.VERBOSE:
- logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=[RichHandler(show_level=False, show_time=False)])
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(message)s",
+ handlers=[RichHandler(show_level=False, show_time=False)],
+ )
logging.getLogger().setLevel(logging.DEBUG)
suppress_noisy_logs()
else:
- logging.basicConfig(level=logging.INFO, format="%(message)s", handlers=[RichHandler(show_level=False, show_time=False)])
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(message)s",
+ handlers=[RichHandler(show_level=False, show_time=False)],
+ )
suppress_noisy_logs()
logging.debug(f"verbosity is {verbosity}")
return Console()
+
# Common cli options
# The defaults for options that are also in the config file MUST be None or else the cli defaults will override settings in the config file
opt_api_key: Optional[str] = typer.Option(
None,
help="API key to use for the LLM (if not given, uses environment variables OPENAI_API_KEY or AZURE_API_KEY)",
)
-opt_model: Optional[str] = typer.Option(
- None,
- help="Model to use for the LLM"
-)
+opt_model: Optional[str] = typer.Option(None, help="Model to use for the LLM")
opt_config_file: Optional[Path] = typer.Option(
None,
"--config",
@@ -182,7 +196,8 @@ def init_logging(verbose_flags: List[bool] = None):
# Common help texts
system_prompt_help = "Advanced. System prompt for LLM. Values starting with builtin:// are loaded from holmes/plugins/prompts, values starting with file:// are loaded from the given path, other values are interpreted as a prompt string"
-def parse_documents(documents:Optional[str]) -> List[ResourceInstructionDocument]:
+
+def parse_documents(documents: Optional[str]) -> List[ResourceInstructionDocument]:
resource_documents = []
if documents is not None:
@@ -193,6 +208,7 @@ def parse_documents(documents:Optional[str]) -> List[ResourceInstructionDocument
return resource_documents
+
def handle_result(
result: LLMResult,
console: Console,
@@ -205,11 +221,14 @@ def handle_result(
if destination == DestinationType.CLI:
if show_tool_output and result.tool_calls:
for tool_call in result.tool_calls:
- console.print(f"[bold magenta]Used Tool:[/bold magenta]", end="")
+ console.print("[bold magenta]Used Tool:[/bold magenta]", end="")
# we need to print this separately with markup=False because it contains arbitrary text and we don't want console.print to interpret it
- console.print(f"{tool_call.description}. Output=\n{tool_call.result}", markup=False)
+ console.print(
+ f"{tool_call.description}. Output=\n{tool_call.result}",
+ markup=False,
+ )
- console.print(f"[bold green]AI:[/bold green]", end=" ")
+ console.print("[bold green]AI:[/bold green]", end=" ")
console.print(Markdown(result.result))
if add_separator:
console.print(Rule())
@@ -236,7 +255,6 @@ def ask(
destination: Optional[DestinationType] = opt_destination,
slack_token: Optional[str] = opt_slack_token,
slack_channel: Optional[str] = opt_slack_channel,
-
# advanced options for this command
system_prompt: Optional[str] = typer.Option(
"builtin://generic_ask.jinja2", help=system_prompt_help
@@ -254,7 +272,7 @@ def ask(
),
json_output_file: Optional[str] = opt_json_output_file,
echo_request: bool = opt_echo_request,
- post_processing_prompt: Optional[str] = opt_post_processing_prompt
+ post_processing_prompt: Optional[str] = opt_post_processing_prompt,
):
"""
Ask any question and answer using available tools
@@ -270,7 +288,9 @@ def ask(
slack_channel=slack_channel,
)
system_prompt = load_and_render_prompt(system_prompt)
- ai = config.create_console_toolcalling_llm(allowed_toolsets=allowed_toolsets, dal=None)
+ ai = config.create_console_toolcalling_llm(
+ allowed_toolsets=allowed_toolsets, dal=None
+ )
if echo_request:
console.print("[bold yellow]User:[/bold yellow] " + prompt)
for path in include_file:
@@ -290,7 +310,9 @@ def ask(
raw={"prompt": prompt},
source_instance_id=socket.gethostname(),
)
- handle_result(response, console, destination, config, issue, show_tool_output, False)
+ handle_result(
+ response, console, destination, config, issue, show_tool_output, False
+ )
@investigate_app.command()
@@ -302,7 +324,7 @@ def alertmanager(
),
alertmanager_label: Optional[List[str]] = typer.Option(
[],
- help="For filtering alerts with a specific label. Must be of format key=value. If --alertmanager-label is passed multiple times, alerts must match ALL labels"
+ help="For filtering alerts with a specific label. Must be of format key=value. If --alertmanager-label is passed multiple times, alerts must match ALL labels",
),
alertmanager_username: Optional[str] = typer.Option(
None, help="Username to use for basic auth"
@@ -314,9 +336,7 @@ def alertmanager(
None, help="Load alertmanager alerts from a file (used by the test framework)"
),
alertmanager_limit: Optional[int] = typer.Option(
- None,
- "-n",
- help="Limit the number of alerts to process"
+ None, "-n", help="Limit the number of alerts to process"
),
# common options
api_key: Optional[str] = opt_api_key,
@@ -335,7 +355,7 @@ def alertmanager(
system_prompt: Optional[str] = typer.Option(
"builtin://generic_investigation.jinja2", help=system_prompt_help
),
- post_processing_prompt: Optional[str] = opt_post_processing_prompt
+ post_processing_prompt: Optional[str] = opt_post_processing_prompt,
):
"""
Investigate a Prometheus/Alertmanager alert
@@ -355,21 +375,25 @@ def alertmanager(
slack_token=slack_token,
slack_channel=slack_channel,
custom_toolsets=custom_toolsets,
- custom_runbooks=custom_runbooks
+ custom_runbooks=custom_runbooks,
)
- ai = config.create_console_issue_investigator(console, allowed_toolsets=allowed_toolsets)
+ ai = config.create_console_issue_investigator(
+ console, allowed_toolsets=allowed_toolsets
+ )
source = config.create_alertmanager_source()
try:
issues = source.fetch_issues()
except Exception as e:
- logging.error(f"Failed to fetch issues from alertmanager", exc_info=e)
+ logging.error("Failed to fetch issues from alertmanager", exc_info=e)
return
if alertmanager_limit is not None:
- console.print(f"[bold yellow]Limiting to {alertmanager_limit}/{len(issues)} issues.[/bold yellow]")
+ console.print(
+ f"[bold yellow]Limiting to {alertmanager_limit}/{len(issues)} issues.[/bold yellow]"
+ )
issues = issues[:alertmanager_limit]
if alertmanager_alertname is not None:
@@ -390,11 +414,11 @@ def alertmanager(
prompt=system_prompt,
console=console,
instructions=None,
- post_processing_prompt=post_processing_prompt)
+ post_processing_prompt=post_processing_prompt,
+ )
results.append({"issue": issue.model_dump(), "result": result.model_dump()})
handle_result(result, console, destination, config, issue, False, True)
-
if json_output_file:
write_json_file(json_output_file, results)
@@ -409,7 +433,8 @@ def generate_alertmanager_tests(
None, help="Password to use for basic auth"
),
output: Optional[Path] = typer.Option(
- None, help="Path to dump alertmanager alerts as json (if not given, output curl commands instead)"
+ None,
+ help="Path to dump alertmanager alerts as json (if not given, output curl commands instead)",
),
config_file: Optional[str] = opt_config_file,
verbose: Optional[List[bool]] = opt_verbose,
@@ -437,12 +462,12 @@ def jira(
jira_url: Optional[str] = typer.Option(
None,
help="Jira url - e.g. https://your-company.atlassian.net",
- envvar="JIRA_URL"
+ envvar="JIRA_URL",
),
jira_username: Optional[str] = typer.Option(
None,
help="The email address with which you log into Jira",
- envvar="JIRA_USERNAME"
+ envvar="JIRA_USERNAME",
),
jira_api_key: str = typer.Option(
None,
@@ -452,9 +477,7 @@ def jira(
None,
help="Investigate tickets matching a JQL query (e.g. 'project=DEFAULT_PROJECT')",
),
- update: Optional[bool] = typer.Option(
- False, help="Update Jira with AI results"
- ),
+ update: Optional[bool] = typer.Option(False, help="Update Jira with AI results"),
# common options
api_key: Optional[str] = opt_api_key,
model: Optional[str] = opt_model,
@@ -469,7 +492,7 @@ def jira(
system_prompt: Optional[str] = typer.Option(
"builtin://generic_investigation.jinja2", help=system_prompt_help
),
- post_processing_prompt: Optional[str] = opt_post_processing_prompt
+ post_processing_prompt: Optional[str] = opt_post_processing_prompt,
):
"""
Investigate a Jira ticket
@@ -485,14 +508,16 @@ def jira(
jira_api_key=jira_api_key,
jira_query=jira_query,
custom_toolsets=custom_toolsets,
- custom_runbooks=custom_runbooks
+ custom_runbooks=custom_runbooks,
+ )
+ ai = config.create_console_issue_investigator(
+ console, allowed_toolsets=allowed_toolsets
)
- ai = config.create_console_issue_investigator(console, allowed_toolsets=allowed_toolsets)
source = config.create_jira_source()
try:
issues = source.fetch_issues()
except Exception as e:
- logging.error(f"Failed to fetch issues from Jira", exc_info=e)
+ logging.error("Failed to fetch issues from Jira", exc_info=e)
return
console.print(
@@ -509,7 +534,8 @@ def jira(
prompt=system_prompt,
console=console,
instructions=None,
- post_processing_prompt=post_processing_prompt)
+ post_processing_prompt=post_processing_prompt,
+ )
console.print(Rule())
console.print(f"[bold green]AI analysis of {issue.url}[/bold green]")
@@ -532,10 +558,12 @@ def jira(
@investigate_app.command()
def github(
github_url: str = typer.Option(
- "https://api.github.com", help="The GitHub api base url (e.g: https://api.github.com)"
+ "https://api.github.com",
+ help="The GitHub api base url (e.g: https://api.github.com)",
),
github_owner: Optional[str] = typer.Option(
- None, help="The GitHub repository Owner, eg: if the repository url is https://github.com/robusta-dev/holmesgpt, the owner is robusta-dev"
+ None,
+ help="The GitHub repository Owner, eg: if the repository url is https://github.com/robusta-dev/holmesgpt, the owner is robusta-dev",
),
github_pat: str = typer.Option(
None,
@@ -544,9 +572,7 @@ def github(
None,
help="The GitHub repository name, eg: if the repository url is https://github.com/robusta-dev/holmesgpt, the repository name is holmesgpt",
),
- update: Optional[bool] = typer.Option(
- False, help="Update GitHub with AI results"
- ),
+ update: Optional[bool] = typer.Option(False, help="Update GitHub with AI results"),
github_query: Optional[str] = typer.Option(
"is:issue is:open",
help="Investigate tickets matching a GitHub query (e.g. 'is:issue is:open')",
@@ -564,7 +590,7 @@ def github(
system_prompt: Optional[str] = typer.Option(
"builtin://generic_investigation.jinja2", help=system_prompt_help
),
- post_processing_prompt: Optional[str] = opt_post_processing_prompt
+ post_processing_prompt: Optional[str] = opt_post_processing_prompt,
):
"""
Investigate a GitHub issue
@@ -581,33 +607,37 @@ def github(
github_repository=github_repository,
github_query=github_query,
custom_toolsets=custom_toolsets,
- custom_runbooks=custom_runbooks
+ custom_runbooks=custom_runbooks,
+ )
+    ai = config.create_console_issue_investigator(
+ console, allowed_toolsets
)
- ai = config.create_issue_invcreate_console_issue_investigatorestigator(console, allowed_toolsets)
source = config.create_github_source()
try:
issues = source.fetch_issues()
except Exception as e:
- logging.error(f"Failed to fetch issues from GitHub", exc_info=e)
+ logging.error("Failed to fetch issues from GitHub", exc_info=e)
return
console.print(
f"[bold yellow]Analyzing {len(issues)} GitHub Issues.[/bold yellow] [red]Press Ctrl+C to stop.[/red]"
)
for i, issue in enumerate(issues):
- console.print(f"[bold yellow]Analyzing GitHub issue {i+1}/{len(issues)}: {issue.name}...[/bold yellow]")
+ console.print(
+ f"[bold yellow]Analyzing GitHub issue {i+1}/{len(issues)}: {issue.name}...[/bold yellow]"
+ )
result = ai.investigate(
issue=issue,
prompt=system_prompt,
console=console,
instructions=None,
- post_processing_prompt=post_processing_prompt)
+ post_processing_prompt=post_processing_prompt,
+ )
console.print(Rule())
console.print(f"[bold green]AI analysis of {issue.url}[/bold green]")
- console.print(Markdown(result.result.replace(
- "\n", "\n\n")), style="bold green")
+ console.print(Markdown(result.result.replace("\n", "\n\n")), style="bold green")
console.print(Rule())
if update:
source.write_back_result(issue.id, result)
@@ -617,16 +647,20 @@ def github(
f"[bold]Not updating issue {issue.url}. Use the --update option to do so.[/bold]"
)
+
@investigate_app.command()
def pagerduty(
pagerduty_api_key: str = typer.Option(
- None, help="The PagerDuty API key. This can be found in the PagerDuty UI under Integrations > API Access Keys."
+ None,
+ help="The PagerDuty API key. This can be found in the PagerDuty UI under Integrations > API Access Keys.",
),
pagerduty_user_email: Optional[str] = typer.Option(
- None, help="When --update is set, which user will be listed as the user who updated the ticket. (Must be the email of a valid user in your PagerDuty account.)"
+ None,
+ help="When --update is set, which user will be listed as the user who updated the ticket. (Must be the email of a valid user in your PagerDuty account.)",
),
pagerduty_incident_key: Optional[str] = typer.Option(
- None, help="If provided, only analyze a single PagerDuty incident matching this key"
+ None,
+ help="If provided, only analyze a single PagerDuty incident matching this key",
),
update: Optional[bool] = typer.Option(
False, help="Update PagerDuty with AI results"
@@ -645,7 +679,7 @@ def pagerduty(
system_prompt: Optional[str] = typer.Option(
"builtin://generic_investigation.jinja2", help=system_prompt_help
),
- post_processing_prompt: Optional[str] = opt_post_processing_prompt
+ post_processing_prompt: Optional[str] = opt_post_processing_prompt,
):
"""
Investigate a PagerDuty incident
@@ -660,14 +694,14 @@ def pagerduty(
pagerduty_user_email=pagerduty_user_email,
pagerduty_incident_key=pagerduty_incident_key,
custom_toolsets=custom_toolsets,
- custom_runbooks=custom_runbooks
+ custom_runbooks=custom_runbooks,
)
ai = config.create_console_issue_investigator(console, allowed_toolsets)
source = config.create_pagerduty_source()
try:
issues = source.fetch_issues()
except Exception as e:
- logging.error(f"Failed to fetch issues from PagerDuty", exc_info=e)
+ logging.error("Failed to fetch issues from PagerDuty", exc_info=e)
return
console.print(
@@ -676,19 +710,21 @@ def pagerduty(
results = []
for i, issue in enumerate(issues):
- console.print(f"[bold yellow]Analyzing PagerDuty incident {i+1}/{len(issues)}: {issue.name}...[/bold yellow]")
+ console.print(
+ f"[bold yellow]Analyzing PagerDuty incident {i+1}/{len(issues)}: {issue.name}...[/bold yellow]"
+ )
result = ai.investigate(
issue=issue,
prompt=system_prompt,
console=console,
instructions=None,
- post_processing_prompt=post_processing_prompt)
+ post_processing_prompt=post_processing_prompt,
+ )
console.print(Rule())
console.print(f"[bold green]AI analysis of {issue.url}[/bold green]")
- console.print(Markdown(result.result.replace(
- "\n", "\n\n")), style="bold green")
+ console.print(Markdown(result.result.replace("\n", "\n\n")), style="bold green")
console.print(Rule())
if update:
source.write_back_result(issue.id, result)
@@ -702,16 +738,16 @@ def pagerduty(
if json_output_file:
write_json_file(json_output_file, results)
+
@investigate_app.command()
def opsgenie(
- opsgenie_api_key: str = typer.Option(
- None, help="The OpsGenie API key"
- ),
+ opsgenie_api_key: str = typer.Option(None, help="The OpsGenie API key"),
opsgenie_team_integration_key: str = typer.Option(
None, help=OPSGENIE_TEAM_INTEGRATION_KEY_HELP
),
opsgenie_query: Optional[str] = typer.Option(
- None, help="E.g. 'message: Foo' (see https://support.atlassian.com/opsgenie/docs/search-queries-for-alerts/)"
+ None,
+ help="E.g. 'message: Foo' (see https://support.atlassian.com/opsgenie/docs/search-queries-for-alerts/)",
),
update: Optional[bool] = typer.Option(
False, help="Update OpsGenie with AI results"
@@ -730,7 +766,7 @@ def opsgenie(
"builtin://generic_investigation.jinja2", help=system_prompt_help
),
post_processing_prompt: Optional[str] = opt_post_processing_prompt,
- documents: Optional[str] = opt_documents
+ documents: Optional[str] = opt_documents,
):
"""
Investigate an OpsGenie alert
@@ -745,32 +781,34 @@ def opsgenie(
opsgenie_team_integration_key=opsgenie_team_integration_key,
opsgenie_query=opsgenie_query,
custom_toolsets=custom_toolsets,
- custom_runbooks=custom_runbooks
+ custom_runbooks=custom_runbooks,
)
ai = config.create_console_issue_investigator(console, allowed_toolsets)
source = config.create_opsgenie_source()
try:
issues = source.fetch_issues()
except Exception as e:
- logging.error(f"Failed to fetch issues from OpsGenie", exc_info=e)
+ logging.error("Failed to fetch issues from OpsGenie", exc_info=e)
return
console.print(
f"[bold yellow]Analyzing {len(issues)} OpsGenie alerts.[/bold yellow] [red]Press Ctrl+C to stop.[/red]"
)
for i, issue in enumerate(issues):
- console.print(f"[bold yellow]Analyzing OpsGenie alert {i+1}/{len(issues)}: {issue.name}...[/bold yellow]")
+ console.print(
+ f"[bold yellow]Analyzing OpsGenie alert {i+1}/{len(issues)}: {issue.name}...[/bold yellow]"
+ )
result = ai.investigate(
issue=issue,
prompt=system_prompt,
console=console,
instructions=None,
- post_processing_prompt=post_processing_prompt)
+ post_processing_prompt=post_processing_prompt,
+ )
console.print(Rule())
console.print(f"[bold green]AI analysis of {issue.url}[/bold green]")
- console.print(Markdown(result.result.replace(
- "\n", "\n\n")), style="bold green")
+ console.print(Markdown(result.result.replace("\n", "\n\n")), style="bold green")
console.print(Rule())
if update:
source.write_back_result(issue.id, result)
diff --git a/holmes/plugins/destinations/__init__.py b/holmes/plugins/destinations/__init__.py
index 0c97d44..41fa2a2 100644
--- a/holmes/plugins/destinations/__init__.py
+++ b/holmes/plugins/destinations/__init__.py
@@ -1,5 +1,6 @@
from strenum import StrEnum
+
class DestinationType(StrEnum):
SLACK = "slack"
CLI = "cli"
diff --git a/holmes/plugins/destinations/slack/__init__.py b/holmes/plugins/destinations/slack/__init__.py
index a12b115..1d8b71a 100644
--- a/holmes/plugins/destinations/slack/__init__.py
+++ b/holmes/plugins/destinations/slack/__init__.py
@@ -1 +1,2 @@
+# ruff: noqa: F401
from .plugin import SlackDestination
diff --git a/holmes/plugins/destinations/slack/plugin.py b/holmes/plugins/destinations/slack/plugin.py
index 357c2e4..2d40ba7 100644
--- a/holmes/plugins/destinations/slack/plugin.py
+++ b/holmes/plugins/destinations/slack/plugin.py
@@ -73,7 +73,7 @@ def send_issue(self, issue: Issue, result: LLMResult) -> None:
)
elif e.response.data["error"] == "invalid_auth":
logging.error(
- f"Unable to authenticate using the provided Slack token. Please verify the setting of --slack-token"
+ "Unable to authenticate using the provided Slack token. Please verify the setting of --slack-token"
)
else:
logging.error(f"Error sending message: {e}. message={text}")
@@ -108,7 +108,7 @@ def __send_prompt_for_debugging(self, parent_thread, result: LLMResult) -> None:
text = "*🐞 DEBUG: messages with OpenAI*"
file_response = self.client.files_upload_v2(
- content=result.prompt, title=f"ai-prompt"
+ content=result.prompt, title="ai-prompt"
)
permalink = file_response["file"]["permalink"]
text += f"\n`<{permalink}|ai-prompt>`"
diff --git a/holmes/plugins/interfaces.py b/holmes/plugins/interfaces.py
index 5f265b7..461ee88 100644
--- a/holmes/plugins/interfaces.py
+++ b/holmes/plugins/interfaces.py
@@ -1,9 +1,8 @@
-from typing import List, Iterable, Pattern
-from pydantic import BaseModel
-from openai import AzureOpenAI, OpenAI
+from typing import List, Iterable
from holmes.core.issue import Issue
from holmes.core.tool_calling_llm import LLMResult
+
# Sources must implement this
class SourcePlugin:
def fetch_issues(self) -> List[Issue]:
@@ -17,9 +16,9 @@ def stream_issues(self) -> Iterable[Issue]:
def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
raise NotImplementedError()
+
# Destinations must implement this
class DestinationPlugin:
-
def send_issue(self, issue: Issue, result: LLMResult):
raise NotImplementedError()
diff --git a/holmes/plugins/prompts/__init__.py b/holmes/plugins/prompts/__init__.py
index 5dde457..000d671 100644
--- a/holmes/plugins/prompts/__init__.py
+++ b/holmes/plugins/prompts/__init__.py
@@ -4,6 +4,7 @@
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
+
def load_prompt(prompt: str) -> str:
"""
prompt is either in the format 'builtin://' or 'file://' or a regular string
@@ -12,14 +13,15 @@ def load_prompt(prompt: str) -> str:
regular strings are returned as is (as literal strings)
"""
if prompt.startswith("builtin://"):
- path = os.path.join(THIS_DIR, prompt[len("builtin://"):])
+ path = os.path.join(THIS_DIR, prompt[len("builtin://") :])
elif prompt.startswith("file://"):
- path = prompt[len("file://"):]
+ path = prompt[len("file://") :]
else:
return prompt
-
+
return open(path).read()
+
def load_and_render_prompt(prompt: str, context: dict = None) -> str:
"""
prompt is in the format 'builtin://' or 'file://' or a regular string
diff --git a/holmes/plugins/prompts/_general_instructions.jinja2 b/holmes/plugins/prompts/_general_instructions.jinja2
index 8126075..996cbfe 100644
--- a/holmes/plugins/prompts/_general_instructions.jinja2
+++ b/holmes/plugins/prompts/_general_instructions.jinja2
@@ -8,7 +8,6 @@ In general:
* in this case, try to find substrings or search for the correct spellings
* always provide detailed information like exact resource names, versions, labels, etc
* even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer like exact names
-* when giving an answer don't say root cause but "possible root causes" and be clear to distinguish between what you know for certain and what is a possible explanation
* if a runbook url is present as well as tool that can fetch it, you MUST fetch the runbook before beginning your investigation.
* if you don't know, say that the analysis was inconclusive.
* if there are multiple possible causes list them in a numbered list.
diff --git a/holmes/plugins/prompts/generic_ask.jinja2 b/holmes/plugins/prompts/generic_ask.jinja2
index 2932690..d78ef32 100644
--- a/holmes/plugins/prompts/generic_ask.jinja2
+++ b/holmes/plugins/prompts/generic_ask.jinja2
@@ -27,4 +27,4 @@ Relevant logs:
2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body
```
-Validation error led to unhandled Java exception causing a crash.
\ No newline at end of file
+Validation error led to unhandled Java exception causing a crash.
diff --git a/holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 b/holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2
index f718279..8a67401 100644
--- a/holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2
+++ b/holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2
@@ -4,8 +4,8 @@ Do not say 'based on the tool output' or explicitly refer to tools at all.
If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
### Context Awareness:
-Be aware that this conversation is follow-up questions to a prior investigation conducted for the {{issue}}.
-However, not all questions may be directly related to that investigation.
+Be aware that this conversation consists of follow-up questions to a prior investigation conducted for the {{issue}}.
+However, not all questions may be directly related to that investigation.
Use results of the investigation and conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
#### Results of issue Investigation:
@@ -46,4 +46,4 @@ Relevant logs:
2021-01-01T00:00:00.000Z [ERROR] Missing required field 'email' in request body
```
-Validation error led to unhandled Java exception causing a crash.
\ No newline at end of file
+Validation error led to unhandled Java exception causing a crash.
diff --git a/holmes/plugins/prompts/generic_post_processing.jinja2 b/holmes/plugins/prompts/generic_post_processing.jinja2
index 3016581..3f2d07b 100644
--- a/holmes/plugins/prompts/generic_post_processing.jinja2
+++ b/holmes/plugins/prompts/generic_post_processing.jinja2
@@ -10,4 +10,4 @@ This is the original prompt:
{{ prompt }}
This is the investigation to summarize:
-{{ investigation }}
\ No newline at end of file
+{{ investigation }}
diff --git a/holmes/plugins/prompts/kubernetes_workload_ask.jinja2 b/holmes/plugins/prompts/kubernetes_workload_ask.jinja2
index a1237eb..7948228 100644
--- a/holmes/plugins/prompts/kubernetes_workload_ask.jinja2
+++ b/holmes/plugins/prompts/kubernetes_workload_ask.jinja2
@@ -19,9 +19,9 @@ Use these rules when deciding how to apply them:
* Before finalizing your answer double-check if any Global Instructions apply. If so, ensure you have correctly followed those instructions.
In general:
-* when it can provide extra information, first run as many tools as you need to gather more information, then respond.
+* when it can provide extra information, first run as many tools as you need to gather more information, then respond.
* if possible, do so repeatedly with different tool calls each time to gather more information.
-* do not stop investigating until you are at the final root cause you are able to find.
+* do not stop investigating until you are at the final root cause you are able to find.
* use the "five whys" methodology to find the root cause.
* for example, if you found a problem in microservice A that is due to an error in microservice B, look at microservice B too and find the error in that.
* if you cannot find the resource/application that the user referred to, assume they made a typo or included/excluded characters like - and.
@@ -40,7 +40,7 @@ In general:
* do not give an answer like "The pod is pending" as that doesn't state why the pod is pending and how to fix it.
* do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
* if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
-* if you find errors and warning in a pods logs and you believe they indicate a real issue. consider the pod as not healthy.
+* if you find errors and warnings in a pod's logs and you believe they indicate a real issue, consider the pod as not healthy.
* if the user says something isn't working, ALWAYS:
** use kubectl_describe on the owner workload + individual pods and look for any transient issues they might have been referring to
** check the application aspects with kubectl_logs + kubectl_previous_logs and other relevant tools
@@ -76,4 +76,4 @@ Here are issues and configuration changes that happend to this kubernetes worklo
{% for a in alerts %}
{{ a }}
{% endfor %}
-{% endif %}
\ No newline at end of file
+{% endif %}
diff --git a/holmes/plugins/prompts/kubernetes_workload_chat.jinja2 b/holmes/plugins/prompts/kubernetes_workload_chat.jinja2
new file mode 100644
index 0000000..cc63b18
--- /dev/null
+++ b/holmes/plugins/prompts/kubernetes_workload_chat.jinja2
@@ -0,0 +1,38 @@
+You are a tool-calling AI assistant provided with common DevOps and IT tools that you can use to troubleshoot problems or answer questions.
+Whenever possible, you MUST first use tools to investigate, then answer the question.
+Do not say 'based on the tool output' or explicitly refer to tools at all.
+If you output an answer and then realize you need to call more tools or there are possible next steps, you may do so by calling tools at that point in time.
+
+### Context Awareness:
+Be aware that this conversation consists of follow-up questions to a prior investigation conducted for the {{resource}}.
+However, not all questions may be directly related to that investigation.
+Use results of the investigation and conversation history to maintain continuity when appropriate, ensuring efficiency in your responses.
+
+#### Results of Workload Health Check Analysis:
+{{workload_analysis}}
+
+{% if tools_called_for_workload %}
+Tools used for the workload analysis:
+{% for tool in tools_called_for_workload %}
+ {{ tool }}
+{% endfor %}
+{% endif %}
+
+
+{% include '_global_instructions.jinja2' %}
+{% include '_general_instructions.jinja2' %}
+
+Style guide:
+* Reply with terse output.
+* Be painfully concise.
+* Leave out "the" and filler words when possible.
+* Be terse but not at the expense of leaving out important data like the root cause and how to fix.
+
+Examples:
+
+User: Why did the workload-example app crash?
+(Call tool kubectl_find_resource kind=pod keyword=workload)
+(Call tool kubectl_previous_logs namespace=demos pod=workload-example-1299492-d9g9d # this pod name was found from the previous tool call)
+
+AI: `workload-example-1299492-d9g9d` crashed due to email validation error during HTTP request for /api/create_user
+Relevant logs:
diff --git a/holmes/plugins/runbooks/__init__.py b/holmes/plugins/runbooks/__init__.py
index 948a634..0259299 100644
--- a/holmes/plugins/runbooks/__init__.py
+++ b/holmes/plugins/runbooks/__init__.py
@@ -1,20 +1,19 @@
import os
import os.path
-from typing import List, Literal, Optional, Pattern, Union
+from typing import List, Optional, Pattern
-from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
+from pydantic import BaseModel, PrivateAttr
from holmes.utils.pydantic_utils import RobustaBaseConfig, load_model_from_file
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
-class IssueMatcher (RobustaBaseConfig):
- issue_id: Optional[Pattern] = None # unique id
- issue_name: Optional[Pattern] = None # not necessary unique
+
+class IssueMatcher(RobustaBaseConfig):
+ issue_id: Optional[Pattern] = None # unique id
+    issue_name: Optional[Pattern] = None  # not necessarily unique
source: Optional[Pattern] = None
-class RunbookContext(RobustaBaseConfig):
- type: "URL"
class Runbook(RobustaBaseConfig):
match: IssueMatcher
@@ -28,15 +27,18 @@ def set_path(self, path: str):
def get_path(self) -> str:
return self._path
+
class ListOfRunbooks(BaseModel):
runbooks: List[Runbook]
+
def load_runbooks_from_file(path: str) -> List[Runbook]:
data: ListOfRunbooks = load_model_from_file(ListOfRunbooks, file_path=path)
for runbook in data.runbooks:
runbook.set_path(path)
return data.runbooks
+
def load_builtin_runbooks() -> List[Runbook]:
all_runbooks = []
for filename in os.listdir(THIS_DIR):
diff --git a/holmes/plugins/runbooks/jira.yaml b/holmes/plugins/runbooks/jira.yaml
index 546fc1c..448bfb7 100644
--- a/holmes/plugins/runbooks/jira.yaml
+++ b/holmes/plugins/runbooks/jira.yaml
@@ -9,4 +9,4 @@ runbooks:
Ignore issues related to jira itself, like plugin or licensing problems.
Never give an answer like "XYZ is experiencing an issue, as indicated by the Jira issue. Further investigation is needed to determine the exact cause."
You are the agent that is supposed to investigate so do so!
- If you have references to a service or a component, start by searching for related infrastructure or resources using tools that take keywords
\ No newline at end of file
+ If you have references to a service or a component, start by searching for related infrastructure or resources using tools that take keywords
diff --git a/holmes/plugins/runbooks/kube-prometheus-stack.yaml b/holmes/plugins/runbooks/kube-prometheus-stack.yaml
index a2c59a2..b785411 100644
--- a/holmes/plugins/runbooks/kube-prometheus-stack.yaml
+++ b/holmes/plugins/runbooks/kube-prometheus-stack.yaml
@@ -7,4 +7,4 @@ runbooks:
instructions: >
Check if the cluster is a managed cluster like EKS by fetching nodes and looking at their labels.
If so, tell the user this is likely a known false positive in the kube-prometheus-stack alert because Prometheus can't scrape the scheduler which is managed by the cloud provider.
- On the other hand, if this is a self-managed Kubernetes, either the scheduler is really down (unlikely) or it is running but Prometheus can't scrape it.
\ No newline at end of file
+ On the other hand, if this is a self-managed Kubernetes, either the scheduler is really down (unlikely) or it is running but Prometheus can't scrape it.
diff --git a/holmes/plugins/sources/github/__init__.py b/holmes/plugins/sources/github/__init__.py
index 719c146..39d48e5 100644
--- a/holmes/plugins/sources/github/__init__.py
+++ b/holmes/plugins/sources/github/__init__.py
@@ -1,9 +1,8 @@
import logging
+from typing import List
from holmes.core.tool_calling_llm import LLMResult
from holmes.plugins.interfaces import SourcePlugin
from holmes.core.issue import Issue
-from typing import List, Pattern
-from holmes.core.tool_calling_llm import LLMResult
import requests
@@ -16,25 +15,26 @@ def __init__(self, url: str, owner: str, repository: str, pat: str, query: str):
self.query = query
def fetch_issues(self) -> List[Issue]:
- logging.info(f"Fetching All issues from {self.url} for repository {self.owner}/{self.repository}")
+ logging.info(
+ f"Fetching All issues from {self.url} for repository {self.owner}/{self.repository}"
+ )
try:
data = []
url = f"{self.url}/search/issues"
headers = {
"Authorization": f"token {self.pat}",
"Accept": "application/vnd.github.v3+json",
- "X-GitHub-Api-Version": "2022-11-28"
- }
- params = {
- "per_page": "100"
+ "X-GitHub-Api-Version": "2022-11-28",
}
+ params = {"per_page": "100"}
default_q = f"repo:{self.owner}/{self.repository}"
params["q"] = f"{default_q} {self.query}"
while url:
- response = requests.get(
- url=url, headers=headers, params=params)
+ response = requests.get(url=url, headers=headers, params=params)
if response.status_code != 200:
- raise Exception(f"Failed to get issues:{response.status_code} {response.text}")
+ raise Exception(
+ f"Failed to get issues:{response.status_code} {response.text}"
+ )
logging.info(f"Got {response}")
response.raise_for_status()
data.extend(response.json().get("items", []))
@@ -45,7 +45,7 @@ def fetch_issues(self) -> List[Issue]:
url = link.split(";")[0].strip()[1:-1]
return [self.convert_to_issue(issue) for issue in data]
except requests.RequestException as e:
- raise ConnectionError(f"Failed to fetch data from GitHub.") from e
+ raise ConnectionError("Failed to fetch data from GitHub.") from e
def convert_to_issue(self, github_issue):
return Issue(
@@ -62,12 +62,14 @@ def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
headers = {
"Authorization": f"token {self.pat}",
"Accept": "application/vnd.github.v3+json",
- "X-GitHub-Api-Version": "2022-11-28"
+ "X-GitHub-Api-Version": "2022-11-28",
}
response = requests.post(
url=url,
- json={"body": f"Automatic AI Investigation by Robusta:\n\n{result_data.result}\n"},
- headers=headers
+ json={
+ "body": f"Automatic AI Investigation by Robusta:\n\n{result_data.result}\n"
+ },
+ headers=headers,
)
response.raise_for_status()
diff --git a/holmes/plugins/sources/jira/__init__.py b/holmes/plugins/sources/jira/__init__.py
index 764354c..72c1b80 100644
--- a/holmes/plugins/sources/jira/__init__.py
+++ b/holmes/plugins/sources/jira/__init__.py
@@ -1,15 +1,12 @@
import logging
-from typing import List, Literal, Optional, Pattern
+from typing import List
-import humanize
import requests
-from pydantic import BaseModel, SecretStr, ValidationError, parse_obj_as, validator
from requests.auth import HTTPBasicAuth
from holmes.core.issue import Issue
-from holmes.core.tool_calling_llm import LLMResult, ToolCallingLLM, ToolCallResult
+from holmes.core.tool_calling_llm import LLMResult
from holmes.plugins.interfaces import SourcePlugin
-from holmes.plugins.utils import dict_to_markdown
class JiraSource(SourcePlugin):
@@ -25,9 +22,7 @@ def fetch_issues(self) -> List[Issue]:
response = requests.get(
f"{self.url}/rest/api/2/search",
params={"jql": self.jql_query},
- auth=HTTPBasicAuth(
- self.username, self.api_key
- ),
+ auth=HTTPBasicAuth(self.username, self.api_key),
headers={"Accept": "application/json"},
)
if response.status_code != 200:
@@ -39,7 +34,7 @@ def fetch_issues(self) -> List[Issue]:
data = response.json()
return [self.convert_to_issue(issue) for issue in data.get("issues", [])]
except requests.RequestException as e:
- raise ConnectionError(f"Failed to fetch data from Jira.") from e
+ raise ConnectionError("Failed to fetch data from Jira.") from e
def convert_to_issue(self, jira_issue):
return Issue(
@@ -63,9 +58,7 @@ def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
response = requests.post(
comment_url,
json=comment_data,
- auth=HTTPBasicAuth(
- self.username, self.api_key
- ),
+ auth=HTTPBasicAuth(self.username, self.api_key),
headers={"Accept": "application/json"},
)
response.raise_for_status()
diff --git a/holmes/plugins/sources/opsgenie/__init__.py b/holmes/plugins/sources/opsgenie/__init__.py
index b773d7e..3407f8e 100644
--- a/holmes/plugins/sources/opsgenie/__init__.py
+++ b/holmes/plugins/sources/opsgenie/__init__.py
@@ -2,14 +2,17 @@
from holmes.core.tool_calling_llm import LLMResult
from holmes.plugins.interfaces import SourcePlugin
from holmes.core.issue import Issue
-from typing import List, Pattern, Optional
+from typing import List, Optional
import requests
import markdown
OPSGENIE_TEAM_INTEGRATION_KEY_HELP = "OpsGenie Team Integration key for writing back results. (NOT a normal API Key.) Get it from Teams > YourTeamName > Integrations > Add Integration > API Key. Don't forget to turn on the integration!"
+
class OpsGenieSource(SourcePlugin):
- def __init__(self, api_key: str, query: str, team_integration_key: Optional[str] = None):
+ def __init__(
+ self, api_key: str, query: str, team_integration_key: Optional[str] = None
+ ):
self.api_key = api_key
self.query = query
self.team_integration_key = team_integration_key
@@ -21,25 +24,24 @@ def fetch_issues(self) -> List[Issue]:
url = "https://api.opsgenie.com/v2/alerts"
headers = {
"Authorization": f"GenieKey {self.api_key}",
- "Content-Type": "application/json"
- }
- params = {
- "query": self.query,
- "limit": 100
+ "Content-Type": "application/json",
}
+ params = {"query": self.query, "limit": 100}
while url:
# TODO: also fetch notes and description
response = requests.get(url, headers=headers, params=params)
logging.debug(f"Got {response.json()}")
if response.status_code != 200:
- raise Exception(f"Failed to get alerts: {response.status_code} {response.text}")
+ raise Exception(
+ f"Failed to get alerts: {response.status_code} {response.text}"
+ )
response.raise_for_status()
data.extend(response.json().get("data", []))
next_url = response.json().get("paging", {}).get("next", None)
url = next_url if next_url else None
return [self.convert_to_issue(alert) for alert in data]
except requests.RequestException as e:
- raise ConnectionError(f"Failed to fetch data from OpsGenie.") from e
+ raise ConnectionError("Failed to fetch data from OpsGenie.") from e
def convert_to_issue(self, opsgenie_alert):
return Issue(
@@ -50,11 +52,13 @@ def convert_to_issue(self, opsgenie_alert):
url=opsgenie_alert["tinyId"],
raw=opsgenie_alert,
)
-
+
def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
if self.team_integration_key is None:
- raise Exception(f"Please set '--opsgenie-team-integration-key' to write back results. This is an {OPSGENIE_TEAM_INTEGRATION_KEY_HELP}")
-
+ raise Exception(
+ f"Please set '--opsgenie-team-integration-key' to write back results. This is an {OPSGENIE_TEAM_INTEGRATION_KEY_HELP}"
+ )
+
# TODO: update description to make this more visible (right now we add a comment)
html_output = markdown.markdown(result_data.result)
logging.debug(f"HTML output: {html_output}")
@@ -62,12 +66,12 @@ def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
url = f"https://api.opsgenie.com/v2/alerts/{issue_id}/notes?identifierType=id"
headers = {
"Authorization": f"GenieKey {self.team_integration_key}",
- "Content-Type": "application/json"
+ "Content-Type": "application/json",
}
response = requests.post(
url=url,
json={"note": f"Automatic AI Investigation by Robusta:\n\n{html_output}\n"},
- headers=headers
+ headers=headers,
)
logging.debug(f"Response: {response.json()}")
response.raise_for_status()
@@ -82,4 +86,6 @@ def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
response.raise_for_status()
json_response = response.json()
if not json_response["data"]["success"]:
- raise Exception(f"Failed to write back result to OpsGenie: {json_response['data']['status']}")
+ raise Exception(
+ f"Failed to write back result to OpsGenie: {json_response['data']['status']}"
+ )
diff --git a/holmes/plugins/sources/pagerduty/__init__.py b/holmes/plugins/sources/pagerduty/__init__.py
index b72f867..36f9e9e 100644
--- a/holmes/plugins/sources/pagerduty/__init__.py
+++ b/holmes/plugins/sources/pagerduty/__init__.py
@@ -1,18 +1,21 @@
import logging
-from typing import List, Pattern, Optional
+from typing import List, Optional
import requests
-from pydantic import BaseModel
from holmes.core.issue import Issue
from holmes.core.tool_calling_llm import LLMResult
from holmes.plugins.interfaces import SourcePlugin
from holmes.utils.markdown_utils import markdown_to_plain_text
-class PagerDutySource(SourcePlugin):
- def __init__(self, api_key: str, user_email: str, incident_key: Optional[str] = None):
- self.api_url = "https://api.pagerduty.com" # currently hard-coded, can expose it if useful
+class PagerDutySource(SourcePlugin):
+ def __init__(
+ self, api_key: str, user_email: str, incident_key: Optional[str] = None
+ ):
+ self.api_url = (
+ "https://api.pagerduty.com" # currently hard-coded, can expose it if useful
+ )
self.api_key = api_key
self.user_email = user_email
self.incident_key = incident_key
@@ -22,18 +25,17 @@ def fetch_issues(self) -> List[Issue]:
try:
headers = {
"Authorization": f"Token token={self.api_key}",
- "Accept": "application/vnd.pagerduty+json;version=2"
+ "Accept": "application/vnd.pagerduty+json;version=2",
}
# excludes resolved
query_params = "?statuses[]=triggered&statuses[]=acknowledged"
if self.incident_key:
- query_params =f"{query_params}&incident_key={self.incident_key}"
+ query_params = f"{query_params}&incident_key={self.incident_key}"
response = requests.get(
- f"{self.api_url}/incidents{query_params}",
- headers=headers
+ f"{self.api_url}/incidents{query_params}", headers=headers
)
if response.status_code != 200:
print(f"Got response: {response}")
@@ -45,7 +47,7 @@ def fetch_issues(self) -> List[Issue]:
data = response.json()
return [self.convert_to_issue(issue) for issue in data.get("incidents", [])]
except requests.RequestException as e:
- raise ConnectionError(f"Failed to fetch data from PagerDuty.") from e
+ raise ConnectionError("Failed to fetch data from PagerDuty.") from e
def convert_to_issue(self, source_issue):
return Issue(
@@ -60,7 +62,9 @@ def convert_to_issue(self, source_issue):
def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
logging.info(f"Writing back result to issue {issue_id}")
if not self.user_email:
- raise Exception(f"When using --update mode, --pagerduty-user-email must be provided")
+ raise Exception(
+ "When using --update mode, --pagerduty-user-email must be provided"
+ )
try:
url = f"{self.api_url}/incidents/{issue_id}/notes"
@@ -75,22 +79,22 @@ def write_back_result(self, issue_id: str, result_data: LLMResult) -> None:
"content": f"Automatic AI Investigation by HolmesGPT:\n\n{comment}"
}
}
- response = requests.post(
- url,
- json=comment_data,
- headers=headers
- )
+ response = requests.post(url, json=comment_data, headers=headers)
response.raise_for_status()
data = response.json()
logging.debug(f"Comment added to issue {issue_id}: {data}")
except requests.RequestException as e:
- logging.error(f"Failed to write back result to PagerDuty: {e}; {e.response.text}")
+ logging.error(
+ f"Failed to write back result to PagerDuty: {e}; {e.response.text}"
+ )
raise
+
# Run with:
# poetry run python3 -m holmes.plugins.sources.pagerduty
if __name__ == "__main__":
import sys
+
pd_source = PagerDutySource(api_key=sys.argv[1], user_email=sys.argv[2])
issues = pd_source.fetch_issues()
for issue in issues:
diff --git a/holmes/plugins/sources/prometheus/__init__.py b/holmes/plugins/sources/prometheus/__init__.py
index b61916d..e69de29 100644
--- a/holmes/plugins/sources/prometheus/__init__.py
+++ b/holmes/plugins/sources/prometheus/__init__.py
@@ -1 +0,0 @@
-from .plugin import AlertManagerSource
\ No newline at end of file
diff --git a/holmes/plugins/sources/prometheus/models.py b/holmes/plugins/sources/prometheus/models.py
index 5ea0eb2..ca66e2a 100644
--- a/holmes/plugins/sources/prometheus/models.py
+++ b/holmes/plugins/sources/prometheus/models.py
@@ -3,7 +3,6 @@
from typing import Dict, List, Optional
from urllib.parse import parse_qs, unquote, urlparse
from pydantic import BaseModel, computed_field
-import humanize
# these models are used by AlertManager's push API (when alertmanager pushes alerts to us by webhook)
diff --git a/holmes/plugins/sources/prometheus/plugin.py b/holmes/plugins/sources/prometheus/plugin.py
index 31e56ac..64cb671 100644
--- a/holmes/plugins/sources/prometheus/plugin.py
+++ b/holmes/plugins/sources/prometheus/plugin.py
@@ -2,12 +2,12 @@
import logging
import re
from pathlib import Path
-from typing import List, Literal, Optional, Pattern
+from typing import List, Optional, Pattern
import humanize
import requests
import rich
-from pydantic import BaseModel, ValidationError, parse_obj_as, validator
+from pydantic import parse_obj_as
from pydantic.json import pydantic_encoder
from requests.auth import HTTPBasicAuth
import rich.segment
@@ -46,9 +46,13 @@ def __init__(
        # we don't mention --alertmanager-file to avoid confusing users - most users won't care about it
raise ValueError("--alertmanager-url must be specified")
if self.url is not None and self.filepath is not None:
- logging.warning(f"Ignoring --alertmanager-url because --alertmanager-file is specified")
+ logging.warning(
+ "Ignoring --alertmanager-url because --alertmanager-file is specified"
+ )
if self.label_filter and self.filepath is not None:
- logging.warning(f"Ignoring --label-filter because --alertmanager-file is specified")
+ logging.warning(
+ "Ignoring --label-filter because --alertmanager-file is specified"
+ )
if self.url and not (
self.url.startswith("http://") or self.url.startswith("https://")
):
@@ -126,7 +130,9 @@ def output_curl_commands(self, console: rich.console.Console) -> None:
"""
alerts = self.__fetch_issues_from_api()
for alert in alerts:
- alert_json = json.dumps([alert.model_dump()], default=pydantic_encoder) # Wrap in a list
+ alert_json = json.dumps(
+ [alert.model_dump()], default=pydantic_encoder
+ ) # Wrap in a list
curl_command = (
f"curl -X POST -H 'Content-Type: application/json' "
f"-d '{alert_json}' {self.url}/api/v2/alerts"
diff --git a/holmes/plugins/toolsets/__init__.py b/holmes/plugins/toolsets/__init__.py
index a9fb057..2fd779b 100644
--- a/holmes/plugins/toolsets/__init__.py
+++ b/holmes/plugins/toolsets/__init__.py
@@ -6,7 +6,6 @@
from holmes.core.supabase_dal import SupabaseDal
from holmes.plugins.toolsets.datetime import DatetimeToolset
from holmes.plugins.toolsets.findings import FindingsToolset
-from holmes.plugins.toolsets.grafana.common import GrafanaConfig
from holmes.plugins.toolsets.grafana.toolset_grafana_loki import GrafanaLokiToolset
from holmes.plugins.toolsets.grafana.toolset_grafana_tempo import GrafanaTempoToolset
from holmes.plugins.toolsets.internet import InternetToolset
@@ -14,8 +13,7 @@
from holmes.plugins.toolsets.prometheus import PrometheusToolset
from holmes.core.tools import Toolset, YAMLToolset
-from typing import Dict
-from typing import Optional
+from holmes.plugins.toolsets.opensearch import OpenSearchToolset
import yaml
THIS_DIR = os.path.abspath(os.path.dirname(__file__))
diff --git a/holmes/plugins/toolsets/docker.yaml b/holmes/plugins/toolsets/docker.yaml
index 6e4c482..b0e72ae 100644
--- a/holmes/plugins/toolsets/docker.yaml
+++ b/holmes/plugins/toolsets/docker.yaml
@@ -44,4 +44,3 @@ toolsets:
- name: "docker_diff"
description: "Inspect changes to files or directories on a container's filesystem"
command: "docker diff {{ container_id }}"
-
diff --git a/holmes/plugins/toolsets/findings.py b/holmes/plugins/toolsets/findings.py
index 1451bea..3f619e6 100644
--- a/holmes/plugins/toolsets/findings.py
+++ b/holmes/plugins/toolsets/findings.py
@@ -4,13 +4,18 @@
from typing import Optional
from typing_extensions import Dict
from holmes.core.supabase_dal import SupabaseDal
-from holmes.core.tools import StaticPrerequisite, Tool, ToolParameter, Toolset, ToolsetTag
+from holmes.core.tools import (
+ StaticPrerequisite,
+ Tool,
+ ToolParameter,
+ Toolset,
+ ToolsetTag,
+)
PARAM_FINDING_ID = "id"
class FetchRobustaFinding(Tool):
-
_dal: Optional[SupabaseDal]
def __init__(self, dal: Optional[SupabaseDal]):
@@ -51,8 +56,8 @@ def invoke(self, params: Dict) -> str:
return f"There was an internal error while fetching finding {finding_id}"
- def get_parameterized_one_liner(self, params:Dict) -> str:
- return f"Fetch metadata and history"
+ def get_parameterized_one_liner(self, params: Dict) -> str:
+ return "Fetch metadata and history"
class FindingsToolset(Toolset):
@@ -72,6 +77,8 @@ def __init__(self, dal: Optional[SupabaseDal]):
name="robusta",
prerequisites=[dal_prereq],
tools=[FetchRobustaFinding(dal)],
- tags=[ToolsetTag.CORE,],
- is_default=True
+ tags=[
+ ToolsetTag.CORE,
+ ],
+ is_default=True,
)
diff --git a/holmes/plugins/toolsets/grafana/base_grafana_toolset.py b/holmes/plugins/toolsets/grafana/base_grafana_toolset.py
index 9f29efc..8ec784d 100644
--- a/holmes/plugins/toolsets/grafana/base_grafana_toolset.py
+++ b/holmes/plugins/toolsets/grafana/base_grafana_toolset.py
@@ -1,5 +1,5 @@
import logging
-from typing import Any
+from typing import Any, ClassVar, Type
from holmes.core.tools import (
Tool,
Toolset,
@@ -11,17 +11,28 @@
class BaseGrafanaToolset(Toolset):
- def __init__(self, name: str, description: str, icon_url: str, tools: list[Tool]):
+ config_class: ClassVar[Type[GrafanaConfig]] = GrafanaConfig
+
+ def __init__(
+ self,
+ name: str,
+ description: str,
+ icon_url: str,
+ tools: list[Tool],
+ doc_url: str,
+ ):
super().__init__(
name=name,
description=description,
icon_url=icon_url,
+ docs_url=doc_url,
prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
tools=tools,
tags=[
ToolsetTag.CORE,
],
- enabled=False
+ enabled=False,
+ is_default=True,
)
def prerequisites_callable(self, config: dict[str, Any]) -> bool:
@@ -30,10 +41,16 @@ def prerequisites_callable(self, config: dict[str, Any]) -> bool:
return False
try:
- self._grafana_config = GrafanaConfig(**config)
- is_healthy = get_health(self._grafana_config.url, self._grafana_config.api_key)
+ self._grafana_config = BaseGrafanaToolset.config_class(**config)
+ is_healthy = get_health(
+ self._grafana_config.url, self._grafana_config.api_key
+ )
return is_healthy
except Exception:
logging.exception("Failed to set up grafana toolset")
return False
+
+ def get_example_config(self):
+ example_config = GrafanaConfig(api_key="YOUR API KEY", url="YOUR GRAFANA URL")
+ return example_config.model_dump()
diff --git a/holmes/plugins/toolsets/grafana/common.py b/holmes/plugins/toolsets/grafana/common.py
index e5c5e1a..bc7f8bd 100644
--- a/holmes/plugins/toolsets/grafana/common.py
+++ b/holmes/plugins/toolsets/grafana/common.py
@@ -1,23 +1,12 @@
from typing import Dict, Optional, Union
import uuid
import time
-import os
from pydantic import BaseModel
-
-GRAFANA_URL_ENV_NAME = "GRAFANA_URL"
-GRAFANA_API_KEY_ENV_NAME = "GRAFANA_API_KEY"
ONE_HOUR_IN_SECONDS = 3600
-class GrafanaLokiConfig(BaseModel):
- pod_name_search_key: str = "pod"
- namespace_search_key: str = "namespace"
- node_name_search_key: str = "node"
-
-
class GrafanaConfig(BaseModel):
- loki: GrafanaLokiConfig = GrafanaLokiConfig()
api_key: str
url: str
@@ -59,7 +48,7 @@ def get_datasource_id(dict: Dict, param: str) -> str:
try:
if uuid.UUID(datasource_id, version=4):
return f"uid/{datasource_id}"
- except:
+ except Exception:
pass
-
+
return datasource_id
diff --git a/holmes/plugins/toolsets/grafana/grafana_api.py b/holmes/plugins/toolsets/grafana/grafana_api.py
index a7ccb4c..7e5e1de 100644
--- a/holmes/plugins/toolsets/grafana/grafana_api.py
+++ b/holmes/plugins/toolsets/grafana/grafana_api.py
@@ -1,4 +1,3 @@
-
import logging
import requests
from typing import Any, Dict, List, Optional
@@ -6,13 +5,17 @@
from holmes.plugins.toolsets.grafana.common import headers
+
@backoff.on_exception(
backoff.expo, # Exponential backoff
requests.exceptions.RequestException, # Retry on request exceptions
max_tries=5, # Maximum retries
- giveup=lambda e: isinstance(e, requests.exceptions.HTTPError) and e.response.status_code < 500,
+ giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
+ and e.response.status_code < 500,
)
-def list_grafana_datasources(grafana_url:str, api_key: str, source_name: Optional[str] = None) -> List[Dict[str, Any]]:
+def list_grafana_datasources(
+ grafana_url: str, api_key: str, source_name: Optional[str] = None
+) -> List[Dict[str, Any]]:
"""
List all configured datasources from a Grafana instance with retry and backoff.
@@ -23,7 +26,7 @@ def list_grafana_datasources(grafana_url:str, api_key: str, source_name: Optiona
List of datasource configurations.
"""
try:
- url = f'{grafana_url}/api/datasources'
+ url = f"{grafana_url}/api/datasources"
headers_ = headers(api_key=api_key)
logging.info(f"Fetching datasources from: {url}")
@@ -35,12 +38,13 @@ def list_grafana_datasources(grafana_url:str, api_key: str, source_name: Optiona
return datasources
relevant_datasources = [
- ds for ds in datasources
- if ds['type'].lower() == source_name.lower()
+ ds for ds in datasources if ds["type"].lower() == source_name.lower()
]
for ds in relevant_datasources:
- logging.info(f"Found datasource: {ds['name']} (type: {ds['type']}, id: {ds['id']})")
+ logging.info(
+ f"Found datasource: {ds['name']} (type: {ds['type']}, id: {ds['id']})"
+ )
return relevant_datasources
except requests.exceptions.RequestException as e:
@@ -51,15 +55,16 @@ def list_grafana_datasources(grafana_url:str, api_key: str, source_name: Optiona
backoff.expo, # Exponential backoff
requests.exceptions.RequestException, # Retry on request exceptions
max_tries=5, # Maximum retries
- giveup=lambda e: isinstance(e, requests.exceptions.HTTPError) and e.response.status_code < 500,
+ giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
+ and e.response.status_code < 500,
)
-def get_health(grafana_url:str, api_key: str) -> bool:
+def get_health(grafana_url: str, api_key: str) -> bool:
try:
- url = f'{grafana_url}/api/health'
+ url = f"{grafana_url}/api/health"
headers_ = headers(api_key=api_key)
response = requests.get(url, headers=headers_, timeout=10) # Added timeout
response.raise_for_status()
return True
- except:
+ except Exception:
return False
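
As a quick illustration of the two helpers touched above, a minimal sketch (the Grafana URL and API key are assumed placeholders) that health-checks Grafana and then lists only Loki-type datasources:

from holmes.plugins.toolsets.grafana.grafana_api import get_health, list_grafana_datasources

grafana_url = "https://grafana.example.com"  # assumed placeholder
api_key = "YOUR API KEY"                     # assumed placeholder

if get_health(grafana_url, api_key):
    # source_name filters by datasource type, e.g. "loki" or "tempo"
    for ds in list_grafana_datasources(grafana_url, api_key, source_name="loki"):
        print(ds["name"], ds["type"], ds["id"])
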
diff --git a/holmes/plugins/toolsets/grafana/loki_api.py b/holmes/plugins/toolsets/grafana/loki_api.py
index a911fe5..2a8d15f 100644
--- a/holmes/plugins/toolsets/grafana/loki_api.py
+++ b/holmes/plugins/toolsets/grafana/loki_api.py
@@ -1,6 +1,5 @@
-import logging
import requests
-from typing import Dict, List, Optional
+from typing import Dict, List
import backoff
from holmes.plugins.toolsets.grafana.common import headers
diff --git a/holmes/plugins/toolsets/grafana/tempo_api.py b/holmes/plugins/toolsets/grafana/tempo_api.py
index a6d855a..6e75deb 100644
--- a/holmes/plugins/toolsets/grafana/tempo_api.py
+++ b/holmes/plugins/toolsets/grafana/tempo_api.py
@@ -1,15 +1,16 @@
-
import requests
from typing import Dict, List
import backoff
+
def execute_tempo_query_with_retry(
- grafana_url:str,
+ grafana_url: str,
api_key: str,
tempo_datasource_id: str,
query_params: dict,
retries: int = 3,
- timeout: int = 5):
+ timeout: int = 5,
+):
"""
Execute a Tempo API query through Grafana with retries and timeout.
@@ -22,21 +23,22 @@ def execute_tempo_query_with_retry(
Returns:
List of trace results.
"""
- url = f'{grafana_url}/api/datasources/proxy/{tempo_datasource_id}/api/search'
+ url = f"{grafana_url}/api/datasources/proxy/{tempo_datasource_id}/api/search"
@backoff.on_exception(
backoff.expo, # Exponential backoff
requests.exceptions.RequestException, # Retry on request exceptions
max_tries=retries, # Maximum retries
- giveup=lambda e: isinstance(e, requests.exceptions.HTTPError) and e.response.status_code < 500,
+ giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
+ and e.response.status_code < 500,
)
def make_request():
response = requests.post(
url,
headers={
- 'Authorization': f'Bearer {api_key}',
- 'Accept': 'application/json',
- 'Content-Type': 'application/json',
+ "Authorization": f"Bearer {api_key}",
+ "Accept": "application/json",
+ "Content-Type": "application/json",
},
json=query_params,
timeout=timeout, # Set timeout for the request
@@ -51,7 +53,7 @@ def make_request():
def query_tempo_traces_by_duration(
- grafana_url:str,
+ grafana_url: str,
api_key: str,
tempo_datasource_id: str,
min_duration: str,
@@ -78,11 +80,13 @@ def query_tempo_traces_by_duration(
"end": str(end),
"limit": str(limit),
}
- return execute_tempo_query_with_retry(grafana_url, api_key, tempo_datasource_id, query_params)
+ return execute_tempo_query_with_retry(
+ grafana_url, api_key, tempo_datasource_id, query_params
+ )
def query_tempo_trace_by_id(
- grafana_url:str,
+ grafana_url: str,
api_key: str,
tempo_datasource_id: str,
trace_id: str,
@@ -101,20 +105,21 @@ def query_tempo_trace_by_id(
Returns:
Trace details.
"""
- url = f'{grafana_url}/api/datasources/proxy/{tempo_datasource_id}/api/traces/{trace_id}'
+ url = f"{grafana_url}/api/datasources/proxy/{tempo_datasource_id}/api/traces/{trace_id}"
@backoff.on_exception(
backoff.expo, # Exponential backoff
requests.exceptions.RequestException, # Retry on request exceptions
max_tries=retries, # Maximum retries
- giveup=lambda e: isinstance(e, requests.exceptions.HTTPError) and e.response.status_code < 500,
+ giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
+ and e.response.status_code < 500,
)
def make_request():
response = requests.get(
url,
headers={
- 'Authorization': f'Bearer {api_key}',
- 'Accept': 'application/json',
+ "Authorization": f"Bearer {api_key}",
+ "Accept": "application/json",
},
timeout=timeout, # Set timeout for the request
)
@@ -124,13 +129,13 @@ def make_request():
try:
return make_request()
except requests.exceptions.RequestException as e:
- raise Exception(f"Failed to retrieve trace by ID after retries: {e} \n for URL: {url}")
+ raise Exception(
+ f"Failed to retrieve trace by ID after retries: {e} \n for URL: {url}"
+ )
+
def process_trace_json(trace_json):
- result = {
- "total_elapsed_time_ms": 0,
- "applications": []
- }
+ result = {"total_elapsed_time_ms": 0, "applications": []}
# First pass: Collect basic details about spans
spans_info = {}
@@ -162,7 +167,7 @@ def process_trace_json(trace_json):
"exclusive_time_ms": elapsed_time_ns / 1_000_000,
"start_time": start_time,
"end_time": end_time,
- "loki_labels": {"app": app_name}
+ "loki_labels": {"app": app_name},
}
# Second pass: Subtract child span times from parent spans
@@ -177,18 +182,20 @@ def process_trace_json(trace_json):
app_info = {
"app_name": span_data["app_name"],
"service_name": span_data["service_name"],
- #"elapsed_time_ms": span_data["elapsed_time_ms"], # this confuses the llm
+ # "elapsed_time_ms": span_data["elapsed_time_ms"], # this confuses the llm
"elapsed_service_time_ms": span_data["exclusive_time_ms"],
"start_time": span_data["start_time"],
"end_time": span_data["end_time"],
- "loki_labels": span_data["loki_labels"]
+ "loki_labels": span_data["loki_labels"],
}
if app_info["app_name"]:
result["applications"].append(app_info)
# Set the total elapsed time to the root span's time (if available)
- root_span = max(spans_info.values(), key=lambda x: x["elapsed_time_ms"], default=None)
+ root_span = max(
+ spans_info.values(), key=lambda x: x["elapsed_time_ms"], default=None
+ )
if root_span:
result["total_elapsed_time_ms"] = root_span["elapsed_time_ms"]
diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py b/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py
index 0023359..1cc0cf2 100644
--- a/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py
+++ b/holmes/plugins/toolsets/grafana/toolset_grafana_loki.py
@@ -5,6 +5,7 @@
from holmes.core.tools import Tool, ToolParameter
from holmes.plugins.toolsets.grafana.base_grafana_toolset import BaseGrafanaToolset
from holmes.plugins.toolsets.grafana.common import (
+ GrafanaConfig,
get_datasource_id,
get_param_or_raise,
process_timestamps,
@@ -17,8 +18,13 @@
)
-class ListLokiDatasources(Tool):
+class GrafanaLokiConfig(GrafanaConfig):
+ pod_name_search_key: str = "pod"
+ namespace_search_key: str = "namespace"
+ node_name_search_key: str = "node"
+
+class ListLokiDatasources(Tool):
def __init__(self, toolset: BaseGrafanaToolset):
super().__init__(
name="list_loki_datasources",
@@ -40,7 +46,6 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
class GetLokiLogsByNode(Tool):
-
def __init__(self, toolset: BaseGrafanaToolset):
super().__init__(
name="fetch_loki_logs_by_node",
@@ -84,7 +89,7 @@ def invoke(self, params: Dict) -> str:
api_key=self._toolset._grafana_config.api_key,
loki_datasource_id=get_datasource_id(params, "loki_datasource_id"),
node_name=get_param_or_raise(params, "node_name"),
- node_name_search_key=self._toolset._grafana_config.loki.node_name_search_key,
+ node_name_search_key=self._toolset._grafana_config.node_name_search_key,
start=start,
end=end,
limit=int(get_param_or_raise(params, "limit")),
@@ -158,7 +163,6 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
class GetLokiLogsByPod(Tool):
-
def __init__(self, toolset: BaseGrafanaToolset):
super().__init__(
name="fetch_loki_logs_by_pod",
@@ -208,8 +212,8 @@ def invoke(self, params: Dict) -> str:
loki_datasource_id=get_datasource_id(params, "loki_datasource_id"),
pod_regex=get_param_or_raise(params, "pod_regex"),
namespace=get_param_or_raise(params, "namespace"),
- namespace_search_key=self._toolset._grafana_config.loki.namespace_search_key,
- pod_name_search_key=self._toolset._grafana_config.loki.pod_name_search_key,
+ namespace_search_key=self._toolset._grafana_config.namespace_search_key,
+ pod_name_search_key=self._toolset._grafana_config.pod_name_search_key,
start=start,
end=end,
limit=int(get_param_or_raise(params, "limit")),
@@ -221,11 +225,14 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
class GrafanaLokiToolset(BaseGrafanaToolset):
+ config_class = GrafanaLokiConfig
+
def __init__(self):
super().__init__(
name="grafana/loki",
description="Fetchs kubernetes pods and node logs from Loki",
icon_url="https://grafana.com/media/docs/loki/logo-grafana-loki.png",
+ doc_url="https://grafana.com/oss/loki/",
tools=[
ListLokiDatasources(self),
GetLokiLogsByNode(self),
@@ -233,3 +240,9 @@ def __init__(self):
GetLokiLogsByLabel(self),
],
)
+
+ def get_example_config(self):
+ example_config = GrafanaLokiConfig(
+ api_key="YOUR API KEY", url="YOUR GRAFANA URL"
+ )
+ return example_config.model_dump()
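
For reference, a minimal sketch of the new per-toolset configuration: the Loki search keys that previously lived under GrafanaConfig.loki now sit directly on GrafanaLokiConfig (values below are assumed placeholders; the search keys show their defaults explicitly):

from holmes.plugins.toolsets.grafana.toolset_grafana_loki import GrafanaLokiConfig

config = GrafanaLokiConfig(
    api_key="YOUR API KEY",             # assumed placeholder
    url="https://grafana.example.com",  # assumed placeholder
    pod_name_search_key="pod",          # defaults, shown for clarity
    namespace_search_key="namespace",
    node_name_search_key="node",
)
print(config.model_dump())
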
diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py
index 815ee27..ef9fa2a 100644
--- a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py
+++ b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py
@@ -11,7 +11,6 @@
)
from holmes.plugins.toolsets.grafana.grafana_api import list_grafana_datasources
from holmes.plugins.toolsets.grafana.common import (
- GrafanaConfig,
get_datasource_id,
get_param_or_raise,
process_timestamps,
@@ -19,7 +18,6 @@
class ListAllDatasources(Tool):
-
def __init__(self, toolset: BaseGrafanaToolset):
super().__init__(
name="list_all_datasources",
@@ -40,7 +38,6 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
class GetTempoTracesByMinDuration(Tool):
-
def __init__(self, toolset: BaseGrafanaToolset):
super().__init__(
name="fetch_tempo_traces_by_min_duration",
@@ -97,7 +94,6 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
class GetTempoTraceById(Tool):
-
def __init__(self, toolset: BaseGrafanaToolset):
super().__init__(
name="fetch_tempo_trace_by_id",
@@ -131,12 +127,12 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
class GrafanaTempoToolset(BaseGrafanaToolset):
-
def __init__(self):
super().__init__(
name="grafana/tempo",
description="Fetchs kubernetes traces from Tempo",
icon_url="https://grafana.com/static/assets/img/blog/tempo.png",
+ doc_url="https://grafana.com/oss/tempo/",
tools=[
ListAllDatasources(self),
GetTempoTracesByMinDuration(self),
diff --git a/holmes/plugins/toolsets/helm.yaml b/holmes/plugins/toolsets/helm.yaml
index f1ec5e6..bd3c82e 100644
--- a/holmes/plugins/toolsets/helm.yaml
+++ b/holmes/plugins/toolsets/helm.yaml
@@ -12,11 +12,11 @@ toolsets:
- name: "helm_list"
description: "Use to get all the current helm releases"
command: "helm list"
-
+
- name: "helm_values"
description: "Use to gather Helm values or any released helm chart"
command: "helm get values -a {{ release_name }} -n {{ namespace }} -o json"
-
+
- name: "helm_status"
description: "Check the status of a Helm release"
command: "helm status {{ release_name }} -n {{ namespace }}"
@@ -39,4 +39,4 @@ toolsets:
- name: "helm_notes"
description: "Show the notes provided by the Helm chart"
- command: "helm get notes {{ release_name }} -n {{ namespace }}"
\ No newline at end of file
+ command: "helm get notes {{ release_name }} -n {{ namespace }}"
diff --git a/holmes/plugins/toolsets/internet.py b/holmes/plugins/toolsets/internet.py
index 50cb745..421e305 100644
--- a/holmes/plugins/toolsets/internet.py
+++ b/holmes/plugins/toolsets/internet.py
@@ -12,49 +12,61 @@
import requests
# TODO: change and make it holmes
-INTERNET_TOOLSET_USER_AGENT = os.environ.get("INTERNET_TOOLSET_USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64; rv:128.0; holmesgpt;) Gecko/20100101 Firefox/128.0")
-INTERNET_TOOLSET_TIMEOUT_SECONDS = int(os.environ.get("INTERNET_TOOLSET_TIMEOUT_SECONDS", "60"))
+INTERNET_TOOLSET_USER_AGENT = os.environ.get(
+ "INTERNET_TOOLSET_USER_AGENT",
+ "Mozilla/5.0 (X11; Linux x86_64; rv:128.0; holmesgpt;) Gecko/20100101 Firefox/128.0",
+)
+INTERNET_TOOLSET_TIMEOUT_SECONDS = int(
+ os.environ.get("INTERNET_TOOLSET_TIMEOUT_SECONDS", "60")
+)
SELECTORS_TO_REMOVE = [
- 'script', 'style', 'meta', 'link', 'noscript',
- 'header', 'footer', 'nav',
- 'iframe', 'svg', 'img',
- 'button',
- 'menu', 'sidebar', 'aside',
- '.header'
- '.footer'
- '.navigation',
- '.nav',
- '.menu',
- '.sidebar',
- '.ad',
- '.advertisement',
- '.social',
- '.popup',
- '.modal',
- '.banner',
- '.cookie-notice',
- '.social-share',
- '.related-articles',
- '.recommended',
- '#header'
- '#footer'
- '#navigation',
- '#nav',
- '#menu',
- '#sidebar',
- '#ad',
- '#advertisement',
- '#social',
- '#popup',
- '#modal',
- '#banner',
- '#cookie-notice',
- '#social-share',
- '#related-articles',
- '#recommended'
+ "script",
+ "style",
+ "meta",
+ "link",
+ "noscript",
+ "header",
+ "footer",
+ "nav",
+ "iframe",
+ "svg",
+ "img",
+ "button",
+ "menu",
+ "sidebar",
+ "aside",
+ ".header" ".footer" ".navigation",
+ ".nav",
+ ".menu",
+ ".sidebar",
+ ".ad",
+ ".advertisement",
+ ".social",
+ ".popup",
+ ".modal",
+ ".banner",
+ ".cookie-notice",
+ ".social-share",
+ ".related-articles",
+ ".recommended",
+ "#header" "#footer" "#navigation",
+ "#nav",
+ "#menu",
+ "#sidebar",
+ "#ad",
+ "#advertisement",
+ "#social",
+ "#popup",
+ "#modal",
+ "#banner",
+ "#cookie-notice",
+ "#social-share",
+ "#related-articles",
+ "#recommended",
]
+
def scrape(url) -> Tuple[Optional[str], Optional[str]]:
response = None
content = None
@@ -62,16 +74,14 @@ def scrape(url) -> Tuple[Optional[str], Optional[str]]:
try:
response = requests.get(
url,
- headers={
- 'User-Agent': INTERNET_TOOLSET_USER_AGENT
- },
- timeout=INTERNET_TOOLSET_TIMEOUT_SECONDS
+ headers={"User-Agent": INTERNET_TOOLSET_USER_AGENT},
+ timeout=INTERNET_TOOLSET_TIMEOUT_SECONDS,
)
response.raise_for_status()
except Timeout:
logging.error(
f"Failed to load {url}. Timeout after {INTERNET_TOOLSET_TIMEOUT_SECONDS} seconds",
- exc_info=True
+ exc_info=True,
)
except RequestException as e:
logging.error(f"Failed to load {url}: {str(e)}", exc_info=True)
@@ -80,15 +90,18 @@ def scrape(url) -> Tuple[Optional[str], Optional[str]]:
if response:
content = response.text
try:
- content_type = response.headers['content-type']
+ content_type = response.headers["content-type"]
if content_type:
mime_type = content_type.split(";")[0]
except Exception:
- logging.info(f"Failed to parse content type from headers {response.headers}")
+ logging.info(
+ f"Failed to parse content type from headers {response.headers}"
+ )
return (content, mime_type)
-def cleanup(soup:BeautifulSoup):
+
+def cleanup(soup: BeautifulSoup):
"""Remove all elements that are irrelevant to the textual representation of a web page.
This includes images, extra data, even links as there is no intention to navigate from that page.
"""
@@ -105,9 +118,7 @@ def cleanup(soup:BeautifulSoup):
return soup
-
-def html_to_markdown(page_source:str):
-
+def html_to_markdown(page_source: str):
soup = BeautifulSoup(page_source, "html.parser")
soup = cleanup(soup)
page_source = str(soup)
@@ -156,7 +167,6 @@ def __init__(self):
)
def invoke(self, params: Any) -> str:
-
url: str = params["url"]
content, mime_type = scrape(url)
@@ -185,6 +195,8 @@ def __init__(self):
icon_url="https://platform.robusta.dev/demos/internet-access.svg",
prerequisites=[],
tools=[FetchWebpage()],
- tags=[ToolsetTag.CORE,],
- is_default=True
+ tags=[
+ ToolsetTag.CORE,
+ ],
+ is_default=True,
)
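
A minimal sketch of the scraping path above, using an assumed URL: fetch the page, and if it came back as HTML, strip the noisy selectors and convert it to markdown for the LLM:

from holmes.plugins.toolsets.internet import html_to_markdown, scrape

content, mime_type = scrape("https://example.com/runbook")  # assumed URL
if content and mime_type == "text/html":
    print(html_to_markdown(content))
elif content:
    print(content)  # non-HTML content is usable as-is
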
diff --git a/holmes/plugins/toolsets/kubernetes.yaml b/holmes/plugins/toolsets/kubernetes.yaml
index 3194805..33b8190 100644
--- a/holmes/plugins/toolsets/kubernetes.yaml
+++ b/holmes/plugins/toolsets/kubernetes.yaml
@@ -10,11 +10,11 @@ toolsets:
tools:
- name: "kubectl_describe"
- description: >
- Run kubectl describe -n ,
- call this when users ask for description,
+ description: >
+          Run kubectl describe <kind> <name> -n <namespace>,
+ call this when users ask for description,
for example when a user asks
- - 'describe pod xyz-123'
+ - 'describe pod xyz-123'
- 'show service xyz-123 in namespace my-ns'
command: "kubectl describe {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}"
@@ -29,7 +29,7 @@ toolsets:
- name: "kubectl_get_by_kind_in_cluster"
description: "Run `kubectl get -A --show-labels` to get all resources of a given type in the cluster"
command: "kubectl get -A --show-labels -o wide {{ kind }}"
-
+
- name: "kubectl_find_resource"
description: "Run `kubectl get {{ kind }} -A --show-labels | grep {{ keyword }}` to find a resource where you know a substring of the name, IP, namespace, or labels"
command: "kubectl get -A --show-labels -o wide {{ kind }} | grep {{ keyword }}"
@@ -41,7 +41,7 @@ toolsets:
- name: "kubectl_events"
description: "Retrieve the events for a specific Kubernetes resource. `resource_type` can be any kubernetes resource type: 'pod', 'service', 'deployment, 'job'', 'node', etc."
command: "kubectl events --for {{resource_type}}/{{ pod_name }} -n {{ namespace }}"
-
+
- name: "kubectl_memory_requests_all_namespaces"
description: "Fetch and display memory requests for all pods across all namespaces in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly."
command: |
@@ -125,12 +125,12 @@ toolsets:
}
print namespace, name, sum_memory(requests) " Mi";
}' | sort -k3 -nr
-
+
- name: "kubernetes_jq_query"
- description: >
+ description: >
          Use kubectl to get json for all resources of a specific kind and pipe the results to jq to filter them. Do not worry about escaping the jq_expr; it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | .spec.containers[].image | select(test("^gcr.io/") | not)
command: kubectl get {{ kind }} --all-namespaces -o json | jq -r {{ jq_expr }}
-
+
# NOTE: this is only possible for probes with a healthz endpoint - we do this to avoid giving the LLM generic
# http GET capabilities which are more powerful than we want to expose
diff --git a/holmes/plugins/toolsets/opensearch.py b/holmes/plugins/toolsets/opensearch.py
index 84d4c73..5e824c5 100644
--- a/holmes/plugins/toolsets/opensearch.py
+++ b/holmes/plugins/toolsets/opensearch.py
@@ -1,7 +1,7 @@
import logging
from typing import Any, Dict, List, Optional
-from pydantic import ConfigDict
+from pydantic import BaseModel, ConfigDict
from holmes.core.tools import (
CallablePrerequisite,
Tool,
@@ -12,18 +12,45 @@
from opensearchpy import OpenSearch
+class OpenSearchHttpAuth(BaseModel):
+ username: str
+ password: str
+
+
+class OpenSearchHost(BaseModel):
+ host: str
+ port: int = 9200
+
+
+class OpenSearchCluster(BaseModel):
+ hosts: list[OpenSearchHost]
+ headers: Optional[dict[str, Any]] = None
+ use_ssl: bool = True
+ ssl_assert_hostname: bool = False
+ verify_certs: bool = False
+ ssl_show_warn: bool = False
+ http_auth: Optional[OpenSearchHttpAuth] = None
+
+
+class OpenSearchConfig(BaseModel):
+ opensearch_clusters: list[OpenSearchCluster]
+
+
class OpenSearchClient:
def __init__(self, **kwargs):
-
# Handle http_auth explicitly
if "http_auth" in kwargs:
http_auth = kwargs.pop("http_auth")
if isinstance(http_auth, dict):
- kwargs["http_auth"] = (http_auth.get("username"), http_auth.get("password"))
+ kwargs["http_auth"] = (
+ http_auth.get("username"),
+ http_auth.get("password"),
+ )
# Initialize OpenSearch client
self.client = OpenSearch(**kwargs)
-def get_client(clients:List[OpenSearchClient], host:Optional[str]):
+
+def get_client(clients: List[OpenSearchClient], host: Optional[str]):
if len(clients) == 1:
return clients[0]
@@ -46,7 +73,6 @@ class BaseOpenSearchTool(Tool):
class ListShards(BaseOpenSearchTool):
-
def __init__(self, toolset: "OpenSearchToolset"):
super().__init__(
name="opensearch_list_shards",
@@ -71,7 +97,6 @@ def get_parameterized_one_liner(self, params: Dict) -> str:
class GetClusterSettings(BaseOpenSearchTool):
-
def __init__(self, toolset: "OpenSearchToolset"):
super().__init__(
name="opensearch_get_cluster_settings",
@@ -98,7 +123,6 @@ def get_parameterized_one_liner(self, params) -> str:
class GetClusterHealth(BaseOpenSearchTool):
-
def __init__(self, toolset: "OpenSearchToolset"):
super().__init__(
name="opensearch_get_cluster_health",
@@ -127,13 +151,12 @@ class OpenSearchToolset(Toolset):
clients: List[OpenSearchClient] = []
def __init__(self):
-
super().__init__(
name="opensearch",
enabled=False,
description="Provide cluster metadata information like health, shards, settings.",
docs_url="https://opensearch.org/docs/latest/clients/python-low-level/",
- icon_url="https://upload.wikimedia.org/wikipedia/commons/9/91/Opensearch_Logo.svg",
+ icon_url="https://opensearch.org/assets/brand/PNG/Mark/opensearch_mark_default.png",
prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
tools=[
ListShards(self),
@@ -143,21 +166,40 @@ def __init__(self):
tags=[
ToolsetTag.CORE,
],
- is_default=False,
+ is_default=True,
)
def prerequisites_callable(self, config: dict[str, Any]) -> bool:
if not config:
return False
- clusters_configs: list[dict[str, Any]] = config.get("opensearch_clusters", [])
- for cluster in clusters_configs:
- try:
- logging.info(f"Setting up OpenSearch client")
- client = OpenSearchClient(**cluster)
- if client.client.cluster.health(params={"timeout": 5}):
- self.clients.append(client)
- except Exception:
- logging.exception("Failed to set up opensearch client")
-
- return len(self.clients) > 0
+ try:
+ os_config = OpenSearchConfig(**config)
+
+ for cluster in os_config.opensearch_clusters:
+ try:
+ logging.info("Setting up OpenSearch client")
+ cluster_kwargs = cluster.model_dump()
+ client = OpenSearchClient(**cluster_kwargs)
+ if client.client.cluster.health(params={"timeout": 5}):
+ self.clients.append(client)
+ except Exception:
+ logging.exception("Failed to set up opensearch client")
+
+ return len(self.clients) > 0
+ except Exception:
+ logging.exception("Failed to set up grafana toolset")
+ return False
+
+ def get_example_config(self) -> Dict[str, Any]:
+ example_config = OpenSearchConfig(
+ opensearch_clusters=[
+ OpenSearchCluster(
+                hosts=[OpenSearchHost(host="YOUR OPENSEARCH HOST")],
+ headers={"Authorization": "{{ env.OPENSEARCH_BEARER_TOKEN }}"},
+ use_ssl=True,
+ ssl_assert_hostname=False,
+ )
+ ]
+ )
+ return example_config.model_dump()
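
A minimal sketch (host, token, and credentials are assumed placeholders) of the config dict the toolset now validates through OpenSearchConfig before building clients; each cluster entry mirrors OpenSearchCluster and OpenSearchHost above:

from holmes.plugins.toolsets.opensearch import OpenSearchToolset

config = {
    "opensearch_clusters": [
        {
            "hosts": [{"host": "opensearch.example.com", "port": 9200}],      # assumed host
            "headers": {"Authorization": "Bearer YOUR_TOKEN"},                # assumed token
            "use_ssl": True,
            "http_auth": {"username": "admin", "password": "YOUR_PASSWORD"},  # assumed credentials
        }
    ]
}

toolset = OpenSearchToolset()
if toolset.prerequisites_callable(config):
    print(f"Connected to {len(toolset.clients)} OpenSearch cluster(s)")
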
diff --git a/holmes/plugins/utils.py b/holmes/plugins/utils.py
index d381be3..4a56bcf 100644
--- a/holmes/plugins/utils.py
+++ b/holmes/plugins/utils.py
@@ -1,6 +1,7 @@
# this file contains utilities that plugin writers are likely to use - not utilities that are only relevant for core
from typing import Dict
+
def dict_to_markdown(items: Dict[str, str]) -> str:
if not items:
return ""
@@ -10,4 +11,4 @@ def dict_to_markdown(items: Dict[str, str]) -> str:
# TODO: if v is a url, linkify it
text += f"• *{k}*: {v}\n"
- return text
\ No newline at end of file
+ return text
diff --git a/holmes/utils/default_toolset_installation_guide.jinja2 b/holmes/utils/default_toolset_installation_guide.jinja2
index 1cb32dd..d8c8775 100644
--- a/holmes/utils/default_toolset_installation_guide.jinja2
+++ b/holmes/utils/default_toolset_installation_guide.jinja2
@@ -1,5 +1,5 @@
{% if enabled %}
-This integration is enabled by default.
+This integration is enabled by default.
If you would like to disable this toolset (not recommended), you need to update the `generated_values.yaml` configuration.
@@ -25,6 +25,10 @@ holmes:
toolsets:
{{toolset_name}}:
enabled: true
+ {% if example_config %}
+ config:
+ {{ example_config | indent(8) }}
+ {% endif %}
```
{% endif %}
@@ -33,4 +37,4 @@ And deploy the updated configuration using Helm:
```bash
helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=
-```
\ No newline at end of file
+```
diff --git a/holmes/utils/definitions.py b/holmes/utils/definitions.py
index b316eb3..61403f2 100644
--- a/holmes/utils/definitions.py
+++ b/holmes/utils/definitions.py
@@ -7,4 +7,4 @@
class RobustaConfig(BaseModel):
sinks_config: List[Dict[str, Dict]]
- global_config: dict
\ No newline at end of file
+ global_config: dict
diff --git a/holmes/utils/file_utils.py b/holmes/utils/file_utils.py
index 5847942..ea4b9e6 100644
--- a/holmes/utils/file_utils.py
+++ b/holmes/utils/file_utils.py
@@ -8,8 +8,8 @@ def write_json_file(json_output_file: str, json_ob_to_dump):
dirname = os.path.dirname(json_output_file)
if dirname:
os.makedirs(dirname, exist_ok=True)
- with open(json_output_file , 'w' , encoding='utf-8') as f:
+ with open(json_output_file, "w", encoding="utf-8") as f:
json.dump(json_ob_to_dump, f, ensure_ascii=False, indent=4, default=str)
- except Exception as e:
- logging.exception(f"Failed to create the json file.")
+ except Exception:
+ logging.exception("Failed to create the json file.")
return
diff --git a/holmes/utils/global_instructions.py b/holmes/utils/global_instructions.py
index 226a060..d4d8187 100644
--- a/holmes/utils/global_instructions.py
+++ b/holmes/utils/global_instructions.py
@@ -2,7 +2,13 @@
from holmes.core.tool_calling_llm import Instructions
-def add_global_instructions_to_user_prompt(user_prompt: str, global_instructions: Optional[Instructions]) -> str:
- if global_instructions and global_instructions.instructions and len(global_instructions.instructions[0]) > 0:
+def add_global_instructions_to_user_prompt(
+ user_prompt: str, global_instructions: Optional[Instructions]
+) -> str:
+ if (
+ global_instructions
+ and global_instructions.instructions
+ and len(global_instructions.instructions[0]) > 0
+ ):
user_prompt += f"\n\nGlobal Instructions (use only if relevant): {global_instructions.instructions[0]}\n"
return user_prompt
diff --git a/holmes/utils/holmes_status.py b/holmes/utils/holmes_status.py
index 60733a9..ad17b3f 100644
--- a/holmes/utils/holmes_status.py
+++ b/holmes/utils/holmes_status.py
@@ -6,11 +6,13 @@
def update_holmes_status_in_db(dal: SupabaseDal, config: Config):
logging.info("Updating status of holmes")
-
+
if not config.cluster_name:
- raise Exception("Cluster name is missing in the configuration. Please ensure 'CLUSTER_NAME' is defined in the environment variables, "
- "or verify that a cluster name is provided in the Robusta configuration file.")
-
+ raise Exception(
+ "Cluster name is missing in the configuration. Please ensure 'CLUSTER_NAME' is defined in the environment variables, "
+ "or verify that a cluster name is provided in the Robusta configuration file."
+ )
+
dal.upsert_holmes_status(
{
"cluster_id": config.cluster_name,
diff --git a/holmes/utils/holmes_sync_toolsets.py b/holmes/utils/holmes_sync_toolsets.py
index a485b03..340f199 100644
--- a/holmes/utils/holmes_sync_toolsets.py
+++ b/holmes/utils/holmes_sync_toolsets.py
@@ -1,4 +1,7 @@
from datetime import datetime
+from typing import Any
+
+import yaml
from holmes.config import Config
@@ -39,7 +42,7 @@ def holmes_sync_toolsets_status(dal: SupabaseDal, config: Config) -> None:
account_id=dal.account_id,
status=toolset.get_status(),
error=toolset.get_error(),
- updated_at=updated_at
+ updated_at=updated_at,
).model_dump(exclude_none=True)
)
dal.sync_toolsets(db_toolsets, config.cluster_name)
@@ -47,19 +50,17 @@ def holmes_sync_toolsets_status(dal: SupabaseDal, config: Config) -> None:
def render_default_installation_instructions_for_toolset(toolset: Toolset) -> str:
env_vars = toolset.get_environment_variables()
- context = {
+ context: dict[str, Any] = {
"env_vars": env_vars if env_vars else [],
"toolset_name": toolset.name,
"enabled": toolset.enabled,
- "default_toolset": toolset.is_default,
+ "example_config": yaml.dump(toolset.get_example_config()),
}
- if toolset.is_default:
- installation_instructions = load_and_render_prompt(
- "file://holmes/utils/default_toolset_installation_guide.jinja2", context
- )
- return installation_instructions
- installation_instructions = load_and_render_prompt(
- "file://holmes/utils/installation_guide.jinja2", context
+ template = (
+ "file://holmes/utils/default_toolset_installation_guide.jinja2"
+ if toolset.is_default
+ else "file://holmes/utils/installation_guide.jinja2"
)
+ installation_instructions = load_and_render_prompt(template, context)
return installation_instructions
diff --git a/holmes/utils/installation_guide.jinja2 b/holmes/utils/installation_guide.jinja2
index bbb8c0e..c1e4393 100644
--- a/holmes/utils/installation_guide.jinja2
+++ b/holmes/utils/installation_guide.jinja2
@@ -22,4 +22,4 @@ holmes:
# Add other configurations as needed
tools:
# Define the tools included in this toolset
-```
\ No newline at end of file
+```
diff --git a/holmes/utils/markdown_utils.py b/holmes/utils/markdown_utils.py
index d29aa16..3f43eaa 100644
--- a/holmes/utils/markdown_utils.py
+++ b/holmes/utils/markdown_utils.py
@@ -42,13 +42,14 @@ def to_plain_text(element):
class PlainTextExtension(Extension):
- def extendMarkdown(self, md):
- md.serializer = to_plain_text
+ def extendMarkdown(self, md):
+ md.serializer = to_plain_text
md.stripTopLevelTags = False
        # Extension registration actually runs before the format is set and ends up rewriting the serializer that we have just changed
md.set_output_format = lambda x: x
+
def markdown_to_plain_text(text):
md = Markdown(extensions=[PlainTextExtension()])
- return md.convert(text)
\ No newline at end of file
+ return md.convert(text)
diff --git a/holmes/utils/pydantic_utils.py b/holmes/utils/pydantic_utils.py
index 2d3c806..620985f 100644
--- a/holmes/utils/pydantic_utils.py
+++ b/holmes/utils/pydantic_utils.py
@@ -2,7 +2,6 @@
from typing import Any, Dict, List, Tuple, Type, Union, Annotated
import typer
-import yaml
from benedict import benedict
from pydantic import BaseModel, ValidationError, BeforeValidator, ConfigDict
@@ -10,8 +9,10 @@
PromptField = Annotated[str, BeforeValidator(lambda v: load_prompt(v))]
+
class RobustaBaseConfig(BaseModel):
- model_config = ConfigDict(extra='forbid', validate_default=True)
+ model_config = ConfigDict(extra="forbid", validate_default=True)
+
def loc_to_dot_sep(loc: Tuple[Union[str, int], ...]) -> str:
path = ""
@@ -34,9 +35,7 @@ def convert_errors(e: ValidationError) -> List[Dict[str, Any]]:
return new_errors
-def load_model_from_file(
- model: Type[BaseModel], file_path: str, yaml_path: str = None
-):
+def load_model_from_file(model: Type[BaseModel], file_path: str, yaml_path: str = None):
try:
contents = benedict(file_path, format="yaml")
if yaml_path is not None:
diff --git a/holmes/utils/robusta.py b/holmes/utils/robusta.py
index 619d5d7..cc201bd 100644
--- a/holmes/utils/robusta.py
+++ b/holmes/utils/robusta.py
@@ -1,4 +1,3 @@
-
import os
from holmes.config import Config
@@ -6,7 +5,7 @@
from pydantic import SecretStr
-def load_robusta_api_key(dal:SupabaseDal, config:Config):
+def load_robusta_api_key(dal: SupabaseDal, config: Config):
if os.environ.get("ROBUSTA_AI"):
account_id, token = dal.get_ai_credentials()
config.api_key = SecretStr(f"{account_id} {token}")
diff --git a/holmes/utils/tags.py b/holmes/utils/tags.py
index 0242dcd..e71e6b8 100644
--- a/holmes/utils/tags.py
+++ b/holmes/utils/tags.py
@@ -1,5 +1,3 @@
-
-
import logging
from typing import Optional
from typing_extensions import Dict, List
@@ -7,7 +5,8 @@
import json
from copy import deepcopy
-def stringify_tag(tag:Dict[str, str]) -> Optional[str]:
+
+def stringify_tag(tag: Dict[str, str]) -> Optional[str]:
"""
This serializes a dictionary into something more readable to the LLM.
Although I have not seen much difference in quality of output, in theory this can help the LLM
@@ -42,7 +41,8 @@ def stringify_tag(tag:Dict[str, str]) -> Optional[str]:
return formatted_string
-def format_tags_in_string(user_prompt:str) -> str:
+
+def format_tags_in_string(user_prompt: str) -> str:
"""
Formats the tags included in a user's message.
E.g.
@@ -50,7 +50,7 @@ def format_tags_in_string(user_prompt:str) -> str:
-> 'how many pods are running on node my-node?'
"""
try:
- pattern = r'<<(.*?)>>'
+ pattern = r"<<(.*?)>>"
def replace_match(match):
try:
@@ -68,13 +68,13 @@ def replace_match(match):
return user_prompt
-def parse_messages_tags(messages:List[Dict[str, str]]) -> List[Dict[str, str]]:
+def parse_messages_tags(messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
"""
- Parses the user messages for tags and format these.
- System messages and llm responses are ignored and left as-is
+    Parses the user messages for tags and formats them.
+    System messages and LLM responses are ignored and left as-is.
- This method returns a shallow copy of the messages list with the exception
- of the messages that have been parsed.
+ This method returns a shallow copy of the messages list with the exception
+ of the messages that have been parsed.
"""
formatted_messages = []
for message in messages:
@@ -85,7 +85,9 @@ def parse_messages_tags(messages:List[Dict[str, str]]) -> List[Dict[str, str]]:
formatted_message = deepcopy(message)
formatted_message["content"] = formatted_str
formatted_messages.append(formatted_message)
- logging.debug(f"Message with tags '{original_message}' formatted to '{formatted_message}'")
+ logging.debug(
+ f"Message with tags '{original_message}' formatted to '{formatted_message}'"
+ )
else:
formatted_messages.append(message)
diff --git a/loki/docker-compose.yaml b/loki/docker-compose.yaml
index b8a4a75..1201e20 100644
--- a/loki/docker-compose.yaml
+++ b/loki/docker-compose.yaml
@@ -33,7 +33,7 @@ services:
datasources:
- name: Loki
type: loki
- access: proxy
+ access: proxy
orgId: 1
url: http://loki:3100
basicAuth: false
diff --git a/server.py b/server.py
index 8df2261..a9f2a0f 100644
--- a/server.py
+++ b/server.py
@@ -1,3 +1,4 @@
+# ruff: noqa: E402
import os
from holmes.utils.cert_utils import add_custom_certificate
@@ -10,16 +11,13 @@
from holmes.core import investigation
from contextlib import asynccontextmanager
from holmes.utils.holmes_status import update_holmes_status_in_db
-import jinja2
import logging
import uvicorn
import colorlog
-import uuid
import time
from litellm.exceptions import AuthenticationError
from fastapi import FastAPI, HTTPException, Request
-from rich.console import Console
from holmes.utils.robusta import load_robusta_api_key
from holmes.common.env_vars import (
@@ -34,8 +32,8 @@
build_chat_messages,
build_issue_chat_messages,
handle_issue_conversation,
+ build_workload_health_chat_messages,
)
-from holmes.core.issue import Issue
from holmes.core.models import (
InvestigationResult,
ConversationRequest,
@@ -45,6 +43,7 @@
ChatRequest,
ChatResponse,
IssueChatRequest,
+ WorkloadHealthChatRequest,
)
from holmes.plugins.prompts import load_and_render_prompt
from holmes.utils.holmes_sync_toolsets import holmes_sync_toolsets_status
@@ -91,6 +90,7 @@ async def lifespan(app: FastAPI):
if LOG_PERFORMANCE:
+
@app.middleware("http")
async def log_requests(request: Request, call_next):
start_time = time.time()
@@ -101,18 +101,19 @@ async def log_requests(request: Request, call_next):
finally:
process_time = int((time.time() - start_time) * 1000)
- status_code = 'unknown'
+ status_code = "unknown"
if response:
status_code = response.status_code
- logging.info(f"Request completed {request.method} {request.url.path} status={status_code} latency={process_time}ms")
+ logging.info(
+ f"Request completed {request.method} {request.url.path} status={status_code} latency={process_time}ms"
+ )
+
@app.post("/api/investigate")
def investigate_issues(investigate_request: InvestigateRequest):
try:
result = investigation.investigate_issues(
- investigate_request=investigate_request,
- dal=dal,
- config=config
+ investigate_request=investigate_request, dal=dal, config=config
)
return result
@@ -144,10 +145,13 @@ def workload_health_check(request: WorkloadHealthRequest):
request.ask = f"{request.ask}\n My instructions for the investigation '''{nl.join(instructions)}'''"
global_instructions = dal.get_global_instructions_for_account()
- request.ask = add_global_instructions_to_user_prompt(request.ask, global_instructions)
-
- system_prompt = load_and_render_prompt(request.prompt_template, context={'alerts': workload_alerts})
+ request.ask = add_global_instructions_to_user_prompt(
+ request.ask, global_instructions
+ )
+ system_prompt = load_and_render_prompt(
+ request.prompt_template, context={"alerts": workload_alerts}
+ )
ai = config.create_toolcalling_llm(dal=dal)
@@ -165,6 +169,29 @@ def workload_health_check(request: WorkloadHealthRequest):
raise HTTPException(status_code=401, detail=e.message)
+@app.post("/api/workload_health_chat")
+def workload_health_conversation(
+ workload_health_chat_request: WorkloadHealthChatRequest,
+):
+ try:
+ load_robusta_api_key(dal=dal, config=config)
+ ai = config.create_toolcalling_llm(dal=dal)
+ global_instructions = dal.get_global_instructions_for_account()
+
+ messages = build_workload_health_chat_messages(
+ workload_health_chat_request, ai, global_instructions
+ )
+ llm_call = ai.messages_call(messages=messages)
+
+ return ChatResponse(
+ analysis=llm_call.result,
+ tool_calls=llm_call.tool_calls,
+ conversation_history=llm_call.messages,
+ )
+ except AuthenticationError as e:
+ raise HTTPException(status_code=401, detail=e.message)
+
+
# older api that does not support conversation history
@app.post("/api/conversation")
def issue_conversation_deprecated(conversation_request: ConversationRequest):
@@ -191,7 +218,9 @@ def issue_conversation(issue_chat_request: IssueChatRequest):
ai = config.create_toolcalling_llm(dal=dal)
global_instructions = dal.get_global_instructions_for_account()
- messages = build_issue_chat_messages(issue_chat_request, ai, global_instructions)
+ messages = build_issue_chat_messages(
+ issue_chat_request, ai, global_instructions
+ )
llm_call = ai.messages_call(messages=messages)
return ChatResponse(
@@ -212,7 +241,10 @@ def chat(chat_request: ChatRequest):
global_instructions = dal.get_global_instructions_for_account()
messages = build_chat_messages(
- chat_request.ask, chat_request.conversation_history, ai=ai, global_instructions=global_instructions
+ chat_request.ask,
+ chat_request.conversation_history,
+ ai=ai,
+ global_instructions=global_instructions,
)
llm_call = ai.messages_call(messages=messages)
@@ -232,6 +264,10 @@ def get_model():
if __name__ == "__main__":
log_config = uvicorn.config.LOGGING_CONFIG
- log_config["formatters"]["access"]["fmt"] = "%(asctime)s %(levelname)-8s %(message)s"
- log_config["formatters"]["default"]["fmt"] = "%(asctime)s %(levelname)-8s %(message)s"
+ log_config["formatters"]["access"]["fmt"] = (
+ "%(asctime)s %(levelname)-8s %(message)s"
+ )
+ log_config["formatters"]["default"]["fmt"] = (
+ "%(asctime)s %(levelname)-8s %(message)s"
+ )
uvicorn.run(app, host=HOLMES_HOST, port=HOLMES_PORT, log_config=log_config)
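
For illustration, a minimal sketch of calling the new /api/workload_health_chat endpoint; the host, port, and request body are assumptions (WorkloadHealthChatRequest's schema is not shown in this diff), while the analysis field comes from ChatResponse above:

import requests

resp = requests.post(
    "http://localhost:8000/api/workload_health_chat",          # assumed host/port
    json={"ask": "Why is the checkout deployment degraded?"},  # hypothetical request field
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["analysis"])
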
diff --git a/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/Dockerfile b/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/Dockerfile
index 754163c..f932d01 100644
--- a/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/Dockerfile
+++ b/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/Dockerfile
@@ -17,4 +17,3 @@ EXPOSE 8000 8001
# Run the FastAPI app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
-
diff --git a/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/app.py b/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/app.py
index da618db..4a88c70 100644
--- a/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/app.py
+++ b/tests/llm/fixtures/test_ask_holmes/07_high_latency/helm/app.py
@@ -1,15 +1,15 @@
+# ruff: noqa: F821
import os
import logging
import time
-from fastapi import FastAPI, Request
+from fastapi import FastAPI
from fastapi.responses import HTMLResponse
-from sqlalchemy import create_engine, text
from prometheus_fastapi_instrumentator import Instrumentator
-
-app = FastAPI()
from random import randint
from time import sleep
+app = FastAPI()
+
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -27,26 +27,21 @@
# Add Prometheus middleware
Instrumentator().instrument(app).expose(app)
+
def check_promotional_notifications():
- logger.info("Connecting to promotions database to see if we should try to upsell user")
+ logger.info(
+ "Connecting to promotions database to see if we should try to upsell user"
+ )
try:
logger.info(f"Connecting to database at {DB_HOST}")
start_time = time.time()
logger.info(f"Fetching data using stored procedure: {STORED_PROCEDURE}")
# Execute the stored procedure
#
- sleep(randint(5,10))
+ sleep(randint(5, 10))
# Fetch the result
- result = [
- (
- True,
- {
- "type": "notification",
- "discount": "$15"
- }
- )
- ]
+ result = [(True, {"type": "notification", "discount": "$15"})]
end_time = time.time()
logger.info(f"Database call completed in {end_time - start_time:.2f} seconds.")
for row in result:
@@ -57,6 +52,7 @@ def check_promotional_notifications():
logger.error(f"Error checking for promotions: {e}")
return False
+
@app.get("/", response_class=HTMLResponse)
def read_root():
logger.info("Received request for checkout page.")
@@ -76,6 +72,7 @@ def read_root():