Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into app-diagnose-chat
Browse files Browse the repository at this point in the history
  • Loading branch information
itisallgood committed Jan 30, 2025
2 parents bb15b40 + 99ac32c commit 9d3be03
Show file tree
Hide file tree
Showing 15 changed files with 187 additions and 105 deletions.
32 changes: 15 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -631,31 +631,31 @@ Using Grafana Loki

HolmesGPT can consult logs from [Loki](https://grafana.com/oss/loki/) by proxying through a [Grafana](https://grafana.com/oss/grafana/) instance.

There are 2 parts to configuring access to Grafana Loki: Access/Authentication and search terms.
To configure loki toolset:

For access and authentication, add the following environment variables:

* `GRAFANA_URL` - e.g. https://my-org.grafana.net
* `GRAFANA_API_KEY` - e.g. glsa_bsm6ZS_sdfs25f
```yaml
toolsets:
grafana/loki:
enabled: true
config:
api_key: "{{ env.GRAFANA_API_KEY }}"
url: "http://loki-url"
```
For search terms, you can optionally tweak the search terms used by the toolset.
This is done by appending the following to your Holmes configuration file:
This is done by appending the following to your Holmes grafana/loki configuration:
```yaml
grafana:
url: https://my-org.grafana.net #
api_key: glsa_bsm6ZS_sdfs25f
loki:
pod_name_search_key: "pod"
namespace_search_key: "namespace"
node_name_search_key: "node"
pod_name_search_key: "pod"
namespace_search_key: "namespace"
node_name_search_key: "node"
```
> You only need to tweak the configuration file if your Loki log settings for pod, namespace and node differ from the above defaults.
The Loki toolset is configured using the same Grafana settings as the Grafana Tempo toolset.
</details>
<details>
<summary>
Using Grafana Tempo
</summary>
Expand All @@ -664,8 +664,6 @@ HolmesGPT can fetch trace information from Grafana Tempo to debug performance re
Tempo is configured using the same Grafana settings as the Grafana Loki toolset.
grafana:
url: https://my-org.grafana.net #
</details>
Expand Down Expand Up @@ -875,7 +873,7 @@ Configure Slack to send notifications to specific channels. Provide your Slack t
<summary>OpenSearch Integration</summary>

The OpenSearch toolset (`opensearch`) allows Holmes to consult an opensearch cluster for its health, settings and shards information.
The toolset supports multiple opensearch or elasticsearch clusters that are configured by editing Holmes' configuration file (or in cluster to the configuration secret):
The toolset supports multiple opensearch or elasticsearch clusters that are configured by editing Holmes' configuration file:

```
opensearch_clusters:
Expand Down
11 changes: 6 additions & 5 deletions holmes/core/investigation.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@

from typing import Optional
from rich.console import Console
from holmes.common.env_vars import HOLMES_POST_PROCESSING_PROMPT
from holmes.config import Config
from holmes.core.investigation_structured_output import process_response_into_sections
from holmes.core.issue import Issue
from holmes.core.models import InvestigateRequest, InvestigationResult
from holmes.core.supabase_dal import SupabaseDal
Expand Down Expand Up @@ -36,13 +35,15 @@ def investigate_issues(investigate_request: InvestigateRequest, dal: SupabaseDal
issue,
prompt=investigate_request.prompt_template,
post_processing_prompt=HOLMES_POST_PROCESSING_PROMPT,
sections=investigate_request.sections,
instructions=resource_instructions,
global_instructions=global_instructions
)

(text_response, sections) = process_response_into_sections(investigation.result)

return InvestigationResult(
analysis=investigation.result,
sections=investigation.sections,
analysis=text_response,
sections=sections,
tool_calls=investigation.tool_calls or [],
instructions=investigation.instructions,
)
59 changes: 44 additions & 15 deletions holmes/core/investigation_structured_output.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,25 @@
from typing import Any, Dict
from typing import Any, Dict, Optional, Tuple, Union
import json

DEFAULT_SECTIONS = {
from pydantic import RootModel

InputSectionsDataType = Dict[str, str]

OutputSectionsDataType = Optional[Dict[str, Union[str, None]]]

SectionsData = RootModel[OutputSectionsDataType]

DEFAULT_SECTIONS:InputSectionsDataType = {
"Alert Explanation": "1-2 sentences explaining the alert itself - note don't say \"The alert indicates a warning event related to a Kubernetes pod doing blah\" rather just say \"The pod XYZ did blah\" because that is what the user actually cares about",
"Investigation": "what you checked and found",
"Conclusions and Possible Root causes": "what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains",
"Next Steps": "what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)",
"Investigation": "What you checked and found",
"Conclusions and Possible Root causes": "What conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains. Don't say root cause but 'possible root causes'. Be clear to distinguish between what you know for certain and what is a possible explanation",
"Next Steps": "What you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)",
"Related logs": "Truncate and share the most relevant logs, especially if these explain the root cause. For example: \nLogs from pod robusta-holmes:\n```\n<logs>```\n. Always embed the surroundding +/- 5 log lines to any relevant logs. ",
"App or Infra?": "Explain whether the issue is more likely an infrastructure or an application level issue and why you think that.",
"External links": "Provide links to external sources. Where to look when investigating this issue. For example provide links to relevant runbooks, etc. Add a short sentence describing each link."
}

def get_output_format_for_investigation(sections: Dict[str, str]) -> Dict[str, Any]:
def get_output_format_for_investigation(sections: InputSectionsDataType) -> Dict[str, Any]:

properties = {}
required_fields = []
Expand All @@ -34,12 +43,32 @@ def get_output_format_for_investigation(sections: Dict[str, str]) -> Dict[str, A

return output_format

def combine_sections(sections: Any) -> str:
if isinstance(sections, dict):
content = ''
for section_title, section_content in sections.items():
if section_content:
# content = content + f'\n# {" ".join(section_title.split("_")).title()}\n{section_content}'
content = content + f'\n# {section_title}\n{section_content}\n'
return content
return f"{sections}"
def combine_sections(sections: Dict) -> str:
    """Render a sections mapping as markdown.

    Each non-empty section becomes a '# <title>' heading followed by its
    content; sections with falsy content (None, empty string) are skipped.
    """
    rendered = [
        f'\n# {title}\n{body}\n'
        for title, body in sections.items()
        if body
    ]
    return ''.join(rendered)


def process_response_into_sections(response: Any) -> Tuple[str, OutputSectionsDataType]:
    """Attempt to interpret an LLM response as structured sections.

    Returns a tuple of (markdown_text, sections): when the response parses and
    validates as a sections mapping, the sections are combined into markdown
    and returned alongside the mapping; otherwise the (stringified) response
    is returned unchanged with None for the sections.
    """
    # Normalize to a JSON string first: even an already-structured dict is
    # re-serialized so it goes through the same validation path below, and
    # any other type is stringified so parsing can be attempted.
    if isinstance(response, dict):
        response = json.dumps(response)
    elif not isinstance(response, str):
        response = str(response)

    try:
        parsed = json.loads(response)
        # TODO: coercing dict values into strings would make this more
        # resilient, as SectionsData only accepts None/str values
        validated = SectionsData(root=parsed).root
        if validated:
            return (combine_sections(validated), validated)
    except Exception:
        # Not valid JSON / not a valid sections mapping: fall through and
        # hand back the raw text.
        pass

    return (response, None)
3 changes: 2 additions & 1 deletion holmes/core/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from holmes.core.investigation_structured_output import InputSectionsDataType
from holmes.core.tool_calling_llm import ToolCallResult
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, model_validator
Expand All @@ -21,7 +22,7 @@ class InvestigateRequest(BaseModel):
include_tool_calls: bool = False
include_tool_call_results: bool = False
prompt_template: str = "builtin://generic_investigation.jinja2"
sections: Optional[Dict[str, str]] = None
sections: Optional[InputSectionsDataType] = None
# TODO in the future
# response_handler: ...

Expand Down
22 changes: 3 additions & 19 deletions holmes/core/tool_calling_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import textwrap
from typing import List, Optional, Dict, Type, Union
from holmes.core.investigation_structured_output import DEFAULT_SECTIONS, get_output_format_for_investigation, combine_sections
from holmes.core.investigation_structured_output import DEFAULT_SECTIONS, InputSectionsDataType, get_output_format_for_investigation
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.plugins.prompts import load_and_render_prompt
Expand All @@ -27,14 +27,11 @@ class ToolCallResult(BaseModel):
description: str
result: str


class LLMResult(BaseModel):
tool_calls: Optional[List[ToolCallResult]] = None
sections: Optional[Dict[str, Union[str, None]]] = None
result: Optional[str] = None
unprocessed_result: Optional[str] = None
instructions: List[str] = []

# TODO: clean up these two
prompt: Optional[str] = None
messages: Optional[List[dict]] = None
Expand Down Expand Up @@ -159,22 +156,12 @@ def call(

tools_to_call = getattr(response_message, "tool_calls", None)
text_response = response_message.content
sections:Optional[Dict[str, str]] = None
if isinstance(text_response, str):
try:
parsed_json = json.loads(text_response)
text_response = parsed_json
except json.JSONDecodeError:
pass
if not isinstance(text_response, str):
sections = text_response
text_response = combine_sections(sections)

if not tools_to_call:
# For chatty models post process and summarize the result
# this only works for calls where user prompt is explicitly passed through
if post_process_prompt and user_prompt:
logging.info(f"Running post processing on investigation.")
logging.info("Running post processing on investigation.")
raw_response = text_response
post_processed_response = self._post_processing_call(
prompt=user_prompt,
Expand All @@ -185,7 +172,6 @@ def call(
perf_timing.end()
return LLMResult(
result=post_processed_response,
sections=sections,
unprocessed_result=raw_response,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
Expand All @@ -195,7 +181,6 @@ def call(
perf_timing.end()
return LLMResult(
result=text_response,
sections=sections,
tool_calls=tool_calls,
prompt=json.dumps(messages, indent=2),
messages=messages,
Expand Down Expand Up @@ -231,7 +216,6 @@ def _invoke_tool(
logging.warning(
f"Failed to parse arguments for tool: {tool_name}. args: {tool_to_call.function.arguments}"
)

tool_call_id = tool_to_call.id
tool = self.tool_executor.get_tool_by_name(tool_name)

Expand Down Expand Up @@ -358,7 +342,7 @@ def investigate(
console: Optional[Console] = None,
global_instructions: Optional[Instructions] = None,
post_processing_prompt: Optional[str] = None,
sections: Optional[Dict[str, str]] = None
sections: Optional[InputSectionsDataType] = None
) -> LLMResult:
runbooks = self.runbook_manager.get_instructions_for_issue(issue)

Expand Down
3 changes: 3 additions & 0 deletions holmes/core/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,9 @@ def check_prerequisites(self):

self._status = ToolsetStatusEnum.ENABLED

def get_example_config(self) -> Dict[str, Any]:
    """Return an example configuration for this toolset.

    The base implementation has no configurable options, so it returns an
    empty mapping; subclasses override this to surface a template config.
    """
    return dict()


class YAMLToolset(Toolset):
tools: List[YAMLTool]
Expand Down
1 change: 0 additions & 1 deletion holmes/plugins/prompts/_general_instructions.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ In general:
* in this case, try to find substrings or search for the correct spellings
* always provide detailed information like exact resource names, versions, labels, etc
* even if you found the root cause, keep investigating to find other possible root causes and to gather data for the answer like exact names
* when giving an answer don't say root cause but "possible root causes" and be clear to distinguish between what you know for certain and what is a possible explanation
* if a runbook url is present as well as tool that can fetch it, you MUST fetch the runbook before beginning your investigation.
* if you don't know, say that the analysis was inconclusive.
* if there are multiple possible causes list them in a numbered list.
Expand Down
21 changes: 16 additions & 5 deletions holmes/plugins/toolsets/grafana/base_grafana_toolset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import Any
from typing import Any, ClassVar, Type
from holmes.core.tools import (
Tool,
Toolset,
Expand All @@ -11,17 +11,21 @@


class BaseGrafanaToolset(Toolset):
def __init__(self, name: str, description: str, icon_url: str, tools: list[Tool]):
config_class: ClassVar[Type[GrafanaConfig]] = GrafanaConfig

def __init__(self, name: str, description: str, icon_url: str, tools: list[Tool], doc_url: str):
super().__init__(
name=name,
description=description,
icon_url=icon_url,
docs_url=doc_url,
prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
tools=tools,
tags=[
ToolsetTag.CORE,
],
enabled=False
enabled=False,
is_default=True,
)

def prerequisites_callable(self, config: dict[str, Any]) -> bool:
Expand All @@ -30,10 +34,17 @@ def prerequisites_callable(self, config: dict[str, Any]) -> bool:
return False

try:
self._grafana_config = GrafanaConfig(**config)
is_healthy = get_health(self._grafana_config.url, self._grafana_config.api_key)
self._grafana_config = BaseGrafanaToolset.config_class(**config)
is_healthy = get_health(
self._grafana_config.url, self._grafana_config.api_key
)
return is_healthy

except Exception:
logging.exception("Failed to set up grafana toolset")
return False

def get_example_config(self):
    """Return an example Grafana configuration as a plain dict.

    Uses placeholder values for the API key and URL; presumably shown to
    users as a configuration template — confirm with callers.
    """
    example_config = GrafanaConfig(api_key="YOUR API KEY", url="YOUR GRAFANA URL")
    # model_dump() serializes the pydantic model into a plain dict
    return example_config.model_dump()

13 changes: 1 addition & 12 deletions holmes/plugins/toolsets/grafana/common.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,12 @@
from typing import Dict, Optional, Union
import uuid
import time
import os
from pydantic import BaseModel


GRAFANA_URL_ENV_NAME = "GRAFANA_URL"
GRAFANA_API_KEY_ENV_NAME = "GRAFANA_API_KEY"
ONE_HOUR_IN_SECONDS = 3600


class GrafanaLokiConfig(BaseModel):
pod_name_search_key: str = "pod"
namespace_search_key: str = "namespace"
node_name_search_key: str = "node"


class GrafanaConfig(BaseModel):
loki: GrafanaLokiConfig = GrafanaLokiConfig()
api_key: str
url: str

Expand Down Expand Up @@ -61,5 +50,5 @@ def get_datasource_id(dict: Dict, param: str) -> str:
return f"uid/{datasource_id}"
except:
pass

return datasource_id
Loading

0 comments on commit 9d3be03

Please sign in to comment.