- def _create_param_sets(self) -> list[UserSelections]:
- """Creates a random subset of the parameter space.
-
- Returns
- -------
- list[UserSelections]
- A random subset of the search space combinations.
- """
- param_sets: list[UserSelections] = []
-
- for (
- llm,
- embedding_model,
- filepath,
- loader,
- chunking_config,
- vector_store,
- similarity_top_k,
- ) in product(
- self._llms,
- self._embedding_models,
- self._files,
- self._loaders,
- self._chunking_configs,
- self._vector_stores,
- self._similarity_top_k,
- ):
- base_selections = {
- "llm": llm,
- "embedding_model": embedding_model,
- "filename": os.path.basename(str(filepath)),
- "filepath": filepath,
- "vector_store": vector_store,
- "loader": loader,
- "mode": "production",
- "similarity_top_k": similarity_top_k,
- "chunking_config": chunking_config,
- }
-
- if self._git_data is None:
- base_selections["git_data"] = None
- else:
- for git_data in self._git_data:
- if git_data["filename"] == filepath or git_data[
- "filename"
- ] == os.path.basename(str(filepath)):
- base_selections["git_data"] = create_git_data(
- user=git_data["git_info"]["user"],
- repo=git_data["git_info"]["repo"],
- branch=git_data["git_info"]["branch"],
- filters=git_data["git_info"]["filters"],
- )
- user_selections = create_user_selections(
- base_selections["llm"],
- base_selections["embedding_model"],
- base_selections["filename"],
- base_selections["filepath"],
- base_selections["vector_store"],
- base_selections["loader"],
- base_selections["mode"],
- base_selections["similarity_top_k"],
- base_selections["chunking_config"],
- base_selections["git_data"],
- )
- param_sets.append(user_selections)
-
- if self.subset_size > len(param_sets):
- self.subset_size = len(param_sets)
-
- param_subset = random.sample(param_sets, self.subset_size)
-
- return param_subset
+ def _create_param_sets(self) -> list[UserSelections]:
+ """Creates a random subset of the parameter space.
+
+ Returns
+ -------
+ list[UserSelections]
+ A random subset of the search space combinations.
+ """
+ param_sets: list[UserSelections] = []
+
+ for (
+ llm,
+ embedding_model,
+ filepath,
+ loader,
+ chunking_config,
+ vector_store,
+ similarity_top_k,
+ ) in product(
+ self._llms,
+ self._embedding_models,
+ self._files,
+ self._loaders,
+ self._chunking_configs,
+ self._vector_stores,
+ self._similarity_top_k,
+ ):
+ base_selections = {
+ "llm": llm,
+ "embedding_model": embedding_model,
+ "filename": os.path.basename(str(filepath)),
+ "filepath": filepath,
+ "vector_store": vector_store,
+ "loader": loader,
+ "mode": "production",
+ "similarity_top_k": similarity_top_k,
+ "chunking_config": chunking_config,
+ }
+
+ if self._git_data is None:
+ base_selections["git_data"] = None
+ else:
+ for git_data in self._git_data:
+ if git_data["filename"] == filepath or git_data[
+ "filename"
+ ] == os.path.basename(str(filepath)):
+ base_selections["git_data"] = create_git_data(
+ user=git_data["git_info"]["user"],
+ repo=git_data["git_info"]["repo"],
+ branch=git_data["git_info"]["branch"],
+ filters=git_data["git_info"]["filters"],
+ )
+
+ if self._other_docs is None:
+ base_selections["other_docs"] = None
+ else:
+ for paper, other_docs in self._other_docs.items():
+ if paper == os.path.basename(str(filepath)):
+ base_selections["other_docs"] = other_docs
+
+ user_selections = create_user_selections(
+ base_selections["llm"],
+ base_selections["embedding_model"],
+ base_selections["filename"],
+ base_selections["filepath"],
+ base_selections["vector_store"],
+ base_selections["loader"],
+ base_selections["mode"],
+ base_selections["similarity_top_k"],
+ base_selections["chunking_config"],
+ base_selections["git_data"],
+ base_selections["other_docs"],
+ )
+ param_sets.append(user_selections)
+
+ if self.subset_size > len(param_sets):
+ self.subset_size = len(param_sets)
+
+ param_subset = random.sample(param_sets, self.subset_size)
+
+ return param_subset
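For context, the method enumerates the full Cartesian product of the configured options and then draws a random subset of those combinations. The snippet below is a minimal, self-contained sketch of that sampling pattern only; the option lists and subset size are hypothetical and do not reflect the project's real configuration values.

    import random
    from itertools import product

    # Hypothetical option lists; the real values come from the user's config.
    llms = ["gpt-4o-mini", "gpt-4o"]
    chunking_configs = ["256 chunk size/20 chunk overlap", "1024 chunk size/100 chunk overlap"]
    similarity_top_ks = [3, 5]

    # Enumerate every combination, then draw a random subset.
    param_sets = [
        {"llm": llm, "chunking_config": chunking, "similarity_top_k": top_k}
        for llm, chunking, top_k in product(llms, chunking_configs, similarity_top_ks)
    ]

    subset_size = min(4, len(param_sets))  # clamp the subset size, as the method above does
    param_subset = random.sample(param_sets, subset_size)
    print(param_subset)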
diff --git a/reference-frame/index.html b/reference-frame/index.html
index 02abb3b..e32de9d 100644
--- a/reference-frame/index.html
+++ b/reference-frame/index.html
@@ -217,7 +217,7 @@
- Setup
+ Setup and Quickstart
diff --git a/score-frame/index.html b/score-frame/index.html
index d638b0e..583502e 100644
--- a/score-frame/index.html
+++ b/score-frame/index.html
@@ -217,7 +217,7 @@
- Setup
+ Setup and Quickstart
diff --git a/search/search_index.json b/search/search_index.json
index 7a892c0..df4edb5 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Biocompute Object Retrieval-Augmented Generation Assistant","text":""},{"location":"#background","title":"Background","text":"The BioCompute Object (BCO) project is a community-driven open standards framework for standardizing and sharing computations and analyses. With the exponential increase in both the quantity and complexity of biological data and the workflows used to analyze and transform the data, the need for standardization in documentation is imperative for experimental preservation, transparency, accuracy, and reproducability. As with any documentation standard, the main hurdles to continued adoption are the overhead required to maintain the quality and accuracy of a BCO in parallel as the research evolves over time and retroactively documenting pre-existing research. With the recent improvements in large language models (LLMs), the feasibility and utility of an automated BCO creation assistant is an intriguing use case. "},{"location":"#goal","title":"Goal","text":"The goal of this project is to reduce the overhead required in retroactively documenting pre-existing research. By using the Biocompute RAG assistant, you can seamlessly convert existing publications on previous research to be BCO compliant. "},{"location":"#approach-justification","title":"Approach Justification","text":"This BioCompute Object (BCO) assistant will assist users in automatically creating specific BCO domains from user uploaded papers. This proof of concept uses a Retrieval Augmented Generation (RAG) approach rather than a standalone (or fine-tuned) LLM. Our use case is somewhat antithetical to what LLMs were originally designed for. LLMs were designed for creative, free text responses that represent plausible natural language. BCOs, on the other hand, were designed for deterministic, accurate, non-ambiguous, and reproduceable documentation. Given this, there are two main initial questions that have to be considered: - Current LLMs are often categorized as \"stochastic parrots\" that have no underlying understanding of text structure, only generating what are considered plausible natural language respones. How well could LLMs consistently produce structured, schema compliant JSON responses (regardless of the actual output content)?
- Unlike traditional LLM use cases, the goal of BCOs as described above, are not to generate creative and original output. How can we constrain the LLM to limit creativity, extrapolation, and potentially subjective output?
Given these considerations and our use case, a traditional standalone LLM suffers from multiple drawbacks in our problem context. "},{"location":"#issues-with-long-context-windows","title":"Issues with Long Context Windows","text":"Recent studies (Lost in the Middle) have shown that LLMs can struggle with long contexts: ... performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts. In particular, we observe that performance is often highest when relevant information occurs at the beginning or end of the input context, and significantly degrades when models must access relevant information in the middle of long contexts, even for explicitly long-context models. This issue is particularly important for our expected user workflow. If a user uploads a particularly long paper, ingesting the entire paper as part of our context window will likely result in significantly variant output quality on a per domain basis. For example, the usability domain information is usually contained in the paper abstract, which is usually at the beginning of the paper and as a result, will be earlier in the context window. In this case, the generated usability domain is more likely to contain quality information whereas the description domain captures specific workflow information that is usually contained in the middle of a paper. In this case, if the required information is on page 5 of a 10 page paper, we can expect lower quality information in the generated description domain. The RAG will help ensure that our context window stays manageable, by avoiding complete ingestion of the paper in one-shot. Instead, the paper will be indexed and intelligently queried prior to each prompt to ensure our context window is manageable, precise, and relevant. "},{"location":"#training-data-and-false-extrapolation","title":"Training Data and False Extrapolation","text":"LLMs are also highly sensitive to the quality of the training data. A study from Microsoft Research titled Textbooks Are All You Need demonstrated the impact of high-quality training data in output quality, specifically with regard to proficiency in code-generation tasks. By crafting \"textbook quality\" data we were able to train a model that surpasses almost all open-source models on coding benchmarks such as HumanEval and MBPP despite being 10x smaller in model size and 100x smaller in dataset size. When explicit facts aren't availble, standalone LLMs can extrapolate fabricated outputs resulting in confident, but false output. Since we are leveraging existing pre-trained LLMs and do not have the resources to control the training data specificity and quality, we can leverage a RAG framework to supplement our requests with up-to-date, accurate, and relevant information. Rather than relying on the LLM to extrapolate itself, we can supply it with the exact related information it needs to parse, format, and summarize. "},{"location":"aggregator/","title":"In-Progress Documentation","text":"Handles the in progress documentation generator. "},{"location":"aggregator/#aggregator.aggregator.Aggregator","title":"Aggregator ","text":"Classs to handle the in progress documentation of a repository. Processes the work done so far in a code repository and generates plain text documentation on the project that resembles a plain text Biocompute Object. Attributes: Name Type Description path str Path to the directory to process. 
include str Comma delimited list of glob patterns to include in processing. exclude str Comma delimited list of glob patterns to exclude in processing. include_priority bool Determines whether to prioritize the include or exclude pattern in the case that include and exclude patterns conflict. exclude_from_tree bool Whether to exclude excluded files from the source tree path for prompt generation. client OpenAI OpenAI API client. encoding Encoding The encoding for the LLM. Source code in aggregator/aggregator.py class Aggregator:\n \"\"\"Classs to handle the in progress documentation of a repository. Processes the work done so far in a\n code repository and generates plain text documentation on the project that resembles a plain text Biocompute\n Object.\n\n Attributes\n ----------\n path: str\n Path to the directory to process.\n include: str\n Comma delimited list of glob patterns to include in processing.\n exclude: str\n Comma delimited list of glob patterns to exclude in processing.\n include_priority : bool\n Determines whether to prioritize the include or exclude pattern\n in the case that include and exclude patterns conflict.\n exclude_from_tree : bool\n Whether to exclude excluded files from the source tree path for\n prompt generation.\n client : OpenAI\n OpenAI API client.\n encoding : Encoding\n The encoding for the LLM.\n \"\"\"\n\n def __init__(\n self,\n path: str,\n include: Optional[str],\n exclude: Optional[str],\n include_priority: bool = False,\n exclude_from_tree: bool = False,\n ):\n \"\"\"Constructor.\n\n Parameters\n ----------\n path: str\n Path to the directory to process.\n include: str\n Comma delimited list of glob patterns to include in processing.\n exclude: str\n Comma delimited list of glob patterns to exclude in processing.\n include_priority : bool, optional\n Determines whether to prioritize the include or exclude pattern\n in the case that include and exclude patterns conflict.\n exclude_from_tree : bool, optional\n Whether to exclude excluded files from the source tree path for\n prompt generation.\n \"\"\"\n load_dotenv()\n self.path = path\n self.include = include\n self.exclude = exclude\n self.include_priority = include_priority\n self.exclude_from_tree = exclude_from_tree\n self.client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n self.encoding = tiktoken.encoding_for_model(MODEL)\n\n host_os = platform.system().lower()\n machine = platform.machine().lower()\n if host_os not in BINARY_MAP:\n graceful_exit(1, f\"OS `{os}` not supported.\")\n if machine not in BINARY_MAP[host_os]:\n graceful_exit(1, f\"{os} architecture for `{machine}` not supported.\")\n\n self._binary_path = os.path.join(\n os.path.dirname(__file__), \"binaries\", BINARY_MAP[host_os][machine]\n )\n\n def get_prompt(self) -> str:\n \"\"\"Calls the codeprompt binary and generates the LLM prompt.\"\"\"\n cmd = [self._binary_path, self.path]\n\n if self.include:\n cmd.extend([\"--include\", f\"{self.include}\"])\n if self.exclude:\n cmd.extend([\"--exclude\", f\"{self.exclude}\"])\n if self.include_priority:\n cmd.append(\"--include-priority\")\n if self.exclude_from_tree:\n cmd.extend([\"--exclude-from-tree\"])\n cmd.extend([\"--output\", PROMPT_PATH])\n cmd.extend([\"-t\", os.path.join(os.path.dirname(__file__), \"template.hbs\")])\n cmd.append(\"--no-clipboard\")\n cmd.append(\"--spinner\")\n cmd.append(\"--line-numbers\")\n cmd.append(\"--tokens\")\n\n try:\n result = subprocess.run(cmd, capture_output=True, text=True, check=True)\n return result.stdout\n except 
subprocess.CalledProcessError as e:\n error_msg = (\n f\"Command '{e.cmd}' returned non-zero exit status {e.returncode}.\"\n )\n error_msg += f\"\\nError output:\\n{e.stderr}\"\n except Exception as e:\n error_msg = f\"Unexpected error in generating prompt.\\n{e}\"\n graceful_exit(1, error_msg)\n\n def generate_summary(self) -> None:\n \"\"\"Entry point for generating the LLM documentation.\"\"\"\n if not os.path.isfile(PROMPT_PATH):\n graceful_exit(1, f\"No prompt found at `{PROMPT_PATH}`.\")\n with open(PROMPT_PATH, \"r\") as f:\n prompt = f.read()\n\n tokens = self._count_tokens(prompt)\n token_count = len(tokens)\n print(f\"Total prompt token count: {token_count}\")\n\n if token_count <= MAX_TOKENS:\n response = self._process_prompt(prompt)\n else:\n print(\n f\"Warning: Prompt size exceeds the max tokens limit ({MAX_TOKENS}), response will still be generated but will likely be somewhat degraded in quality. Consider limiting the include patterns.\"\n )\n chunks = self._split_prompt(tokens, token_count)\n responses = self._process_chunks(chunks)\n response = self._combine_responses(responses)\n\n self._write_output(response)\n\n def _process_prompt(self, prompt: str) -> str:\n \"\"\"Process a single prompt using the OpenAI API.\n\n Parameters\n ----------\n prompt : str\n The prompt to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary.\n \"\"\"\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": SYSTEM_PROMPT,\n },\n {\"role\": \"user\", \"content\": prompt},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n\n except Exception as e:\n error_msg = f\"Unexpected error in generating summary.\\n{e}\"\n graceful_exit(1, error_msg)\n\n return response_txt\n\n def _split_prompt(self, tokens: list[int], token_count: int) -> list[str]:\n \"\"\"Split a large prompt into smaller chunks that fit within the token limit.\n\n Parameters\n ----------\n\n Returns\n -------\n list[str]\n A list of prompt chunks, each within the token limit.\n \"\"\"\n print(\"Splitting prompt...\")\n chunks = []\n\n start = 0\n while start < token_count:\n end = min(start + MAX_TOKENS, token_count)\n if end < token_count:\n split_range = max(10, int(MAX_TOKENS * 0.1))\n for i in range(end, end - split_range, -1):\n if tokens[i] == self.encoding.encode(\"\\n\")[0]:\n end = i + 1\n break\n\n chunk_tokens = tokens[start:end]\n chunks.append(self.encoding.decode(chunk_tokens))\n start = end\n\n print(f\"Split into {len(chunks)} chunks\")\n return chunks\n\n def _process_chunks(self, chunks: list[str]) -> list[str]:\n \"\"\"Process multiple prompt chunks and combine their responses.\n\n Parameters\n ----------\n chunks : list[str]\n A list of prompt chunks to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary for any chunk.\n \"\"\"\n responses: list[str] = []\n for i, chunk in enumerate(chunks):\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": f\"{SYSTEM_PROMPT} This is part {i + 1} of {len(chunks)}.\",\n },\n {\"role\": \"user\", \"content\": chunk},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n responses.append(response_txt)\n except Exception as e:\n graceful_exit(\n 1, f\"Unexpected error in generating summary 
for chunk {i + 1}.\\n{e}\"\n )\n return responses\n\n def _combine_responses(self, responses: list[str]) -> str:\n combine_prompt = f\"\"\"\n You are tasked with combining multiple responses into cohesive BioCompute Object-like (BCO) documentation. \n The BCO-like plain text documentation should include the following domains:\n - Usability Domain\n - IO Domain\n - Description Domain\n - Execution Domain\n - Parametric Domain\n - Error Domain\n\n Here are the responses to combine:\n\n {' '.join(responses)}\n\n Please structure the information into a single, coherent BCO documentation, ensuring that:\n 1. All relevant information from the responses is included.\n 2. The information is organized under the appropriate BCO domains.\n 3. Any redundant information is removed.\n 4. The final document flows logically and reads cohesively.\n 5. If specific information for a domain isn't available, mention that in the respective section.\n\n Format the output as markdown, with each domain as a second-level header (##).\n \"\"\"\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n {\"role\": \"user\", \"content\": combine_prompt},\n ],\n )\n return (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n except Exception as e:\n graceful_exit(1, f\"{e}\\nUnexpected error in combining responses.\")\n\n def _count_tokens(self, text: str) -> list[int]:\n \"\"\"Count the number of tokens in the given text.\n\n Parameters\n ----------\n text : str\n The text to count tokens for.\n\n Returns\n -------\n list[int]\n The number of tokens in each line of the text.\n \"\"\"\n return self.encoding.encode(text)\n\n def _write_output(self, content: str) -> None:\n with open(OUTPUT_PATH, \"w\") as out_file:\n out_file.write(content)\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator.__init__","title":"__init__(path, include, exclude, include_priority=False, exclude_from_tree=False) ","text":"Constructor. Parameters: Name Type Description Default path str Path to the directory to process. required include Optional[str] Comma delimited list of glob patterns to include in processing. required exclude Optional[str] Comma delimited list of glob patterns to exclude in processing. required include_priority bool Determines whether to prioritize the include or exclude pattern in the case that include and exclude patterns conflict. False exclude_from_tree bool Whether to exclude excluded files from the source tree path for prompt generation. 
False Source code in aggregator/aggregator.py def __init__(\n self,\n path: str,\n include: Optional[str],\n exclude: Optional[str],\n include_priority: bool = False,\n exclude_from_tree: bool = False,\n):\n \"\"\"Constructor.\n\n Parameters\n ----------\n path: str\n Path to the directory to process.\n include: str\n Comma delimited list of glob patterns to include in processing.\n exclude: str\n Comma delimited list of glob patterns to exclude in processing.\n include_priority : bool, optional\n Determines whether to prioritize the include or exclude pattern\n in the case that include and exclude patterns conflict.\n exclude_from_tree : bool, optional\n Whether to exclude excluded files from the source tree path for\n prompt generation.\n \"\"\"\n load_dotenv()\n self.path = path\n self.include = include\n self.exclude = exclude\n self.include_priority = include_priority\n self.exclude_from_tree = exclude_from_tree\n self.client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n self.encoding = tiktoken.encoding_for_model(MODEL)\n\n host_os = platform.system().lower()\n machine = platform.machine().lower()\n if host_os not in BINARY_MAP:\n graceful_exit(1, f\"OS `{os}` not supported.\")\n if machine not in BINARY_MAP[host_os]:\n graceful_exit(1, f\"{os} architecture for `{machine}` not supported.\")\n\n self._binary_path = os.path.join(\n os.path.dirname(__file__), \"binaries\", BINARY_MAP[host_os][machine]\n )\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator.get_prompt","title":"get_prompt() ","text":"Calls the codeprompt binary and generates the LLM prompt. Source code in aggregator/aggregator.py def get_prompt(self) -> str:\n \"\"\"Calls the codeprompt binary and generates the LLM prompt.\"\"\"\n cmd = [self._binary_path, self.path]\n\n if self.include:\n cmd.extend([\"--include\", f\"{self.include}\"])\n if self.exclude:\n cmd.extend([\"--exclude\", f\"{self.exclude}\"])\n if self.include_priority:\n cmd.append(\"--include-priority\")\n if self.exclude_from_tree:\n cmd.extend([\"--exclude-from-tree\"])\n cmd.extend([\"--output\", PROMPT_PATH])\n cmd.extend([\"-t\", os.path.join(os.path.dirname(__file__), \"template.hbs\")])\n cmd.append(\"--no-clipboard\")\n cmd.append(\"--spinner\")\n cmd.append(\"--line-numbers\")\n cmd.append(\"--tokens\")\n\n try:\n result = subprocess.run(cmd, capture_output=True, text=True, check=True)\n return result.stdout\n except subprocess.CalledProcessError as e:\n error_msg = (\n f\"Command '{e.cmd}' returned non-zero exit status {e.returncode}.\"\n )\n error_msg += f\"\\nError output:\\n{e.stderr}\"\n except Exception as e:\n error_msg = f\"Unexpected error in generating prompt.\\n{e}\"\n graceful_exit(1, error_msg)\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator.generate_summary","title":"generate_summary() ","text":"Entry point for generating the LLM documentation. Source code in aggregator/aggregator.py def generate_summary(self) -> None:\n \"\"\"Entry point for generating the LLM documentation.\"\"\"\n if not os.path.isfile(PROMPT_PATH):\n graceful_exit(1, f\"No prompt found at `{PROMPT_PATH}`.\")\n with open(PROMPT_PATH, \"r\") as f:\n prompt = f.read()\n\n tokens = self._count_tokens(prompt)\n token_count = len(tokens)\n print(f\"Total prompt token count: {token_count}\")\n\n if token_count <= MAX_TOKENS:\n response = self._process_prompt(prompt)\n else:\n print(\n f\"Warning: Prompt size exceeds the max tokens limit ({MAX_TOKENS}), response will still be generated but will likely be somewhat degraded in quality. 
Consider limiting the include patterns.\"\n )\n chunks = self._split_prompt(tokens, token_count)\n responses = self._process_chunks(chunks)\n response = self._combine_responses(responses)\n\n self._write_output(response)\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._process_prompt","title":"_process_prompt(prompt) ","text":"Process a single prompt using the OpenAI API. Parameters: Name Type Description Default prompt str The prompt to be processed. required Raises: Type Description Exception If there's an unexpected error in generating the summary. Source code in aggregator/aggregator.py def _process_prompt(self, prompt: str) -> str:\n \"\"\"Process a single prompt using the OpenAI API.\n\n Parameters\n ----------\n prompt : str\n The prompt to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary.\n \"\"\"\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": SYSTEM_PROMPT,\n },\n {\"role\": \"user\", \"content\": prompt},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n\n except Exception as e:\n error_msg = f\"Unexpected error in generating summary.\\n{e}\"\n graceful_exit(1, error_msg)\n\n return response_txt\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._split_prompt","title":"_split_prompt(tokens, token_count) ","text":"Split a large prompt into smaller chunks that fit within the token limit. Parameters: Name Type Description Default Returns required list A list of prompt chunks, each within the token limit. required Source code in aggregator/aggregator.py def _split_prompt(self, tokens: list[int], token_count: int) -> list[str]:\n \"\"\"Split a large prompt into smaller chunks that fit within the token limit.\n\n Parameters\n ----------\n\n Returns\n -------\n list[str]\n A list of prompt chunks, each within the token limit.\n \"\"\"\n print(\"Splitting prompt...\")\n chunks = []\n\n start = 0\n while start < token_count:\n end = min(start + MAX_TOKENS, token_count)\n if end < token_count:\n split_range = max(10, int(MAX_TOKENS * 0.1))\n for i in range(end, end - split_range, -1):\n if tokens[i] == self.encoding.encode(\"\\n\")[0]:\n end = i + 1\n break\n\n chunk_tokens = tokens[start:end]\n chunks.append(self.encoding.decode(chunk_tokens))\n start = end\n\n print(f\"Split into {len(chunks)} chunks\")\n return chunks\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._process_chunks","title":"_process_chunks(chunks) ","text":"Process multiple prompt chunks and combine their responses. Parameters: Name Type Description Default chunks list[str] A list of prompt chunks to be processed. required Raises: Type Description Exception If there's an unexpected error in generating the summary for any chunk. 
Source code in aggregator/aggregator.py def _process_chunks(self, chunks: list[str]) -> list[str]:\n \"\"\"Process multiple prompt chunks and combine their responses.\n\n Parameters\n ----------\n chunks : list[str]\n A list of prompt chunks to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary for any chunk.\n \"\"\"\n responses: list[str] = []\n for i, chunk in enumerate(chunks):\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": f\"{SYSTEM_PROMPT} This is part {i + 1} of {len(chunks)}.\",\n },\n {\"role\": \"user\", \"content\": chunk},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n responses.append(response_txt)\n except Exception as e:\n graceful_exit(\n 1, f\"Unexpected error in generating summary for chunk {i + 1}.\\n{e}\"\n )\n return responses\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._count_tokens","title":"_count_tokens(text) ","text":"Count the number of tokens in the given text. Parameters: Name Type Description Default text str The text to count tokens for. required Returns: Type Description list[int] The number of tokens in each line of the text. Source code in aggregator/aggregator.py def _count_tokens(self, text: str) -> list[int]:\n \"\"\"Count the number of tokens in the given text.\n\n Parameters\n ----------\n text : str\n The text to count tokens for.\n\n Returns\n -------\n list[int]\n The number of tokens in each line of the text.\n \"\"\"\n return self.encoding.encode(text)\n "},{"location":"app-start/","title":"App Start","text":"Handles the app initilization procedure. "},{"location":"app-start/#evaluator.backend.app_start.initialization","title":"initialization() ","text":"Handles the app initialization process. 
Source code in evaluator/backend/app_start.py def initialization() -> AppAttributes:\n \"\"\"Handles the app initialization process.\"\"\"\n _config_data = _load_config_data()\n if _config_data is None:\n misc_fns.graceful_exit(1, \"Error loading frontend configuration data.\")\n\n logger = misc_fns.setup_root_logger(\n log_path=_config_data[\"logger_path\"], name=_config_data[\"logger_name\"]\n )\n logger.info(\n \"################################## RUN START ##################################\"\n )\n\n _raw_directory_paths = glob(\n os.path.join(\n _config_data[\"generated_output_dir_path\"], _config_data[\"glob_pattern\"]\n )\n )\n directory_paths = [\n x\n for x in _raw_directory_paths\n if not any(y in x for y in _config_data[\"ignore_files\"])\n ]\n\n # load in existing evaluation data\n bco_results_data = misc_fns.load_json(\n os.path.join(\n _config_data[\"results_dir_path\"], _config_data[\"bco_results_file_name\"]\n )\n )\n user_results_data = misc_fns.load_json(\n os.path.join(\n _config_data[\"results_dir_path\"], _config_data[\"user_results_file_name\"]\n )\n )\n users_data = misc_fns.load_json(\n os.path.join(_config_data[\"results_dir_path\"], _config_data[\"users_file_name\"])\n )\n if bco_results_data is None or user_results_data is None or users_data is None:\n misc_fns.graceful_exit(1, \"Error loading results files.\")\n\n bco_results_data = _create_paper_keys(directory_paths, bco_results_data)\n\n app_attrs = create_app_attributes(\n logger=logger,\n results_dir_path=_config_data[\"results_dir_path\"],\n bco_results_file_name=_config_data[\"bco_results_file_name\"],\n bco_results_data=bco_results_data,\n user_results_file_name=_config_data[\"user_results_file_name\"],\n user_results_data=user_results_data,\n users_file_name=_config_data[\"users_file_name\"],\n users_data=users_data,\n generated_output_dir_root=_config_data[\"generated_output_dir_path\"],\n generated_directory_paths=directory_paths,\n padding=_config_data[\"padding\"],\n font=_config_data[\"font\"],\n )\n\n return app_attrs\n "},{"location":"app-start/#evaluator.backend.app_start.create_init_run_state","title":"create_init_run_state(app_state) ","text":"Creates the init run state. Parameters: Name Type Description Default app_state AppState The current app state. required Returns: Type Description RunState The intial run state. Source code in evaluator/backend/app_start.py def create_init_run_state(app_state: AppState) -> RunState:\n \"\"\"Creates the init run state.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n\n Returns\n -------\n RunState\n The intial run state.\n \"\"\"\n total_runs = _get_total_runs(app_state)\n run_state = load_run_state(run_index=0, total_runs=total_runs, app_state=app_state)\n return run_state\n "},{"location":"app-start/#evaluator.backend.app_start._get_total_runs","title":"_get_total_runs(app_state) ","text":"Get the total number of runs in the output directory. Parameters: Name Type Description Default app_state AppState The current app state. required Returns: Type Description int The number of total potential generated domains to evaluate. 
Source code in evaluator/backend/app_start.py def _get_total_runs(app_state: AppState) -> int:\n \"\"\"Get the total number of runs in the output directory.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n\n Returns\n -------\n int\n The number of total potential generated domains\n to evaluate.\n \"\"\"\n total_runs = 0\n for directory in app_state[\"generated_directory_paths\"]:\n output_map = misc_fns.load_json(os.path.join(directory, \"output_map.json\"))\n if output_map is None:\n misc_fns.graceful_exit(\n 1,\n f\"Error: Output map not found in directory `{directory}` while calculating total runs.\",\n )\n for domain in output_map:\n for domain_param_set in output_map[domain]:\n total_runs += len(domain_param_set[\"entries\"][\"runs\"])\n return total_runs\n "},{"location":"app-start/#evaluator.backend.app_start._create_paper_keys","title":"_create_paper_keys(directory_paths, bco_results_data) ","text":"Creates an entry for each paper in the evaluations file. Parameters: Name Type Description Default directory_paths list[str] Path to the generated BCO directories. required bco_results_data dict The loaded BCO evaluations results file. required Returns: Type Description dict The updated BCO evaluations data. Source code in evaluator/backend/app_start.py def _create_paper_keys(directory_paths: list[str], bco_results_data: dict) -> dict:\n \"\"\"Creates an entry for each paper in the evaluations file.\n\n Parameters\n ----------\n directory_paths : list [str]\n Path to the generated BCO directories.\n bco_results_data : dict\n The loaded BCO evaluations results file.\n\n Returns\n -------\n dict\n The updated BCO evaluations data.\n \"\"\"\n directory_basenames = [os.path.basename(x) for x in directory_paths]\n for paper in directory_basenames:\n if paper not in bco_results_data:\n bco_results_data[paper] = {}\n return bco_results_data\n "},{"location":"app-start/#evaluator.backend.app_start._load_config_data","title":"_load_config_data(filepath='./evaluator/backend/conf.json') ","text":"Loads the App configuration data. Parameters: Name Type Description Default filepath str Filepath to the App config data. './evaluator/backend/conf.json' Returns: Type Description ConfigData | None The configuration data on success, None on error. Source code in evaluator/backend/app_start.py def _load_config_data(\n filepath: str = \"./evaluator/backend/conf.json\",\n) -> Optional[ConfigData]:\n \"\"\"Loads the App configuration data.\n\n Parameters\n ----------\n filepath : str, optional\n Filepath to the App config data.\n\n Returns\n -------\n ConfigData | None\n The configuration data on success, None on error.\n \"\"\"\n naive_load_data = misc_fns.load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n config_object = cast(ConfigData, naive_load_data)\n return config_object\n return None\n "},{"location":"app/","title":"App","text":""},{"location":"app/#evaluator.frontend.app.App","title":"App ","text":" Bases: CTk Frontend for evaluating generated BCO domains from BcoRag. 
Source code in evaluator/frontend/app.py class App(ctk.CTk):\n \"\"\"Frontend for evaluating generated BCO domains from\n BcoRag.\n \"\"\"\n\n def __init__(self):\n \"\"\"Constructor.\"\"\"\n super().__init__()\n init_data = app_start.initialization()\n\n self.attributes = init_data\n\n self.title(\"BCO RAG Evaluator\")\n self.geometry(f\"{1920}x{1080}\")\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(0, weight=1)\n\n self.login_screen = LoginScreen(\n master=self,\n attributes=self.attributes,\n on_login=login,\n on_login_success=self._login_success,\n on_exit=misc.exit_app,\n )\n\n def start(self):\n \"\"\"Start the app main loop.\"\"\"\n self.mainloop()\n\n def navigate(\n self, direction: Literal[-1, 1], run_index: int, app_state: AppState\n ) -> None:\n \"\"\"Callback to execute when the user presses\n the next or previous buttons.\n\n Parameters\n ----------\n direction : -1 or 1\n Indicates the direction the user is navigating,\n -1 for previous, 1 for next.\n run_index : int\n The new run index being navigated to.\n app_state : AppState\n The current app state.\n \"\"\"\n self.app_state = app_state\n updated_run_state = state.load_run_state(\n run_index=run_index,\n total_runs=self.run[\"total_runs\"],\n app_state=self.app_state,\n )\n self.view_page.update_state(\n app_state=self.app_state, run_state=updated_run_state\n )\n\n def _login_success(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on login success.\"\"\"\n self.app_state = app_state\n self.login_screen.grid_forget()\n self.intermediate_screen = IntermediateScreen(\n master=self, on_start=self._on_start, app_state=self.app_state\n )\n\n def _on_start(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on evaluation start.\"\"\"\n self.intermediate_screen.grid_forget()\n self.app_state = app_state\n # create init run state\n init_run_state = app_start.create_init_run_state(app_state)\n self.run = init_run_state\n self.view_page = ViewPage(\n master=self,\n app_state=self.app_state,\n run_state=init_run_state,\n navigate=self.navigate,\n on_save=state.save_state,\n on_exit=misc.exit_app,\n )\n "},{"location":"app/#evaluator.frontend.app.App.__init__","title":"__init__() ","text":"Constructor. Source code in evaluator/frontend/app.py def __init__(self):\n \"\"\"Constructor.\"\"\"\n super().__init__()\n init_data = app_start.initialization()\n\n self.attributes = init_data\n\n self.title(\"BCO RAG Evaluator\")\n self.geometry(f\"{1920}x{1080}\")\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(0, weight=1)\n\n self.login_screen = LoginScreen(\n master=self,\n attributes=self.attributes,\n on_login=login,\n on_login_success=self._login_success,\n on_exit=misc.exit_app,\n )\n "},{"location":"app/#evaluator.frontend.app.App.start","title":"start() ","text":"Start the app main loop. Source code in evaluator/frontend/app.py def start(self):\n \"\"\"Start the app main loop.\"\"\"\n self.mainloop()\n "},{"location":"app/#evaluator.frontend.app.App.navigate","title":"navigate(direction, run_index, app_state) ","text":"Callback to execute when the user presses the next or previous buttons. Parameters: Name Type Description Default direction -1 or 1 Indicates the direction the user is navigating, -1 for previous, 1 for next. required run_index int The new run index being navigated to. required app_state AppState The current app state. 
required Source code in evaluator/frontend/app.py def navigate(\n self, direction: Literal[-1, 1], run_index: int, app_state: AppState\n) -> None:\n \"\"\"Callback to execute when the user presses\n the next or previous buttons.\n\n Parameters\n ----------\n direction : -1 or 1\n Indicates the direction the user is navigating,\n -1 for previous, 1 for next.\n run_index : int\n The new run index being navigated to.\n app_state : AppState\n The current app state.\n \"\"\"\n self.app_state = app_state\n updated_run_state = state.load_run_state(\n run_index=run_index,\n total_runs=self.run[\"total_runs\"],\n app_state=self.app_state,\n )\n self.view_page.update_state(\n app_state=self.app_state, run_state=updated_run_state\n )\n "},{"location":"app/#evaluator.frontend.app.App._login_success","title":"_login_success(app_state) ","text":"Callback to execute on login success. Source code in evaluator/frontend/app.py def _login_success(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on login success.\"\"\"\n self.app_state = app_state\n self.login_screen.grid_forget()\n self.intermediate_screen = IntermediateScreen(\n master=self, on_start=self._on_start, app_state=self.app_state\n )\n "},{"location":"app/#evaluator.frontend.app.App._on_start","title":"_on_start(app_state) ","text":"Callback to execute on evaluation start. Source code in evaluator/frontend/app.py def _on_start(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on evaluation start.\"\"\"\n self.intermediate_screen.grid_forget()\n self.app_state = app_state\n # create init run state\n init_run_state = app_start.create_init_run_state(app_state)\n self.run = init_run_state\n self.view_page = ViewPage(\n master=self,\n app_state=self.app_state,\n run_state=init_run_state,\n navigate=self.navigate,\n on_save=state.save_state,\n on_exit=misc.exit_app,\n )\n "},{"location":"base-evaluation-frame/","title":"Base Class","text":"Base evaluation frame, enforces the update state and get results methods. "},{"location":"base-evaluation-frame/#evaluator.frontend.components.evaluation_frames.evaluation_parent.EvaluationBaseFrame","title":"EvaluationBaseFrame ","text":" Bases: ABC , Generic[T] Source code in evaluator/frontend/components/evaluation_frames/evaluation_parent.py class EvaluationBaseFrame(ABC, Generic[T]):\n\n @abstractmethod\n def __init__(self, master, app_state: AppState, run_state: RunState, **kwargs):\n pass\n\n @abstractmethod\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Upate the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n pass\n\n @abstractmethod\n def get_results(self) -> T:\n \"\"\"Gets the results for the current state of the evaluation frame.\n\n Returns\n -------\n T\n The specific evaluation TypedDict for the frame.\n \"\"\"\n pass\n "},{"location":"base-evaluation-frame/#evaluator.frontend.components.evaluation_frames.evaluation_parent.EvaluationBaseFrame.update_state","title":"update_state(app_state, run_state) abstractmethod ","text":"Upate the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. 
required Source code in evaluator/frontend/components/evaluation_frames/evaluation_parent.py @abstractmethod\ndef update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Upate the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n pass\n "},{"location":"base-evaluation-frame/#evaluator.frontend.components.evaluation_frames.evaluation_parent.EvaluationBaseFrame.get_results","title":"get_results() abstractmethod ","text":"Gets the results for the current state of the evaluation frame. Returns: Type Description T The specific evaluation TypedDict for the frame. Source code in evaluator/frontend/components/evaluation_frames/evaluation_parent.py @abstractmethod\ndef get_results(self) -> T:\n \"\"\"Gets the results for the current state of the evaluation frame.\n\n Returns\n -------\n T\n The specific evaluation TypedDict for the frame.\n \"\"\"\n pass\n "},{"location":"bcorag-types/","title":"Core Types","text":"The core logic custom types. Type Aliases DomainKey = Literal[\"usability\", \"io\", \"description\", \"execution\", \"parametric\", \"error\"] OptionKey = Literal[ \"loader\", \"chunking_config\", \"embedding_model\", \"vector_store\", \"similarity_top_k\", \"llm\", \"mode\"] "},{"location":"bcorag-types/#bcorag.custom_types.core_types.GitFilter","title":"GitFilter ","text":" Bases: Enum Enum delineating between the directory and file extension filters. Attributes: Name Type Description DIRECTORY int A git directory filter, represented by the value 1. FILE_EXTENSION int A file extension filter, represented by the value 2. Source code in bcorag/custom_types/core_types.py class GitFilter(Enum):\n \"\"\"Enum delineating between the directory and file extension filters.\n\n Attributes\n ----------\n DIRECTORY : int\n A git directory filter, represented by the value 1.\n FILE_EXTENSION : int\n A file extension filter, represented by the value 2.\n \"\"\"\n\n DIRECTORY = 1\n FILE_EXTENSION = 2\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.GitFilters","title":"GitFilters ","text":" Bases: TypedDict Typed dict for github loader filters. Attributes: Name Type Description filter_type FilterType The type of github filter (whether it is an include or exclude filter). filter GitFilter The filter enum specification. value list[str] The values to filter on. Source code in bcorag/custom_types/core_types.py class GitFilters(TypedDict):\n \"\"\"Typed dict for github loader filters.\n\n Attributes\n ----------\n filter_type : GithubRepositoryReader.FilterType\n The type of github filter (whether it is an include or exclude filter).\n filter : GitFilter\n The filter enum specification.\n value : list[str]\n The values to filter on.\n \"\"\"\n\n filter_type: GithubRepositoryReader.FilterType\n filter: GitFilter\n value: list[str]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.GitData","title":"GitData ","text":" Bases: TypedDict Typed dict for the optional git repo information. Attributes: Name Type Description user str The github repo owner. repo str The repo name. branch str The repo branch to index. filters list[GitFilters] The list of filters to apply. 
Source code in bcorag/custom_types/core_types.py class GitData(TypedDict):\n \"\"\"Typed dict for the optional git repo information.\n\n Attributes\n ----------\n user : str\n The github repo owner.\n repo : str\n The repo name.\n branch : str\n The repo branch to index.\n filters : list[GitFilters]\n The list of filters to apply.\n \"\"\"\n\n user: str\n repo: str\n branch: str\n # TODO : can we refactor this for a tuple?\n filters: list[GitFilters]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.UserSelections","title":"UserSelections ","text":" Bases: TypedDict Types dict for the user selections. Attributes: Name Type Description llm str The LLM to use. embedding_model str The embedding model to use. filename str The file name of the paper being processed. filepath str The file path to the paper being processed. vector_store str The vector store to use. loader str The data loader to ingest the paper with. mode str The run mode. similarity_top_k int The base integer used to calculate the similarity_top_k and top_n values. chunking_config str The chunking configuration to use during node parsing. git_data Optional[GitData] The optional github repository information to include in the documents. other_docs Optional[list[str]] The file path to any additional documentation to include in the documents. Source code in bcorag/custom_types/core_types.py class UserSelections(TypedDict):\n \"\"\"Types dict for the user selections.\n\n Attributes\n ----------\n llm : str\n The LLM to use.\n embedding_model : str\n The embedding model to use.\n filename : str\n The file name of the paper being processed.\n filepath : str\n The file path to the paper being processed.\n vector_store : str\n The vector store to use.\n loader : str\n The data loader to ingest the paper with.\n mode : str\n The run mode.\n similarity_top_k : int\n The base integer used to calculate the similarity_top_k and top_n values.\n chunking_config : str\n The chunking configuration to use during node parsing.\n git_data : Optional[GitData]\n The optional github repository information to include in the documents.\n other_docs : Optional[list[str]]\n The file path to any additional documentation to include in the documents.\n \"\"\"\n\n llm: str\n embedding_model: str\n filename: str\n filepath: str\n vector_store: str\n loader: str\n mode: str\n similarity_top_k: int\n chunking_config: str\n git_data: Optional[GitData]\n other_docs: Optional[list[str]]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.SourceNode","title":"SourceNode ","text":" Bases: TypedDict Holds the source node information for one node. Attributes: Name Type Description node_id str content str metdata str score str Source code in bcorag/custom_types/core_types.py class SourceNode(TypedDict):\n \"\"\"Holds the source node information for one node.\n\n Attributes\n ----------\n node_id : str\n content : str\n metdata : str\n score : str\n \"\"\"\n\n node_id: str\n content: str\n metadata: str\n score: str\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.DomainContent","title":"DomainContent ","text":" Bases: TypedDict Holds the most recent generated domain for in memory storage. 
Attributes: Name Type Description usability Optional[str] io Optional[str] description Optional[str] execution Optional[str] parametric Optional[str] error Optional[list[str]] Source code in bcorag/custom_types/core_types.py class DomainContent(TypedDict):\n \"\"\"Holds the most recent generated domain for in memory storage.\n\n Attributes\n ----------\n usability: Optional[str]\n io: Optional[str]\n description: Optional[str]\n execution: Optional[str]\n parametric: Optional[str]\n error: Optional[list[str]]\n \"\"\"\n\n usability: Optional[str]\n io: Optional[str]\n description: Optional[str]\n execution: Optional[str]\n parametric: Optional[str]\n error: Optional[str]\n last_source_nodes: Optional[list[SourceNode]]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.IndividualDomainMapEntry","title":"IndividualDomainMapEntry ","text":" Bases: TypedDict Information for one domain to prompt and process the user domain choice. Attributes: Name Type Description prompt str The prompt to use for querying the RAG pipeline for a specific domain generation. top_level bool Whether the specified domain includes object's defined in the top level JSON schema. user_prompt str The prompt string to display to the user. code str The short hand code for choosing the domain. dependencies list[DomainKey] The domain dependencies. Source code in bcorag/custom_types/core_types.py class IndividualDomainMapEntry(TypedDict):\n \"\"\"Information for one domain to prompt and process the user domain choice.\n\n Attributes\n ----------\n prompt : str\n The prompt to use for querying the RAG pipeline for a specific domain generation.\n top_level : bool\n Whether the specified domain includes object's defined in the top level JSON schema.\n user_prompt : str\n The prompt string to display to the user.\n code : str\n The short hand code for choosing the domain.\n dependencies : list[DomainKey]\n The domain dependencies.\n \"\"\"\n\n prompt: str\n top_level: bool\n user_prompt: str\n code: str\n dependencies: list[DomainKey]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.DomainMap","title":"DomainMap ","text":" Bases: TypedDict Domain map for processing user input. Maps the user input for the domain prompt to the prompt to use for querying the RAG pipeline. Attributes: Name Type Description usability IndividualDomainMapEntry io IndividualDomainMapEntry description IndividualDomainMapEntry execution IndividualDomainMapEntry parametric IndividualDomainMapEntry error IndividualDomainMapEntry Source code in bcorag/custom_types/core_types.py class DomainMap(TypedDict):\n \"\"\"Domain map for processing user input. Maps the user input for\n the domain prompt to the prompt to use for querying the RAG pipeline.\n\n Attributes\n ----------\n usability : IndividualDomainMapEntry\n io: IndividualDomainMapEntry\n description: IndividualDomainMapEntry\n execution: IndividualDomainMapEntry\n parametric: IndividualDomainMapEntry\n error: IndividualDomainMapEntry\n \"\"\"\n\n usability: IndividualDomainMapEntry\n io: IndividualDomainMapEntry\n description: IndividualDomainMapEntry\n execution: IndividualDomainMapEntry\n parametric: IndividualDomainMapEntry\n error: IndividualDomainMapEntry\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.OptionSchema","title":"OptionSchema ","text":" Bases: TypedDict Schema for a config object option entry in the config JSON file. Attributes: Name Type Description list list[str] The list of options to choose from. default str The option to use as the default. 
documentation str The link to the documentation for the option. Source code in bcorag/custom_types/core_types.py class OptionSchema(TypedDict):\n \"\"\"Schema for a config object option entry in the config JSON file.\n\n Attributes\n ----------\n list : list[str]\n The list of options to choose from.\n default : str\n The option to use as the default.\n documentation : str\n The link to the documentation for the option.\n \"\"\"\n\n list: list[str]\n default: str\n documentation: str\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.ConfigObjectOptions","title":"ConfigObjectOptions ","text":" Bases: TypedDict Schema for the supported options. Attributes: Name Type Description loader OptionSchema chunking_config OptionSchema embedding_model OptionSchema vector_store OptionSchema similarity_top_k OptionSchema llm OptionSchema mode OptionSchema Source code in bcorag/custom_types/core_types.py class ConfigObjectOptions(TypedDict):\n \"\"\"Schema for the supported options.\n\n Attributes\n ----------\n loader : OptionSchema\n chunking_config : OptionSchema\n embedding_model: OptionSchema\n vector_store: OptionSchema\n similarity_top_k: OptionSchema\n llm: OptionSchema\n mode: OptionSchema\n \"\"\"\n\n loader: OptionSchema\n chunking_config: OptionSchema\n embedding_model: OptionSchema\n vector_store: OptionSchema\n similarity_top_k: OptionSchema\n llm: OptionSchema\n mode: OptionSchema\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.ConfigObject","title":"ConfigObject ","text":" Bases: TypedDict Config JSON schema. Attributes: Name Type Description paper_directory str The file path to the paper's directory. options ConfigObjectOptions The supported configuration options. Source code in bcorag/custom_types/core_types.py class ConfigObject(TypedDict):\n \"\"\"Config JSON schema.\n\n Attributes\n ----------\n paper_directory : str\n The file path to the paper's directory.\n options : ConfigObjectOptions\n The supported configuration options.\n \"\"\"\n\n paper_directory: str\n options: ConfigObjectOptions\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.create_git_filters","title":"create_git_filters(filter_type, filter, value) ","text":"Constructor for the GitFilters TypedDict. Parameters: Name Type Description Default filter_type FilterType The type of github filter (whether it is an include or exclude filter). required filter GitFilter The filter enum specification. required value list[str] The values to filter on. required Returns: Type Description GitFilters Source code in bcorag/custom_types/core_types.py def create_git_filters(\n filter_type: GithubRepositoryReader.FilterType, filter: GitFilter, value: list[str]\n) -> GitFilters:\n \"\"\"Constructor for the `GitFilters` TypedDict.\n\n Parameters\n ----------\n filter_type : GithubRepositoryReader.FilterType\n The type of github filter (whether it is an include or exclude filter).\n filter : GitFilter\n The filter enum specification.\n value : list[str]\n The values to filter on.\n\n Returns\n -------\n GitFilters\n \"\"\"\n sorted_values = sorted(value)\n return_data: GitFilters = {\n \"filter_type\": filter_type,\n \"filter\": filter,\n \"value\": sorted_values,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.create_git_data","title":"create_git_data(user, repo, branch, filters=[]) ","text":"Constructor for the GitData TypedDict. Parameters: Name Type Description Default user str The github repo owner. required repo str The repo name. 
required branch str The repo branch to index. required filters list[GitFilters] The list of filters to apply. [] Returns: Type Description GitData Source code in bcorag/custom_types/core_types.py def create_git_data(\n user: str, repo: str, branch: str, filters: list[GitFilters] = []\n) -> GitData:\n \"\"\"Constructor for the `GitData` TypedDict.\n\n Parameters\n ----------\n user : str\n The github repo owner.\n repo : str\n The repo name.\n branch : str\n The repo branch to index.\n filters : list[GitFilters]\n The list of filters to apply.\n\n Returns\n -------\n GitData\n \"\"\"\n return_data: GitData = {\n \"user\": user,\n \"repo\": repo,\n \"branch\": branch,\n \"filters\": filters,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.create_user_selections","title":"create_user_selections(llm, embedding_model, filename, filepath, vector_store, loader, mode, similarity_top_k, chunking_config, git_data, other_docs) ","text":"Constructor for the UserSelections TypedDict. Parameters: Name Type Description Default llm str The LLM to use. required embedding_model str The embedding model to use. required filename str The file name of the paper being processed. required filepath str The file path to the paper being processed. required vector_store str The vector store to use. required loader str The data loader to ingest the paper with. required mode str The run mode. required similarity_top_k int The base integer used to calculate the similarity_top_k and top_n values. required chunking_config str The chunking configuration to use during node parsing. required git_data Optional[GitData] The optional github repository information to include in the documents. required other_docs Optional[list[str]] The file path to any additional documentation to include in the documents. 
required Returns: Type Description UserSelections Source code in bcorag/custom_types/core_types.py def create_user_selections(\n llm: str,\n embedding_model: str,\n filename: str,\n filepath: str,\n vector_store: str,\n loader: str,\n mode: str,\n similarity_top_k: int,\n chunking_config: str,\n git_data: Optional[GitData],\n other_docs: Optional[list[str]],\n) -> UserSelections:\n \"\"\"Constructor for the `UserSelections` TypedDict.\n\n Parameters\n ----------\n llm : str\n The LLM to use.\n embedding_model : str\n The embedding model to use.\n filename : str\n The file name of the paper being processed.\n filepath : str\n The file path to the paper being processed.\n vector_store : str\n The vector store to use.\n loader : str\n The data loader to ingest the paper with.\n mode : str\n The run mode.\n similarity_top_k : int\n The base integer used to calculate the similarity_top_k and top_n values.\n chunking_config : str\n The chunking configuration to use during node parsing.\n git_data : Optional[GitData]\n The optional github repository information to include in the documents.\n other_docs : Optional[list[str]]\n The file path to any additional documentation to include in the documents.\n\n Returns\n -------\n UserSelections\n \"\"\"\n return_data: UserSelections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": filename,\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": mode,\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n \"git_data\": git_data,\n \"other_docs\": other_docs,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.default_domain_content","title":"default_domain_content() ","text":"Creates an empty, default DomainContent TypedDict. Returns: Type Description DomainContent Source code in bcorag/custom_types/core_types.py def default_domain_content() -> DomainContent:\n \"\"\"Creates an empty, default DomainContent TypedDict.\n\n Returns\n -------\n DomainContent\n \"\"\"\n return_data: DomainContent = {\n \"usability\": None,\n \"io\": None,\n \"description\": None,\n \"execution\": None,\n \"parametric\": None,\n \"error\": None,\n \"last_source_nodes\": None,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.add_source_nodes","title":"add_source_nodes(domain_content, nodes) ","text":"Adds source node data to the domain content. Parameters: Name Type Description Default domain_content DomainContent The domain content instance to add source node data to. required nodes list[NodeWithScore] List of nodes with score data. required Returns: Type Description DomainContent The updated domain content object. 
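A minimal usage sketch of the two helpers above (assumptions: a recent llama-index release where `NodeWithScore` and `TextNode` live under `llama_index.core.schema`, and a hand-built node standing in for real retrieval output):

```python
from llama_index.core.schema import NodeWithScore, TextNode

from bcorag.custom_types.core_types import add_source_nodes, default_domain_content

# A hand-built node standing in for a real retrieval result.
node = NodeWithScore(node=TextNode(text="The pipeline is described in section 2."), score=0.87)

content = default_domain_content()           # every domain starts as None
content = add_source_nodes(content, [node])  # records node id, content, metadata, and score

print(content["last_source_nodes"][0]["score"])  # "0.87"
```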
Source code in bcorag/custom_types/core_types.py def add_source_nodes(\n domain_content: DomainContent, nodes: list[NodeWithScore]\n) -> DomainContent:\n \"\"\"Adds source node data to the domain content.\n\n Parameters\n ----------\n domain_content : DomainContent\n The domain content instance to add source node data to.\n nodes : list[NodeWithScore]\n List of nodes with score data.\n\n Returns\n -------\n DomainContent\n The updated domain content object.\n \"\"\"\n node_list: list[SourceNode] = []\n for node in nodes:\n node_list.append(\n {\n \"node_id\": node.node.node_id,\n \"content\": node.node.get_content(),\n \"metadata\": node.node.get_metadata_str(),\n \"score\": str(node.score),\n }\n )\n domain_content[\"last_source_nodes\"] = node_list\n return domain_content\n "},{"location":"bcorag/","title":"Bcorag","text":"Handles the RAG implementation using the llama-index library. "},{"location":"bcorag/#bcorag.bcorag.BcoRag","title":"BcoRag ","text":"Class to handle the RAG implementation. Attributes: Name Type Description _parameter_set_hash str The MD5 hexidecimal hash of the parameter set. _domain_map DomainMap Mapping for each domain to its standardized prompt. _file_name str The source file (paper) name. _file_path str The file path to the source file (paper). _output_path_root str Path to the specific document directory to dump the outputs. _debug bool Whether in debug mode or not. _logger Logger The document specific logger. _llm_model_name str The LLM model name. _llm_model OpenAI The Open AI LLM model instance. _embed_model_name str The embedding model name. _embed_model OpenAIEmbedding The embedding model instance. _loader str The data loader being used. _vector_store str The vector store being used. _splitter SemanticSplitterNodeParser or None The node parser (if a non-fixed chunking strategy is chosen). _similarity_top_k int The similarity top k retrieval number for node sources. _token_counter TokenCountingHandler or None The token counter handler or None if in production mode. _token_counts dict[str, int] or None The token counts or None if in production mode. _git_data GitData or None The git data or None if no github repo was included. _documents list[Documents] The list of documents (containers for the data source). _index VectorStoreIndex The vector store index instance. _query_engine RetrieverQueryEngine The query engine. _other_docs list[str] | None Any other miscellaneous documents to include in the indexing process. _domain_content DomainContent Holds the most recent generated domain. 
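A minimal end-to-end sketch of how a `BcoRag` instance is typically constructed and queried (the option strings and model names below are illustrative stand-ins for values normally read from the config JSON and user prompts, and `OPENAI_API_KEY` must be set in the environment):

```python
from bcorag.bcorag import BcoRag
from bcorag.custom_types.core_types import create_user_selections

# Illustrative selections; in the interactive flow these come from the
# config file options and the user's choices.
selections = create_user_selections(
    llm="gpt-4o-mini",
    embedding_model="text-embedding-3-small",
    filename="paper.pdf",
    filepath="./papers/paper.pdf",
    vector_store="VectorStoreIndex",
    loader="PDFReader",
    mode="production",
    similarity_top_k=3,
    chunking_config="1024 chunk size/20 chunk overlap",
    git_data=None,
    other_docs=None,
)

rag = BcoRag(selections, output_dir="./output")
usability_domain = rag.perform_query("usability")  # returns the generated domain text
```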
Source code in bcorag/bcorag.py class BcoRag:\n \"\"\"Class to handle the RAG implementation.\n\n Attributes\n ----------\n _parameter_set_hash : str\n The MD5 hexidecimal hash of the parameter set.\n _domain_map : DomainMap\n Mapping for each domain to its standardized prompt.\n _file_name : str\n The source file (paper) name.\n _file_path : str\n The file path to the source file (paper).\n _output_path_root : str\n Path to the specific document directory to dump the outputs.\n _debug : bool\n Whether in debug mode or not.\n _logger : logging.Logger\n The document specific logger.\n _llm_model_name : str\n The LLM model name.\n _llm_model : OpenAI\n The Open AI LLM model instance.\n _embed_model_name : str\n The embedding model name.\n _embed_model : OpenAIEmbedding\n The embedding model instance.\n _loader : str\n The data loader being used.\n _vector_store : str\n The vector store being used.\n _splitter : SemanticSplitterNodeParser or None\n The node parser (if a non-fixed chunking strategy is chosen).\n _similarity_top_k : int\n The similarity top k retrieval number for node sources.\n _token_counter : TokenCountingHandler or None\n The token counter handler or None if in production mode.\n _token_counts : dict[str, int] or None\n The token counts or None if in production mode.\n _git_data : GitData or None\n The git data or None if no github repo was included.\n _documents : list[Documents]\n The list of documents (containers for the data source).\n _index : VectorStoreIndex\n The vector store index instance.\n _query_engine : RetrieverQueryEngine\n The query engine.\n _other_docs : list[str] | None\n Any other miscellaneous documents to include in the indexing process.\n _domain_content : DomainContent\n Holds the most recent generated domain.\n \"\"\"\n\n def __init__(\n self,\n user_selections: UserSelections,\n output_dir: str = \"./output\",\n ):\n \"\"\"Constructor.\n\n Parameters\n ----------\n user_selections : UserSelections\n The user configuration selections.\n output_dir : str\n The directory to dump the outputs (relative to main.py entry point\n in the repo root).\n evaluation_metrics : bool\n Whether or not to calculate Faithfulness and Relevancy metrics.\n \"\"\"\n load_dotenv()\n\n self._parameter_set_hash = self._user_selection_hash(user_selections)\n self._domain_map = DOMAIN_MAP\n self._file_name = user_selections[\"filename\"]\n self._file_path = user_selections[\"filepath\"]\n self._output_path_root = os.path.join(\n output_dir,\n os.path.splitext(self._file_name.lower().replace(\" \", \"_\").strip())[0],\n )\n self._debug = True if user_selections[\"mode\"] == \"debug\" else False\n self._logger = misc_fns.setup_document_logger(\n self._file_name.lower().strip().replace(\" \", \"_\")\n )\n self._llm_model_name = user_selections[\"llm\"]\n self._llm_model = OpenAI(model=self._llm_model_name)\n self._embed_model_name = user_selections[\"embedding_model\"]\n self._embed_model = OpenAIEmbedding(model=self._embed_model_name)\n self._loader = user_selections[\"loader\"]\n self._vector_store = user_selections[\"vector_store\"]\n self._splitter = None\n self._similarity_top_k = user_selections[\"similarity_top_k\"]\n self._chunking_config = user_selections[\"chunking_config\"]\n self._token_counter: TokenCountingHandler | None = None\n self._token_counts: dict[str, int] | None = None\n self._git_data: Optional[GitData] = (\n user_selections[\"git_data\"]\n if user_selections[\"git_data\"] is not None\n else None\n )\n self._other_docs: list[str] | None = 
user_selections[\"other_docs\"]\n self.domain_content: DomainContent = default_domain_content()\n\n openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n if not openai_api_key:\n raise EnvironmentError(\"OpenAI API key not found.\")\n\n github_token = os.getenv(\"GITHUB_TOKEN\")\n if self._git_data is not None and not github_token:\n raise EnvironmentError(\"Github token not found.\")\n\n misc_fns.check_dir(self._output_path_root)\n self._display_info(user_selections, \"User selections:\")\n\n Settings.embed_model = self._embed_model\n Settings.llm = self._llm_model\n\n match self._chunking_config:\n case \"semantic\":\n self._splitter = SemanticSplitterNodeParser.from_defaults(\n buffer_size=1,\n embed_model=self._embed_model,\n # The percentile of cosin dissimilarity that must be exceeded\n # between a group of sentences and the next to form a node. The\n # smaller this number is, the more nodes will be generated.\n breakpoint_percentile_threshold=90,\n )\n case \"256 chunk size/20 chunk overlap\":\n Settings.chunk_size = 256\n Settings.chunk_overlap = 50\n case \"512 chunk size/50 chunk overlap\":\n Settings.chunk_size = 512\n Settings.chunk_overlap = 50\n case \"2048 chunk size/50 chunk overlap\":\n Settings.chunk_size = 2048\n Settings.chunk_overlap = 50\n case _:\n Settings.chunk_size = 1024\n Settings.chunk_overlap = 20\n\n if self._debug:\n self._token_counter = TokenCountingHandler(\n tokenizer=tiktoken.encoding_for_model(self._llm_model_name).encode\n )\n Settings.callback_manager = CallbackManager([self._token_counter])\n self._token_counts = {\n \"embedding\": 0,\n \"input\": 0,\n \"output\": 0,\n \"total\": 0,\n }\n\n match self._loader:\n case \"SimpleDirectoryReader\":\n loader = SimpleDirectoryReader(input_files=[self._file_path])\n paper_documents = loader.load_data()\n case \"PDFReader\":\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # pdf_loader = download_loader(\"PDFReader\")\n pdf_loader = PDFReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n case \"PDFMarker\":\n with supress_stdout():\n pdf_loader = PDFMarkerReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n\n other_docs = []\n if self._other_docs:\n for path in self._other_docs:\n loader = SimpleDirectoryReader(input_files=[path])\n other_docs += loader.load_data()\n\n documents = paper_documents + other_docs # type: ignore\n if self._git_data is not None:\n\n github_client = GithubClient(github_token)\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # download_loader(\"GithubRepositoryReader\")\n\n directory_filter: GitFilters | None = None\n file_ext_filter: GitFilters | None = None\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = filter\n elif filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = filter\n\n git_loader = GithubRepositoryReader(\n github_client=github_client,\n owner=self._git_data[\"user\"],\n repo=self._git_data[\"repo\"],\n filter_directories=(\n (directory_filter[\"value\"], directory_filter[\"filter_type\"])\n if directory_filter is not None\n else None\n ),\n filter_file_extensions=(\n (file_ext_filter[\"value\"], file_ext_filter[\"filter_type\"])\n if file_ext_filter is not None\n else None\n ),\n )\n\n github_documents = git_loader.load_data(branch=self._git_data[\"branch\"])\n documents += github_documents\n self._logger.info(\n f\"Loading repo `{self._git_data['repo']}` from 
user `{self._git_data['user']}`\"\n )\n self._documents = documents\n\n _chunk_fixed = (\n False if user_selections[\"chunking_config\"] == \"semantic\" else True\n )\n if self._vector_store == \"VectorStoreIndex\":\n if _chunk_fixed:\n self._index = VectorStoreIndex.from_documents(self._documents)\n else:\n if self._splitter is not None:\n nodes = self._splitter.build_semantic_nodes_from_documents(\n self._documents\n )\n self._index = VectorStoreIndex(nodes=nodes)\n\n retriever = VectorIndexRetriever(\n index=self._index, similarity_top_k=self._similarity_top_k * 3\n )\n response_synthesizer = get_response_synthesizer()\n rerank_postprocessor = SentenceTransformerRerank(\n top_n=self._similarity_top_k,\n keep_retrieval_score=True,\n )\n self._query_engine = RetrieverQueryEngine(\n retriever=retriever,\n response_synthesizer=response_synthesizer,\n node_postprocessors=[rerank_postprocessor],\n )\n\n if (\n self._debug\n and self._token_counts is not None\n and self._token_counter is not None\n ):\n self._token_counts[\n \"embedding\"\n ] += self._token_counter.total_embedding_token_count\n\n def perform_query(self, domain: DomainKey) -> str:\n \"\"\"Performs a query for a specific BCO domain.\n\n Parameters\n ----------\n domain : DomainKey\n The domain being queried for.\n\n Returns\n -------\n str\n The generated domain.\n \"\"\"\n query_start_time = time.time()\n domain_prompt = self._domain_map[domain][\"prompt\"]\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is not None:\n dependency_prompt = f\"The {domain} domain is dependent on the {dependency} domain. Here is the {dependency} domain: {self.domain_content[dependency]}.\"\n domain_prompt += dependency_prompt\n query_prompt = QUERY_PROMPT.format(domain, domain_prompt)\n if self._domain_map[domain][\"top_level\"]:\n query_prompt += f\"\\n{SUPPLEMENT_PROMPT}\"\n\n response_object = self._query_engine.query(query_prompt)\n if isinstance(response_object, Response):\n response_object = Response(\n response=response_object.response,\n metadata=response_object.metadata,\n source_nodes=response_object.source_nodes,\n )\n else:\n self._logger.error(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n print(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n misc_fns.graceful_exit(1)\n query_response = str(response_object.response)\n\n self.domain_content[domain] = query_response\n self.domain_content = add_source_nodes(\n domain_content=self.domain_content, nodes=response_object.source_nodes\n )\n\n source_str = \"\"\n for idx, source_node in enumerate(response_object.source_nodes):\n source_str += f\"\\n--------------- Source Node '{idx + 1}/{len(response_object.source_nodes)}' ---------------\"\n source_str += f\"\\nNode ID: '{source_node.node.node_id}'\"\n source_str += f\"\\nRerank Score: '{source_node.score}'\"\n source_str += f\"\\nMetadata String:\\n`{source_node.node.get_metadata_str()}`\"\n source_str += (\n f\"\\nMetadata Size: `{len(source_node.node.get_metadata_str())}`\"\n )\n source_str += f\"\\nContent Size: `{len(source_node.node.get_content())}`\"\n source_str += (\n f\"\\nRetrieved Text:\\n{source_node.node.get_content().strip()}\\n\"\n )\n source_str += \"\\n\"\n\n if self._debug:\n self._display_info(query_prompt, f\"QUERY PROMPT for the {domain} domain:\")\n self._token_counts[\"input\"] += self._token_counter.prompt_llm_token_count # type: ignore\n 
self._token_counts[\"output\"] += self._token_counter.completion_llm_token_count # type: ignore\n self._token_counts[\"total\"] += self._token_counter.total_llm_token_count # type: ignore\n self._token_counts[\"embedding\"] += self._token_counter.total_embedding_token_count # type: ignore\n self._display_info(self._token_counts, \"Updated token counts:\")\n self._display_info(source_str, \"Retrieval source(s):\")\n\n query_elapsed_time = time.time() - query_start_time\n self._process_output(\n domain, query_response, source_str, round(query_elapsed_time, 2)\n )\n\n return query_response\n\n def choose_domain(\n self, automatic_query: bool = False\n ) -> Optional[tuple[DomainKey, str] | DomainKey]:\n \"\"\"Gets the user input for the domain the user wants to generate.\n\n Parameters\n ----------\n automatic_query : bool, optional\n Whether to automatically query after the user chooses a domain. If set to\n True this is a shortcut to calling `bcorag.perform_query(choose_domain())`.\n\n Returns\n -------\n (DomainKey, str) | str | None\n If automatic query is set to True will return a tuple containing the domain\n name and the query response. If automatic query is False will return the user\n chosen domain. None is returned if the user chooses to exit.\n \"\"\"\n domain_prompt = (\n \"Which domain would you like to generate? Supported domains are:\"\n )\n\n domain_user_prompt: DomainKey\n for domain_user_prompt in get_args(DomainKey):\n domain_prompt += (\n f\"\\n\\t{self._domain_map[domain_user_prompt]['user_prompt']}\"\n )\n domain_prompt += \"\\n\\tE[x]it\\n\"\n print(domain_prompt)\n\n domain_selection = None\n\n while True:\n\n domain_selection = input(\"> \").strip().lower()\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n if (\n domain_selection == domain\n or domain_selection == self._domain_map[domain][\"code\"]\n ):\n domain_selection = domain\n break\n else:\n if domain_selection == \"exit\" or domain_selection == \"x\":\n if self._debug:\n self._display_info(\n \"User selected 'exit' on the domain selection step.\"\n )\n return None\n else:\n if self._debug:\n self._display_info(\n f\"User entered unrecognized input '{domain_selection}' on domain chooser step.\"\n )\n print(\n f\"Unrecognized input {domain_selection} entered, please try again.\"\n )\n continue\n if not self._check_dependencies(domain_selection):\n print(\n f\"Dependencies for the `{domain_selection}` domain are not satisfied. Please choose another domain.\"\n )\n continue\n\n break\n\n if automatic_query:\n if self._debug:\n self._display_info(\n f\"Automatic query called on domain: '{domain_selection}'.\"\n )\n return domain_selection, self.perform_query(domain_selection)\n if self._debug:\n self._display_info(\n f\"User chose '{domain_selection}' domain with no automatic query.\"\n )\n return domain_selection\n\n def _process_output(\n self, domain: DomainKey, response: str, source_str: str, elapsed_time: float\n ):\n \"\"\"Attempts to serialize the response into a JSON object and dumps the output.\n Also dumps the raw text regardless if JSON serialization was successful. The\n file dumps are dumped to the `output` directory located in the root of this\n repo. Keeps a TSV file to track all of the domain outputs and what parameter\n set generated the results.\n\n Note: This function is getting long with some redundancy, it should be re-written\n at some point. 
It works, but is ugly.\n\n Parameters\n ----------\n domain : DomainKey\n The domain the response is for.\n response : str\n The generated response to dump.\n source_str : str\n The formatted source string for the query.\n elapsed_time : float\n The query generation elapsed time.\n \"\"\"\n\n def dump_json_response(fp: str, response_string: str) -> bool:\n if response_string.startswith(\"```json\\n\"):\n response_string = response_string.replace(\"```json\\n\", \"\").replace(\n \"```\", \"\"\n )\n self._display_info(\n response_string, f\"QUERY RESPONSE for the `{domain}` domain:\"\n )\n try:\n response_json = json.loads(response_string)\n if misc_fns.write_json(fp, response_json):\n self._logger.info(\n f\"Succesfully serialized JSON response for the `{domain}` domain.\"\n )\n return True\n except Exception as e:\n self._logger.error(\n f\"Failed to serialize the JSON response for the `{domain}` domain.\\n{e}\"\n )\n return False\n\n generated_dir = os.path.join(self._output_path_root, \"generated_domains\")\n misc_fns.check_dir(generated_dir)\n\n txt_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.txt\"\n )\n json_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.json\"\n )\n source_file_unindexed = os.path.join(\n self._output_path_root,\n \"reference_sources\",\n f\"{domain}-(index)-{self._parameter_set_hash}.txt\",\n )\n\n output_map_json = misc_fns.load_output_tracker(\n os.path.join(self._output_path_root, \"output_map.json\")\n )\n\n # Create a new output file if one doesn't exist\n if output_map_json is None:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter: OutputTrackerGitFilter | None = None\n file_ext_filter: OutputTrackerGitFilter | None = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=self._git_data[\"user\"] if self._git_data is not None else None,\n git_repo=self._git_data[\"repo\"] if self._git_data is not None else None,\n git_branch=(\n self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n 
)\n\n output_data = default_output_tracker_file()\n output_data[domain].append(domain_entry)\n\n # update output map\n else:\n\n domain_map_entries = output_map_json[domain]\n\n for domain_map_entry in domain_map_entries:\n\n # found the collision entry\n if domain_map_entry[\"hash_str\"] == self._parameter_set_hash:\n\n new_index = domain_map_entry[\"entries\"][\"curr_index\"] + 1\n domain_map_entry[\"entries\"][\"curr_index\"] = new_index\n\n txt_file = txt_file_unindexed.replace(\"(index)\", str(new_index))\n json_file = json_file_unindexed.replace(\"(index)\", str(new_index))\n source_file = source_file_unindexed.replace(\n \"(index)\", str(new_index)\n )\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n new_index,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n domain_map_entry[\"entries\"][\"runs\"].append(run_entry)\n\n break\n\n # first time parameter set run (loop didn't break)\n else:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter = None\n file_ext_filter = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=(\n self._git_data[\"user\"] if self._git_data is not None else None\n ),\n git_repo=(\n self._git_data[\"repo\"] if self._git_data is not None else None\n ),\n git_branch=(\n self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n )\n\n domain_map_entries.append(domain_entry)\n\n output_data = output_map_json\n\n misc_fns.dump_string(txt_file, response)\n misc_fns.dump_string(source_file, source_str)\n # writes the output mapping files\n misc_fns.write_json(\n os.path.join(self._output_path_root, \"output_map.json\"), output_data\n )\n misc_fns.dump_output_file_map_tsv(\n os.path.join(self._output_path_root, \"output_map.tsv\"), output_data\n )\n\n def _display_info(\n self,\n info: Optional[dict | list | str | UserSelections],\n header: Optional[str] = None,\n ):\n \"\"\"If in debug mode, handles the debug info output to the log file.\n\n Parameters\n ----------\n info : 
dict | list | str | UserSelections | None\n The object to log.\n header : str or None\n The optional header to log before the info.\n \"\"\"\n log_str = header if header is not None else \"\"\n if isinstance(info, dict):\n for key, value in info.items():\n log_str += f\"\\n\\t{key}: '{value}'\"\n elif isinstance(info, str):\n log_str += f\"{info}\" if header is None else f\"\\n{info}\"\n self._logger.info(log_str)\n\n def _user_selection_hash(self, params: UserSelections) -> str:\n \"\"\"Generates an MD5 hash of the parameter set.\n\n Parameters\n ----------\n params : UserSelections\n The user configuration selections.\n\n Returns\n -------\n str\n The hexidecimal MD5 hash.\n \"\"\"\n hash_list = []\n hash_list.append(params[\"llm\"])\n hash_list.append(params[\"embedding_model\"])\n hash_list.append(params[\"vector_store\"])\n hash_list.append(params[\"loader\"])\n hash_list.append(str(params[\"similarity_top_k\"]))\n hash_list.append(params[\"chunking_config\"])\n\n if params[\"git_data\"] is not None:\n\n hash_list.append(params[\"git_data\"][\"user\"])\n hash_list.append(params[\"git_data\"][\"repo\"])\n hash_list.append(params[\"git_data\"][\"branch\"])\n\n for filter in params[\"git_data\"][\"filters\"]:\n\n filter_type = (\n \"include\"\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else \"exclude\"\n )\n filter_str = f\"{filter_type}-{filter['value']}\"\n hash_list.append(filter_str)\n\n sorted(hash_list)\n hash_str = \"_\".join(hash_list)\n hash_hex = md5(hash_str.encode(\"utf-8\")).hexdigest()\n return hash_hex\n\n def _check_dependencies(self, domain: DomainKey) -> bool:\n \"\"\"Checks a domain's dependencies.\n\n Parameters\n ----------\n domain : DomainKey\n The domain to check.\n\n Returns\n -------\n bool\n True if dependencies are satisfied, False otherwise.\n \"\"\"\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is None:\n print(\n f\"Error: {dependency.title()} domain must be generated before the {domain.title()} domain.\"\n )\n return False\n return True\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag.__init__","title":"__init__(user_selections, output_dir='./output') ","text":"Constructor. Parameters: Name Type Description Default user_selections UserSelections The user configuration selections. required output_dir str The directory to dump the outputs (relative to main.py entry point in the repo root). './output' evaluation_metrics bool Whether or not to calculate Faithfulness and Relevancy metrics. 
required Source code in bcorag/bcorag.py def __init__(\n self,\n user_selections: UserSelections,\n output_dir: str = \"./output\",\n):\n \"\"\"Constructor.\n\n Parameters\n ----------\n user_selections : UserSelections\n The user configuration selections.\n output_dir : str\n The directory to dump the outputs (relative to main.py entry point\n in the repo root).\n evaluation_metrics : bool\n Whether or not to calculate Faithfulness and Relevancy metrics.\n \"\"\"\n load_dotenv()\n\n self._parameter_set_hash = self._user_selection_hash(user_selections)\n self._domain_map = DOMAIN_MAP\n self._file_name = user_selections[\"filename\"]\n self._file_path = user_selections[\"filepath\"]\n self._output_path_root = os.path.join(\n output_dir,\n os.path.splitext(self._file_name.lower().replace(\" \", \"_\").strip())[0],\n )\n self._debug = True if user_selections[\"mode\"] == \"debug\" else False\n self._logger = misc_fns.setup_document_logger(\n self._file_name.lower().strip().replace(\" \", \"_\")\n )\n self._llm_model_name = user_selections[\"llm\"]\n self._llm_model = OpenAI(model=self._llm_model_name)\n self._embed_model_name = user_selections[\"embedding_model\"]\n self._embed_model = OpenAIEmbedding(model=self._embed_model_name)\n self._loader = user_selections[\"loader\"]\n self._vector_store = user_selections[\"vector_store\"]\n self._splitter = None\n self._similarity_top_k = user_selections[\"similarity_top_k\"]\n self._chunking_config = user_selections[\"chunking_config\"]\n self._token_counter: TokenCountingHandler | None = None\n self._token_counts: dict[str, int] | None = None\n self._git_data: Optional[GitData] = (\n user_selections[\"git_data\"]\n if user_selections[\"git_data\"] is not None\n else None\n )\n self._other_docs: list[str] | None = user_selections[\"other_docs\"]\n self.domain_content: DomainContent = default_domain_content()\n\n openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n if not openai_api_key:\n raise EnvironmentError(\"OpenAI API key not found.\")\n\n github_token = os.getenv(\"GITHUB_TOKEN\")\n if self._git_data is not None and not github_token:\n raise EnvironmentError(\"Github token not found.\")\n\n misc_fns.check_dir(self._output_path_root)\n self._display_info(user_selections, \"User selections:\")\n\n Settings.embed_model = self._embed_model\n Settings.llm = self._llm_model\n\n match self._chunking_config:\n case \"semantic\":\n self._splitter = SemanticSplitterNodeParser.from_defaults(\n buffer_size=1,\n embed_model=self._embed_model,\n # The percentile of cosin dissimilarity that must be exceeded\n # between a group of sentences and the next to form a node. 
The\n # smaller this number is, the more nodes will be generated.\n breakpoint_percentile_threshold=90,\n )\n case \"256 chunk size/20 chunk overlap\":\n Settings.chunk_size = 256\n Settings.chunk_overlap = 50\n case \"512 chunk size/50 chunk overlap\":\n Settings.chunk_size = 512\n Settings.chunk_overlap = 50\n case \"2048 chunk size/50 chunk overlap\":\n Settings.chunk_size = 2048\n Settings.chunk_overlap = 50\n case _:\n Settings.chunk_size = 1024\n Settings.chunk_overlap = 20\n\n if self._debug:\n self._token_counter = TokenCountingHandler(\n tokenizer=tiktoken.encoding_for_model(self._llm_model_name).encode\n )\n Settings.callback_manager = CallbackManager([self._token_counter])\n self._token_counts = {\n \"embedding\": 0,\n \"input\": 0,\n \"output\": 0,\n \"total\": 0,\n }\n\n match self._loader:\n case \"SimpleDirectoryReader\":\n loader = SimpleDirectoryReader(input_files=[self._file_path])\n paper_documents = loader.load_data()\n case \"PDFReader\":\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # pdf_loader = download_loader(\"PDFReader\")\n pdf_loader = PDFReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n case \"PDFMarker\":\n with supress_stdout():\n pdf_loader = PDFMarkerReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n\n other_docs = []\n if self._other_docs:\n for path in self._other_docs:\n loader = SimpleDirectoryReader(input_files=[path])\n other_docs += loader.load_data()\n\n documents = paper_documents + other_docs # type: ignore\n if self._git_data is not None:\n\n github_client = GithubClient(github_token)\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # download_loader(\"GithubRepositoryReader\")\n\n directory_filter: GitFilters | None = None\n file_ext_filter: GitFilters | None = None\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = filter\n elif filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = filter\n\n git_loader = GithubRepositoryReader(\n github_client=github_client,\n owner=self._git_data[\"user\"],\n repo=self._git_data[\"repo\"],\n filter_directories=(\n (directory_filter[\"value\"], directory_filter[\"filter_type\"])\n if directory_filter is not None\n else None\n ),\n filter_file_extensions=(\n (file_ext_filter[\"value\"], file_ext_filter[\"filter_type\"])\n if file_ext_filter is not None\n else None\n ),\n )\n\n github_documents = git_loader.load_data(branch=self._git_data[\"branch\"])\n documents += github_documents\n self._logger.info(\n f\"Loading repo `{self._git_data['repo']}` from user `{self._git_data['user']}`\"\n )\n self._documents = documents\n\n _chunk_fixed = (\n False if user_selections[\"chunking_config\"] == \"semantic\" else True\n )\n if self._vector_store == \"VectorStoreIndex\":\n if _chunk_fixed:\n self._index = VectorStoreIndex.from_documents(self._documents)\n else:\n if self._splitter is not None:\n nodes = self._splitter.build_semantic_nodes_from_documents(\n self._documents\n )\n self._index = VectorStoreIndex(nodes=nodes)\n\n retriever = VectorIndexRetriever(\n index=self._index, similarity_top_k=self._similarity_top_k * 3\n )\n response_synthesizer = get_response_synthesizer()\n rerank_postprocessor = SentenceTransformerRerank(\n top_n=self._similarity_top_k,\n keep_retrieval_score=True,\n )\n self._query_engine = RetrieverQueryEngine(\n retriever=retriever,\n response_synthesizer=response_synthesizer,\n 
node_postprocessors=[rerank_postprocessor],\n )\n\n if (\n self._debug\n and self._token_counts is not None\n and self._token_counter is not None\n ):\n self._token_counts[\n \"embedding\"\n ] += self._token_counter.total_embedding_token_count\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag.perform_query","title":"perform_query(domain) ","text":"Performs a query for a specific BCO domain. Parameters: Name Type Description Default domain DomainKey The domain being queried for. required Returns: Type Description str The generated domain. Source code in bcorag/bcorag.py def perform_query(self, domain: DomainKey) -> str:\n \"\"\"Performs a query for a specific BCO domain.\n\n Parameters\n ----------\n domain : DomainKey\n The domain being queried for.\n\n Returns\n -------\n str\n The generated domain.\n \"\"\"\n query_start_time = time.time()\n domain_prompt = self._domain_map[domain][\"prompt\"]\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is not None:\n dependency_prompt = f\"The {domain} domain is dependent on the {dependency} domain. Here is the {dependency} domain: {self.domain_content[dependency]}.\"\n domain_prompt += dependency_prompt\n query_prompt = QUERY_PROMPT.format(domain, domain_prompt)\n if self._domain_map[domain][\"top_level\"]:\n query_prompt += f\"\\n{SUPPLEMENT_PROMPT}\"\n\n response_object = self._query_engine.query(query_prompt)\n if isinstance(response_object, Response):\n response_object = Response(\n response=response_object.response,\n metadata=response_object.metadata,\n source_nodes=response_object.source_nodes,\n )\n else:\n self._logger.error(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n print(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n misc_fns.graceful_exit(1)\n query_response = str(response_object.response)\n\n self.domain_content[domain] = query_response\n self.domain_content = add_source_nodes(\n domain_content=self.domain_content, nodes=response_object.source_nodes\n )\n\n source_str = \"\"\n for idx, source_node in enumerate(response_object.source_nodes):\n source_str += f\"\\n--------------- Source Node '{idx + 1}/{len(response_object.source_nodes)}' ---------------\"\n source_str += f\"\\nNode ID: '{source_node.node.node_id}'\"\n source_str += f\"\\nRerank Score: '{source_node.score}'\"\n source_str += f\"\\nMetadata String:\\n`{source_node.node.get_metadata_str()}`\"\n source_str += (\n f\"\\nMetadata Size: `{len(source_node.node.get_metadata_str())}`\"\n )\n source_str += f\"\\nContent Size: `{len(source_node.node.get_content())}`\"\n source_str += (\n f\"\\nRetrieved Text:\\n{source_node.node.get_content().strip()}\\n\"\n )\n source_str += \"\\n\"\n\n if self._debug:\n self._display_info(query_prompt, f\"QUERY PROMPT for the {domain} domain:\")\n self._token_counts[\"input\"] += self._token_counter.prompt_llm_token_count # type: ignore\n self._token_counts[\"output\"] += self._token_counter.completion_llm_token_count # type: ignore\n self._token_counts[\"total\"] += self._token_counter.total_llm_token_count # type: ignore\n self._token_counts[\"embedding\"] += self._token_counter.total_embedding_token_count # type: ignore\n self._display_info(self._token_counts, \"Updated token counts:\")\n self._display_info(source_str, \"Retrieval source(s):\")\n\n query_elapsed_time = time.time() - query_start_time\n self._process_output(\n domain, query_response, source_str, 
round(query_elapsed_time, 2)\n )\n\n return query_response\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag.choose_domain","title":"choose_domain(automatic_query=False) ","text":"Gets the user input for the domain the user wants to generate. Parameters: Name Type Description Default automatic_query bool Whether to automatically query after the user chooses a domain. If set to True this is a shortcut to calling bcorag.perform_query(choose_domain()) . False Returns: Type Description (DomainKey, str) | str | None If automatic query is set to True will return a tuple containing the domain name and the query response. If automatic query is False will return the user chosen domain. None is returned if the user chooses to exit. Source code in bcorag/bcorag.py def choose_domain(\n self, automatic_query: bool = False\n) -> Optional[tuple[DomainKey, str] | DomainKey]:\n \"\"\"Gets the user input for the domain the user wants to generate.\n\n Parameters\n ----------\n automatic_query : bool, optional\n Whether to automatically query after the user chooses a domain. If set to\n True this is a shortcut to calling `bcorag.perform_query(choose_domain())`.\n\n Returns\n -------\n (DomainKey, str) | str | None\n If automatic query is set to True will return a tuple containing the domain\n name and the query response. If automatic query is False will return the user\n chosen domain. None is returned if the user chooses to exit.\n \"\"\"\n domain_prompt = (\n \"Which domain would you like to generate? Supported domains are:\"\n )\n\n domain_user_prompt: DomainKey\n for domain_user_prompt in get_args(DomainKey):\n domain_prompt += (\n f\"\\n\\t{self._domain_map[domain_user_prompt]['user_prompt']}\"\n )\n domain_prompt += \"\\n\\tE[x]it\\n\"\n print(domain_prompt)\n\n domain_selection = None\n\n while True:\n\n domain_selection = input(\"> \").strip().lower()\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n if (\n domain_selection == domain\n or domain_selection == self._domain_map[domain][\"code\"]\n ):\n domain_selection = domain\n break\n else:\n if domain_selection == \"exit\" or domain_selection == \"x\":\n if self._debug:\n self._display_info(\n \"User selected 'exit' on the domain selection step.\"\n )\n return None\n else:\n if self._debug:\n self._display_info(\n f\"User entered unrecognized input '{domain_selection}' on domain chooser step.\"\n )\n print(\n f\"Unrecognized input {domain_selection} entered, please try again.\"\n )\n continue\n if not self._check_dependencies(domain_selection):\n print(\n f\"Dependencies for the `{domain_selection}` domain are not satisfied. Please choose another domain.\"\n )\n continue\n\n break\n\n if automatic_query:\n if self._debug:\n self._display_info(\n f\"Automatic query called on domain: '{domain_selection}'.\"\n )\n return domain_selection, self.perform_query(domain_selection)\n if self._debug:\n self._display_info(\n f\"User chose '{domain_selection}' domain with no automatic query.\"\n )\n return domain_selection\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._process_output","title":"_process_output(domain, response, source_str, elapsed_time) ","text":"Attempts to serialize the response into a JSON object and dumps the output. Also dumps the raw text regardless if JSON serialization was successful. The file dumps are dumped to the output directory located in the root of this repo. Keeps a TSV file to track all of the domain outputs and what parameter set generated the results. 
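The JSON serialization step itself is small enough to show in isolation; this sketch mirrors the documented fence-stripping and parse attempt with a made-up response string:

```python
import json

# Mirrors the documented post-processing: strip a Markdown JSON code fence
# from the model response, then attempt to parse the remainder as JSON.
fence = "`" * 3
raw = fence + "json\n" + '{"usability": ["Example usability statement."]}' + "\n" + fence

if raw.startswith(fence + "json\n"):
    raw = raw.replace(fence + "json\n", "").replace(fence, "")

try:
    parsed = json.loads(raw)   # on success the real code writes the .json output file
except json.JSONDecodeError:
    parsed = None              # on failure only the raw .txt dump is kept and the json path is "NA"
```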
Note: This function is getting long with some redundancy, it should be re-written at some point. It works, but is ugly. Parameters: Name Type Description Default domain DomainKey The domain the response is for. required response str The generated response to dump. required source_str str The formatted source string for the query. required elapsed_time float The query generation elapsed time. required Source code in bcorag/bcorag.py def _process_output(\n self, domain: DomainKey, response: str, source_str: str, elapsed_time: float\n):\n \"\"\"Attempts to serialize the response into a JSON object and dumps the output.\n Also dumps the raw text regardless if JSON serialization was successful. The\n file dumps are dumped to the `output` directory located in the root of this\n repo. Keeps a TSV file to track all of the domain outputs and what parameter\n set generated the results.\n\n Note: This function is getting long with some redundancy, it should be re-written\n at some point. It works, but is ugly.\n\n Parameters\n ----------\n domain : DomainKey\n The domain the response is for.\n response : str\n The generated response to dump.\n source_str : str\n The formatted source string for the query.\n elapsed_time : float\n The query generation elapsed time.\n \"\"\"\n\n def dump_json_response(fp: str, response_string: str) -> bool:\n if response_string.startswith(\"```json\\n\"):\n response_string = response_string.replace(\"```json\\n\", \"\").replace(\n \"```\", \"\"\n )\n self._display_info(\n response_string, f\"QUERY RESPONSE for the `{domain}` domain:\"\n )\n try:\n response_json = json.loads(response_string)\n if misc_fns.write_json(fp, response_json):\n self._logger.info(\n f\"Succesfully serialized JSON response for the `{domain}` domain.\"\n )\n return True\n except Exception as e:\n self._logger.error(\n f\"Failed to serialize the JSON response for the `{domain}` domain.\\n{e}\"\n )\n return False\n\n generated_dir = os.path.join(self._output_path_root, \"generated_domains\")\n misc_fns.check_dir(generated_dir)\n\n txt_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.txt\"\n )\n json_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.json\"\n )\n source_file_unindexed = os.path.join(\n self._output_path_root,\n \"reference_sources\",\n f\"{domain}-(index)-{self._parameter_set_hash}.txt\",\n )\n\n output_map_json = misc_fns.load_output_tracker(\n os.path.join(self._output_path_root, \"output_map.json\")\n )\n\n # Create a new output file if one doesn't exist\n if output_map_json is None:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter: OutputTrackerGitFilter | None = None\n file_ext_filter: OutputTrackerGitFilter | None = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n 
directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=self._git_data[\"user\"] if self._git_data is not None else None,\n git_repo=self._git_data[\"repo\"] if self._git_data is not None else None,\n git_branch=(\n self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n )\n\n output_data = default_output_tracker_file()\n output_data[domain].append(domain_entry)\n\n # update output map\n else:\n\n domain_map_entries = output_map_json[domain]\n\n for domain_map_entry in domain_map_entries:\n\n # found the collision entry\n if domain_map_entry[\"hash_str\"] == self._parameter_set_hash:\n\n new_index = domain_map_entry[\"entries\"][\"curr_index\"] + 1\n domain_map_entry[\"entries\"][\"curr_index\"] = new_index\n\n txt_file = txt_file_unindexed.replace(\"(index)\", str(new_index))\n json_file = json_file_unindexed.replace(\"(index)\", str(new_index))\n source_file = source_file_unindexed.replace(\n \"(index)\", str(new_index)\n )\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n new_index,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n domain_map_entry[\"entries\"][\"runs\"].append(run_entry)\n\n break\n\n # first time parameter set run (loop didn't break)\n else:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter = None\n file_ext_filter = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=(\n self._git_data[\"user\"] if self._git_data is not None else None\n ),\n git_repo=(\n self._git_data[\"repo\"] if self._git_data is not None else None\n ),\n git_branch=(\n 
self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n )\n\n domain_map_entries.append(domain_entry)\n\n output_data = output_map_json\n\n misc_fns.dump_string(txt_file, response)\n misc_fns.dump_string(source_file, source_str)\n # writes the output mapping files\n misc_fns.write_json(\n os.path.join(self._output_path_root, \"output_map.json\"), output_data\n )\n misc_fns.dump_output_file_map_tsv(\n os.path.join(self._output_path_root, \"output_map.tsv\"), output_data\n )\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._display_info","title":"_display_info(info, header=None) ","text":"If in debug mode, handles the debug info output to the log file. Parameters: Name Type Description Default info dict | list | str | UserSelections | None The object to log. required header str or None The optional header to log before the info. None Source code in bcorag/bcorag.py def _display_info(\n self,\n info: Optional[dict | list | str | UserSelections],\n header: Optional[str] = None,\n):\n \"\"\"If in debug mode, handles the debug info output to the log file.\n\n Parameters\n ----------\n info : dict | list | str | UserSelections | None\n The object to log.\n header : str or None\n The optional header to log before the info.\n \"\"\"\n log_str = header if header is not None else \"\"\n if isinstance(info, dict):\n for key, value in info.items():\n log_str += f\"\\n\\t{key}: '{value}'\"\n elif isinstance(info, str):\n log_str += f\"{info}\" if header is None else f\"\\n{info}\"\n self._logger.info(log_str)\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._user_selection_hash","title":"_user_selection_hash(params) ","text":"Generates an MD5 hash of the parameter set. Parameters: Name Type Description Default params UserSelections The user configuration selections. required Returns: Type Description str The hexidecimal MD5 hash. Source code in bcorag/bcorag.py def _user_selection_hash(self, params: UserSelections) -> str:\n \"\"\"Generates an MD5 hash of the parameter set.\n\n Parameters\n ----------\n params : UserSelections\n The user configuration selections.\n\n Returns\n -------\n str\n The hexidecimal MD5 hash.\n \"\"\"\n hash_list = []\n hash_list.append(params[\"llm\"])\n hash_list.append(params[\"embedding_model\"])\n hash_list.append(params[\"vector_store\"])\n hash_list.append(params[\"loader\"])\n hash_list.append(str(params[\"similarity_top_k\"]))\n hash_list.append(params[\"chunking_config\"])\n\n if params[\"git_data\"] is not None:\n\n hash_list.append(params[\"git_data\"][\"user\"])\n hash_list.append(params[\"git_data\"][\"repo\"])\n hash_list.append(params[\"git_data\"][\"branch\"])\n\n for filter in params[\"git_data\"][\"filters\"]:\n\n filter_type = (\n \"include\"\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else \"exclude\"\n )\n filter_str = f\"{filter_type}-{filter['value']}\"\n hash_list.append(filter_str)\n\n sorted(hash_list)\n hash_str = \"_\".join(hash_list)\n hash_hex = md5(hash_str.encode(\"utf-8\")).hexdigest()\n return hash_hex\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._check_dependencies","title":"_check_dependencies(domain) ","text":"Checks a domain's dependencies. 
Parameters: Name Type Description Default domain DomainKey The domain to check. required Returns: Type Description bool True if dependencies are satisfied, False otherwise. Source code in bcorag/bcorag.py def _check_dependencies(self, domain: DomainKey) -> bool:\n \"\"\"Checks a domain's dependencies.\n\n Parameters\n ----------\n domain : DomainKey\n The domain to check.\n\n Returns\n -------\n bool\n True if dependencies are satisfied, False otherwise.\n \"\"\"\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is None:\n print(\n f\"Error: {dependency.title()} domain must be generated before the {domain.title()} domain.\"\n )\n return False\n return True\n "},{"location":"bcorag/#bcorag.bcorag.supress_stdout","title":"supress_stdout() ","text":"Context manager that redirects stdout and stderr to devnull. Source code in bcorag/bcorag.py @contextmanager\ndef supress_stdout():\n \"\"\"Context manager that redirects stdout and stderr to devnull.\"\"\"\n with open(os.devnull, \"w\") as f, redirect_stdout(f):\n yield\n "},{"location":"error-frame/","title":"Error Frame","text":""},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame","title":"ErrorFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the error evaluation frame. Source code in evaluator/frontend/components/evaluation_frames/error_frame.py class ErrorFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the error evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_err_label = ctk.CTkLabel(\n master=self, text=\"Error Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_err_label.grid(\n row=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n )\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Inferred Knowledge Error\",\n variable=self.inf_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.inf_checkbox.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"External Knowledge Error\",\n variable=self.ext_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.ext_checkbox.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"JSON Formatting Error\",\n variable=self.json_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.json_checkbox.grid(\n row=2,\n column=1,\n 
padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Other Error\",\n variable=self.other_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.other_err_checkbox.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.error_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.error_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.error_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.error_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox.configure(variable=self.inf_err_var)\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox.configure(variable=self.ext_err_var)\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox.configure(variable=self.json_err_var)\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox.configure(variable=self.other_err_var)\n\n self.error_notes.delete(0.0, \"end\")\n self.error_notes.insert(\n 0.0, self.error_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> ErrorEval:\n \"\"\"Returns the error evaluations.\n\n Returns\n -------\n ErrorEval\n The error evaluation results.\n \"\"\"\n error_eval = create_error_val(\n inf_err=self.inf_err_var.get(),\n ext_err=self.ext_err_var.get(),\n json_err=self.json_err_var.get(),\n other_err=self.other_err_var.get(),\n notes=self.error_notes.get(0.0, \"end\"),\n )\n return error_eval\n "},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/evaluation_frames/error_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_err_label = ctk.CTkLabel(\n master=self, text=\"Error Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_err_label.grid(\n row=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n )\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Inferred Knowledge Error\",\n variable=self.inf_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.inf_checkbox.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"External Knowledge Error\",\n variable=self.ext_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.ext_checkbox.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"JSON Formatting Error\",\n variable=self.json_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.json_checkbox.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Other Error\",\n variable=self.other_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.other_err_checkbox.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.error_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.error_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.error_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.error_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. 
required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/error_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox.configure(variable=self.inf_err_var)\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox.configure(variable=self.ext_err_var)\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox.configure(variable=self.json_err_var)\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox.configure(variable=self.other_err_var)\n\n self.error_notes.delete(0.0, \"end\")\n self.error_notes.insert(\n 0.0, self.error_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame.get_results","title":"get_results() ","text":"Returns the error evaluations. Returns: Type Description ErrorEval The error evaluation results. Source code in evaluator/frontend/components/evaluation_frames/error_frame.py def get_results(self) -> ErrorEval:\n \"\"\"Returns the error evaluations.\n\n Returns\n -------\n ErrorEval\n The error evaluation results.\n \"\"\"\n error_eval = create_error_val(\n inf_err=self.inf_err_var.get(),\n ext_err=self.ext_err_var.get(),\n json_err=self.json_err_var.get(),\n other_err=self.other_err_var.get(),\n notes=self.error_notes.get(0.0, \"end\"),\n )\n return error_eval\n "},{"location":"evaluation-app/","title":"Evaluation Application","text":" - Starting the Application
- View Page
- Sidebar
- Tab View
- Compare JSON
- Source Nodes
- Parameter Set
- Evaluate
- Score Evaluation
- Error Evaluation
- Reference Evaluation
- General Evaluation
- Miscellaneous Evaluation
In order to accurately and consistently evaluate generated domains, such as those created with parameter searches, the BcoRag tool has an accompanying evaluation application that provides a more user-friendly GUI. "},{"location":"evaluation-app/#starting-the-application","title":"Starting the Application","text":"The evaluation application can be run from the main.py entrypoint using the evaluate positional argument like so: (env) python main.py evaluate\n On startup, you will be presented with the login screen. The login mechanism is a naive first and last name login that just keeps track of which domains you have already evaluated and what scores you submitted for each domain. If you are a new user, you'll be prompted to start from the beginning. If you are a returning user, you will be prompted whether you want to start from the beginning or resume from your last session. "},{"location":"evaluation-app/#view-page","title":"View Page","text":"The view page consists of the tab view and a sidebar. "},{"location":"evaluation-app/#sidebar","title":"Sidebar","text":"The top of the sidebar contains the navigation buttons. The Previous button will navigate back one generated domain and the Next button will navigate to the next generated domain. If you are at the first run, the Previous button will be greyed out, and similarly, when you are at the last available run, the Next button will be greyed out. If you have already submitted an evaluation for that particular run, a red notice label will appear below the run counter showing the message Already Evaluated . The Save button will save your evaluation results to disk. The Exit button will exit the application. At the bottom of the sidebar you can switch between Light and Dark mode. Underneath the appearance dropdown there is a scaling dropdown for UI scaling. "},{"location":"evaluation-app/#tab-view","title":"Tab View","text":""},{"location":"evaluation-app/#compare-json","title":"Compare JSON","text":"The compare JSON tab allows you to inspect the generated domain against a human curated domain for the same paper. If the JSON serialization failed after generating the domain, the raw text file will be displayed with a note at the top saying Failed JSON serialization. Raw text output: . "},{"location":"evaluation-app/#source-nodes","title":"Source Nodes","text":"The source nodes tab will display the nodes that were retrieved during the retrieval process and sent as context with the domain query. Depending on the similarity_top_k parameter chosen for the run, a corresponding number of reference nodes will be displayed, separated by delimiting lines in the format of: ----------------- Source Node x/n -----------------\n "},{"location":"evaluation-app/#parameter-set","title":"Parameter Set","text":"The parameter set tab will display the exact parameter set that was used to generate the target domain. "},{"location":"evaluation-app/#evaluate","title":"Evaluate","text":"The evaluate tab is where reviewers will input their evaluation ratings. All evaluation sections have corresponding Notes sections that allow for free-text notes regarding that evaluation category. All numeric segmented buttons have a range from 0 to 2 (with 0 being the worst, 1 being satisfactory, and 2 being the best score). The default score of -1 is just a placeholder value used to filter out evaluations that are unfinished or were prematurely submitted. The bottom right Submit button will save the evaluation to the session in memory.
If you click the Next or Previous buttons before submitting, the evaluation will be lost. "},{"location":"evaluation-app/#score-evaluation","title":"Score Evaluation","text":"The score evaluation frame contains evaluation options for the BCO domain score. The Score label displays the calculated BCO score returned from the BCO score API endpoint (on API error a default value of -1.0 is shown). The Score version label displays the score version according to the BCO score API endpoint (on API error a default value of 0.0 is shown). Depending on the quality of the generated domain, the evaluator should use the segmented button to mark whether the score should actually be higher, lower, or about right. "},{"location":"evaluation-app/#error-evaluation","title":"Error Evaluation","text":"The error evaluation frame allows the user to indicate any errors in the generated domain. The types of errors are: - Inferred Knowledge Errors: Fields that require inferred knowledge can result in undefined behavior. For example, multiple domains make use of the uri object that is defined in the top-level BCO JSON schema. The uri object has a field for access_time, which expects a fully JSON-compliant
date-time. An exact timestamp is very unlikely to be explicitly listed in the source material. In early testing, the tool seems to use a default value of 2023-11-01T12:00:00Z for these fields. - External Knowledge Error: External knowledge errors result in non-specific information when the field requires knowledge from external dependencies. A common scenario is for the authors of the paper to include links to the GitHub repository that contains the source code and corresponding input/output files. In the Description domain, each pipeline step includes a target field for output files generated by the particular step. Since the specific locations of the scripts and output files are usually not explicitly described in the paper, the tool will fill the output file field with a generated link to the repository, not the specific location of the file within the repository. For example, a link to the repository such as https://github.com/biocompute-objects/bco-rag/tree/main, versus a link to the specific file within the repository such as https://github.com/biocompute-objects/bco-rag/blob/main/docs/evaluation_app.md.
- JSON Formatting Error: JSON formatting errors occur when the generated domain either 1) is not valid JSON or 2) does not validate against the BCO JSON schema (a minimal validation sketch is shown after this list).
- Other Error: Other errors are any errors not covered by the previous three categories.
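The JSON formatting check can also be reproduced outside of the application. The following is a minimal, illustrative sketch (not part of the evaluation application itself) that performs both checks; the bco_schema.json path and the check_json_formatting helper are hypothetical, and the third-party jsonschema package is assumed to be available.
import json

import jsonschema


def check_json_formatting(raw_output: str, schema_path: str = "bco_schema.json") -> str | None:
    """Return a description of the first JSON formatting error found, or None if the output is clean."""
    # 1) Is the generated domain valid JSON at all?
    try:
        generated = json.loads(raw_output)
    except json.JSONDecodeError as exc:
        return f"Not valid JSON: {exc}"
    # 2) Does the generated domain validate against the BCO JSON schema
    #    (assumed to be saved locally at schema_path)?
    with open(schema_path, "r", encoding="utf-8") as handle:
        schema = json.load(handle)
    try:
        jsonschema.validate(instance=generated, schema=schema)
    except jsonschema.ValidationError as exc:
        return f"Schema validation failed: {exc.message}"
    return None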
"},{"location":"evaluation-app/#reference-evaluation","title":"Reference Evaluation","text":"The reference evaluation frame allows for the user to rate the reference nodes. The reference nodes are arguably the most important part of the domain generation process as they provide the LLM with the context required to complete the domain request. Ideally, all reference nodes should be relevant to the domain purpose and should logically make sense. For example, the Usability domain \"is a plain language description of what was done in the workflow\". Most often, much of that information would be present in the paper abstract, and not in the paper citations section. "},{"location":"evaluation-app/#general-evaluation","title":"General Evaluation","text":"The general evaluation frame allows for the user to rate the generated domain directly. Ideally, generated domains should be relevant to the domain purpose, human readable, and comprehensible by someone with less knowledge of the source material. "},{"location":"evaluation-app/#miscellaneous-evaluation","title":"Miscellaneous Evaluation","text":"The miscellaneous evaluation frame allows for the user to include some metadata for the evaluation not relating directly to the generated domain. In evaluation of scoring data, it should be noted how confident and familiar with the source material the evaluator was. In doing so, isolating high quality reviews will be clearer. The user can also evaluate the quality of the human curated domain. "},{"location":"evaluator-custom-types/","title":"Types","text":"Handles the custom types for the App backend. Type Aliases ScoreEvalLiteral = Literal[\"Lower\", \"About right\", \"Higher\"] RunStateKey = Literal[\"paper\", \"domain\", \"generated_domain\", \"score\", \"score_version\", \"generated_file_path\", \"human_curated_domain\", \"param_set\", \"reference_nodes\", \"run_index\", \"total_runs\", \"already_evaluated\", \"logger\", \"eval_data\"] AppStateKey = Literal[\"logger\", \"results_dir_path\", \"bco_results_file_name\", \"bco_results_data\", \"user_results_file_name\", \"user_results_data\", \"users_file_name\", \"users_data\", \"generated_directory_paths\", \"padding\", \"font\", \"user_hash\"] "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.ConfigData","title":"ConfigData ","text":" Bases: TypedDict Defines the schema for the JSON config data. Attributes: Name Type Description logger_path str The path to the logger. logger_name str The name of the logger. generated_output_dir_path str The filepath to the generated domains directory to evaluate. glob_pattern str The glob patterns to traverse the generated output directory. results_dir_path str The path to the directory to dump the evaluation results. ignore_files list[str] Identifiers to ignore certain files (used like if ignore_files[x] in filename ). bco_results_file_name str The file name for the BCO results file. user_results_file_name str The file name for the user evaluations results file. users_file_name str The file name for the users file. padding int The default root padding used throughout the frontend components. font str The default font used throughout the frontend components. 
Source code in evaluator/backend/custom_types.py class ConfigData(TypedDict):\n \"\"\"Defines the schema for the JSON config data.\n\n Attributes\n ----------\n logger_path : str\n The path to the logger.\n logger_name : str\n The name of the logger.\n generated_output_dir_path : str\n The filepath to the generated domains directory to evaluate.\n glob_pattern : str\n The glob patterns to traverse the generated output directory.\n results_dir_path : str\n The path to the directory to dump the evaluation results.\n ignore_files : list[str]\n Identifiers to ignore certain files (used like `if ignore_files[x] in filename`).\n bco_results_file_name : str\n The file name for the BCO results file.\n user_results_file_name : str\n The file name for the user evaluations results file.\n users_file_name : str\n The file name for the users file.\n padding : int\n The default root padding used throughout the frontend components.\n font : str\n The default font used throughout the frontend components.\n \"\"\"\n\n logger_path: str\n logger_name: str\n generated_output_dir_path: str\n glob_pattern: str\n results_dir_path: str\n ignore_files: list[str]\n bco_results_file_name: str\n user_results_file_name: str\n users_file_name: str\n padding: int\n font: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.ScoreEval","title":"ScoreEval ","text":" Bases: TypedDict Score evaluation results. Attributes: Name Type Description eval ScoreEvalLiteral The score eval literal. eval_code int The casted score eval literal. notes str Any additional notes from the evaluator regarding the score evaluation. Source code in evaluator/backend/custom_types.py class ScoreEval(TypedDict):\n \"\"\"Score evaluation results.\n\n Attributes\n ----------\n eval : ScoreEvalLiteral\n The score eval literal.\n eval_code : int\n The casted score eval literal.\n notes : str\n Any additional notes from the evaluator regarding the score evaluation.\n \"\"\"\n\n eval: ScoreEvalLiteral\n eval_code: int\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.ErrorEval","title":"ErrorEval ","text":" Bases: TypedDict Error evaluation data. Attributes: Name Type Description inferred_knowledge_error bool Whether there was an inferred knowledge error. external_knowledge_error bool Whether there was an external knowledge error. json_format_error bool Whether there was a JSON formatting error. other_error bool Whether there was any other error. notes str Any additional notes from the evaluator regarding the error evaluation. Source code in evaluator/backend/custom_types.py class ErrorEval(TypedDict):\n \"\"\"Error evaluation data.\n\n Attributes\n ----------\n inferred_knowledge_error: bool\n Whether there was an inferred knowledge error.\n external_knowledge_error: bool\n Whether there was an external knowledge error.\n json_format_error: bool\n Whether there was a JSON formatting error.\n other_error: bool\n Whether there was any other error.\n notes: str\n Any additional notes from the evaluator regarding the error evaluation.\n \"\"\"\n\n inferred_knowledge_error: bool\n external_knowledge_error: bool\n json_format_error: bool\n other_error: bool\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.RefereceEval","title":"RefereceEval ","text":" Bases: TypedDict Reference evaluation data. Attributes: Name Type Description reference_relevancy int Indicates how relevant the reference nodes were to the domain. 
top_reference_retrieval bool Whether the top node retrieved was the most relevant. notes str Any additional notes from the evaluator regarding the reference evaluation. Source code in evaluator/backend/custom_types.py class RefereceEval(TypedDict):\n \"\"\"Reference evaluation data.\n\n Attributes\n ----------\n reference_relevancy : int\n Indicates how relevant the reference nodes were to the domain.\n top_reference_retrieval : bool\n Whether the top node retrieved was the most relevant.\n notes : str\n Any additional notes from the evaluator regarding the reference evaluation.\n \"\"\"\n\n reference_relevancy: int\n top_reference_retrieval: bool\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.GeneralEval","title":"GeneralEval ","text":" Bases: TypedDict General evaluation data. Attributes: Name Type Description relevancy int Indicates how relevant the generated domain was. readability int Indicates how readable the generated domain was. reproducibility int Indicates how reproduceable the domain steps are. confidence_rating int Indicates how confident the evaluator was in their evaluation. notes str Any additional notes from the evaluator regarding the general evaluation. Source code in evaluator/backend/custom_types.py class GeneralEval(TypedDict):\n \"\"\"General evaluation data.\n\n Attributes\n ----------\n relevancy : int\n Indicates how relevant the generated domain was.\n readability : int\n Indicates how readable the generated domain was.\n reproducibility : int\n Indicates how reproduceable the domain steps are.\n confidence_rating : int\n Indicates how confident the evaluator was in their evaluation.\n notes : str\n Any additional notes from the evaluator regarding the general evaluation.\n \"\"\"\n\n relevancy: int\n readability: int\n reproducibility: int\n confidence_rating: int\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.MiscEval","title":"MiscEval ","text":" Bases: TypedDict Miscellaneous evaluation data. Attributes: Name Type Description human_domain_rating int The high level human domain rating for the generated domain. evaluator_confidence_rating int Indicates how confident the evaluator is in their evaluation. evaluator_familiarity_level int Indicates how familiar the evaluator is with the paper content. notes str Any additional notes from the evaluator regarding the miscellaneous evaluation. Source code in evaluator/backend/custom_types.py class MiscEval(TypedDict):\n \"\"\"Miscellaneous evaluation data.\n\n Attributes\n ----------\n human_domain_rating : int\n The high level human domain rating for the generated domain.\n evaluator_confidence_rating : int\n Indicates how confident the evaluator is in their evaluation.\n evaluator_familiarity_level: int\n Indicates how familiar the evaluator is with the paper content.\n notes : str\n Any additional notes from the evaluator regarding the miscellaneous evaluation.\n \"\"\"\n\n human_domain_rating: int\n evaluator_confidence_rating: int\n evaluator_familiarity_level: int\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.EvalData","title":"EvalData ","text":" Bases: TypedDict Full evaluation data. 
Attributes: Name Type Description score_eval ScoreEval error_eval ErrorEval reference_eval RefereceEval general_eval GeneralEval misc_eval MiscEval Source code in evaluator/backend/custom_types.py class EvalData(TypedDict):\n \"\"\"Full evaluation data.\n\n Attributes\n ----------\n score_eval: ScoreEval\n error_eval: ErrorEval\n reference_eval: RefereceEval\n general_eval: GeneralEval\n misc_eval: MiscEval\n \"\"\"\n\n score_eval: ScoreEval\n error_eval: ErrorEval\n reference_eval: RefereceEval\n general_eval: GeneralEval\n misc_eval: MiscEval\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.RunState","title":"RunState ","text":" Bases: TypedDict Holds the data for the current run being evaluated. Attributes: Name Type Description paper str The paper for the current run state. domain str The domain the current run is for. generated_domain str The generated domain string for the current run. score float The score for the current run (from the BCO score API). score_version float The score version for the score (from the BCO score API). generated_file_path str The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file). human_curated_domain str The human curated domain string. param_set str The parameter set string for the run. reference_nodes str The retrieved reference node values. run_index int The run index. total_runs int The total number of runs to potentially evaluate. already_evaluated bool Whether the user has already evaluated this run. logger Logger The logger for the App. eval_data EvalData The evaluation data for the run. Source code in evaluator/backend/custom_types.py class RunState(TypedDict):\n \"\"\"Holds the data for the current run being evaluated.\n\n Attributes\n ----------\n paper: str\n The paper for the current run state.\n domain: str\n The domain the current run is for.\n generated_domain: str\n The generated domain string for the current run.\n score: float\n The score for the current run (from the BCO score API).\n score_version: float\n The score version for the score (from the BCO score API).\n generated_file_path: str\n The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file).\n human_curated_domain: str\n The human curated domain string.\n param_set: str\n The parameter set string for the run.\n reference_nodes: str\n The retrieved reference node values.\n run_index: int\n The run index.\n total_runs: int\n The total number of runs to potentially evaluate.\n already_evaluated: bool\n Whether the user has already evaluated this run.\n logger: Logger\n The logger for the App.\n eval_data: EvalData\n The evaluation data for the run.\n \"\"\"\n\n paper: str\n domain: str\n generated_domain: str\n score: float\n score_version: float\n generated_file_path: str\n human_curated_domain: str\n param_set: str\n reference_nodes: str\n run_index: int\n total_runs: int\n already_evaluated: bool\n logger: Logger\n eval_data: EvalData\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.AppAttributes","title":"AppAttributes ","text":" Bases: TypedDict Handles the app initialization attributes. Attributes: Name Type Description logger Logger The App logger. results_dir_path str The path to the directory to dump the evaluation results. bco_results_file_name str The file name for the BCO results file. bco_results_data dict The aggregates BCO results data. user_results_file_name str The file name for the user evaluations results file. 
user_results_data dict[str, dict[str, EvalData | None] | None] The user evaluation results. users_file_name str The file name for the users file. users_data dict The users data. generated_output_dir_root str The root filepath to the generated domains directory to evaluate. generated_directory_paths list[str] List of directory paths for all the papers. padding int The default root padding to use for all the frontend components. font str The default font to use for all the frontend components. Source code in evaluator/backend/custom_types.py class AppAttributes(TypedDict):\n \"\"\"Handles the app initialization attributes.\n\n Attributes\n ----------\n logger : Logger\n The App logger.\n results_dir_path : str\n The path to the directory to dump the evaluation results.\n bco_results_file_name : str\n The file name for the BCO results file.\n bco_results_data: dict\n The aggregates BCO results data.\n user_results_file_name: str\n The file name for the user evaluations results file.\n user_results_data: dict[str, dict[str, EvalData | None] | None]\n The user evaluation results.\n users_file_name: str\n The file name for the users file.\n users_data: dict\n The users data.\n generated_output_dir_root: str\n The root filepath to the generated domains directory to evaluate.\n generated_directory_paths: list[str]\n List of directory paths for all the papers.\n padding: int\n The default root padding to use for all the frontend components.\n font: str\n The default font to use for all the frontend components. \n \"\"\"\n\n logger: Logger\n results_dir_path: str\n bco_results_file_name: str\n bco_results_data: dict\n user_results_file_name: str\n user_results_data: dict[str, dict[str, EvalData | None] | None]\n users_file_name: str\n users_data: dict\n generated_output_dir_root: str\n generated_directory_paths: list[str]\n padding: int\n font: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.AppState","title":"AppState ","text":" Bases: AppAttributes Holds the application state information, essentially just the attributes plus the current user hash, new user flag and start from last session boolean. Attributes: Name Type Description user_hash str The user hash. new_user bool New user flag. resume_session bool Resume session flag. Source code in evaluator/backend/custom_types.py class AppState(AppAttributes):\n \"\"\"Holds the application state information, essentially\n just the attributes plus the current user hash, new user\n flag and start from last session boolean.\n\n Attributes\n ----------\n user_hash: str\n The user hash.\n new_user: bool\n New user flag.\n resume_session: bool\n Resume session flag.\n \"\"\"\n\n user_hash: str\n new_user: bool\n resume_session: bool\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.cast_checkbox","title":"cast_checkbox(val) ","text":"Cast checkbox string to boolean (assuming checkbox values are on , off ). Parameters: Name Type Description Default val str The value to cast. required Returns: Type Description bool The casted checkbox value. 
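Based on the source below, a quick illustration of the expected round trip (any value other than on / off raises a ValueError):
# Round trip between checkbox strings and booleans (reverse_cast_checkbox is
# documented further down on this page).
assert cast_checkbox("on") is True
assert cast_checkbox(" OFF ") is False  # input is stripped and lowercased first
assert reverse_cast_checkbox(cast_checkbox("on")) == "on"
# cast_checkbox("maybe") raises ValueError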
Source code in evaluator/backend/custom_types.py def cast_checkbox(val: str) -> bool:\n \"\"\"Cast checkbox string to boolean (assuming checkbox values are `on`, `off`).\n\n Parameters\n ----------\n val : str\n The value to cast.\n\n Returns\n -------\n bool\n The casted checkbox value.\n \"\"\"\n val = val.strip().lower()\n if val == \"on\":\n return True\n elif val == \"off\":\n return False\n raise ValueError(f\"Error casting `{val}` to bool.\")\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.reverse_cast_checkbox","title":"reverse_cast_checkbox(err) ","text":"Reverse cast checkbox bool to string (assuming checkbox values are on , off ). Parameters: Name Type Description Default err bool The value to revserse cast. required Returns: Type Description str The reverse casted value. Source code in evaluator/backend/custom_types.py def reverse_cast_checkbox(err: bool) -> str:\n \"\"\"Reverse cast checkbox bool to string (assuming checkbox values are `on`, `off`).\n\n Parameters\n ----------\n err : bool\n The value to revserse cast.\n\n Returns\n -------\n str\n The reverse casted value.\n \"\"\"\n if err:\n return \"on\"\n else:\n return \"off\"\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_score_eval","title":"create_score_eval(eval, notes) ","text":"Constructor for the ScoreEval TypedDict. The score eval literal will be automatically casted to the eval code. Parameters: Name Type Description Default eval ScoreEvalLiteral The score eval literal. required notes str Any additional notes from the evaluator regarding the score evaluation. required Returns: Type Description ScoreEval Source code in evaluator/backend/custom_types.py def create_score_eval(eval: ScoreEvalLiteral, notes: str) -> ScoreEval:\n \"\"\"Constructor for the ScoreEval TypedDict. The score eval literal\n will be automatically casted to the eval code.\n\n Parameters\n ----------\n eval : ScoreEvalLiteral\n The score eval literal.\n notes : str\n Any additional notes from the evaluator regarding the score evaluation.\n\n Returns\n -------\n ScoreEval\n \"\"\"\n eval_str = str(eval.strip().lower())\n\n eval_code = 0\n match eval_str:\n case \"lower\":\n eval_code = -1\n case \"higher\":\n eval_code = 1\n\n return_data: ScoreEval = {\n \"eval\": eval,\n \"eval_code\": eval_code,\n \"notes\": notes.strip(),\n }\n\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.cast_score_eval","title":"cast_score_eval(score_eval_str) ","text":"Cast a string to ScoreEvalLiteral (if possible). Parameters: Name Type Description Default score_eval_str str The string to cast. required Returns: Type Description ScoreEvalLiteral Source code in evaluator/backend/custom_types.py def cast_score_eval(score_eval_str: str) -> ScoreEvalLiteral:\n \"\"\"Cast a string to ScoreEvalLiteral (if possible).\n\n Parameters\n ----------\n score_eval_str : str\n The string to cast.\n\n Returns\n -------\n ScoreEvalLiteral\n \"\"\"\n score_eval_str = score_eval_str.strip().lower()\n match score_eval_str:\n case \"lower\":\n return \"Lower\"\n case \"about right\":\n return \"About right\"\n case \"higher\":\n return \"Higher\"\n raise ValueError(f\"Error casting `{score_eval_str}` to ScoreEvalLiteral.\")\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_error_val","title":"create_error_val(inf_err, ext_err, json_err, other_err, notes) ","text":"Constructor for the ErrorEval TypedDict. 
Parameters: Name Type Description Default inf_err bool | str The inferred knowledge error indicator. required ext_err bool | str The external knowledge error indicator. required json_err bool | str The JSON formattign error indicator. required notes str Any additional notes from the evaluator regarding the error evaluation. required Source code in evaluator/backend/custom_types.py def create_error_val(\n inf_err: bool | str,\n ext_err: bool | str,\n json_err: bool | str,\n other_err: bool | str,\n notes: str,\n) -> ErrorEval:\n \"\"\"Constructor for the ErrorEval TypedDict.\n\n Parameters\n ----------\n inf_err : bool | str\n The inferred knowledge error indicator.\n ext_err : bool | str\n The external knowledge error indicator.\n json_err : bool | str\n The JSON formattign error indicator.\n notes : str\n Any additional notes from the evaluator regarding the error evaluation.\n \"\"\"\n if isinstance(inf_err, str):\n inf_err = cast_checkbox(inf_err)\n if isinstance(ext_err, str):\n ext_err = cast_checkbox(ext_err)\n if isinstance(json_err, str):\n json_err = cast_checkbox(json_err)\n if isinstance(other_err, str):\n other_err = cast_checkbox(other_err)\n\n return_data: ErrorEval = {\n \"inferred_knowledge_error\": inf_err,\n \"external_knowledge_error\": ext_err,\n \"json_format_error\": json_err,\n \"other_error\": other_err,\n \"notes\": notes.strip(),\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_reference_eval","title":"create_reference_eval(reference_relevancy, top_reference_retrieval, notes) ","text":"Constructor for the RefereceEval TypedDict. Parameters: Name Type Description Default reference_relevancy int Indicates how relevant the reference nodes were to the domain. required top_reference_retrieval bool Whether the top node retrieved was the most relevant. required notes str Any additional notes from the evaluator regarding the reference evaluation. required Returns: Type Description ReferenceEval Source code in evaluator/backend/custom_types.py def create_reference_eval(\n reference_relevancy: int, top_reference_retrieval: bool | str, notes: str\n) -> RefereceEval:\n \"\"\"Constructor for the RefereceEval TypedDict.\n\n Parameters\n ----------\n reference_relevancy : int\n Indicates how relevant the reference nodes were to the domain.\n top_reference_retrieval : bool\n Whether the top node retrieved was the most relevant.\n notes : str\n Any additional notes from the evaluator regarding the reference evaluation.\n\n Returns\n -------\n ReferenceEval\n \"\"\"\n if isinstance(top_reference_retrieval, str):\n top_reference_retrieval = cast_checkbox(top_reference_retrieval)\n\n return_data: RefereceEval = {\n \"reference_relevancy\": reference_relevancy,\n \"top_reference_retrieval\": top_reference_retrieval,\n \"notes\": notes.strip(),\n }\n\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_general_eval","title":"create_general_eval(relevancy, readability, reproducibility, confidence_rating, notes) ","text":"Constructor for the GeneralEval TypedDict. Parameters: Name Type Description Default relevancy int Indicates how relevant the generated domain was. required readability int Indicates how readable the generated domain was. required reproducibility int Indicates how reproduceable the domain steps are. required confidence_rating int Indicates how confident the evaluator is in the generated domain. 
required notes str Any additional notes from the evaluator regarding the general evaluation. required Returns: Type Description GeneralEval Source code in evaluator/backend/custom_types.py def create_general_eval(\n relevancy: int,\n readability: int,\n reproducibility: int,\n confidence_rating: int,\n notes: str,\n) -> GeneralEval:\n \"\"\"Constructor for the GeneralEval TypedDict.\n\n Parameters\n ----------\n relevancy : int\n Indicates how relevant the generated domain was.\n readability : int\n Indicates how readable the generated domain was.\n reproducibility : int\n Indicates how reproduceable the domain steps are.\n confidence_rating : int\n Indicates how confident the evaluator is in the generated domain.\n notes : str\n Any additional notes from the evaluator regarding the general evaluation.\n\n Returns\n -------\n GeneralEval\n \"\"\"\n return_data: GeneralEval = {\n \"relevancy\": relevancy,\n \"readability\": readability,\n \"reproducibility\": reproducibility,\n \"confidence_rating\": confidence_rating,\n \"notes\": notes.strip(),\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_misc_eval","title":"create_misc_eval(human_domain_rating, evaluator_confidence_rating, evaluator_familiarity_level, notes) ","text":"Constructor for the MiscEval TypedDict. Parameters: Name Type Description Default human_domain_rating int The high level human domain rating for the generated domain. required evaluator_confidence_rating int Indicates how confident the evaluator is in their evaluation. required evaluator_familiarity_level int Indicates how familiar the evaluator is with the paper content. required notes str Any additional notes from the evaluator regarding the miscellaneous evaluation. required Returns: Type Description MiscEval Source code in evaluator/backend/custom_types.py def create_misc_eval(\n human_domain_rating: int,\n evaluator_confidence_rating: int,\n evaluator_familiarity_level: int,\n notes: str,\n) -> MiscEval:\n \"\"\"Constructor for the MiscEval TypedDict.\n\n Parameters\n ----------\n human_domain_rating : int\n The high level human domain rating for the generated domain.\n evaluator_confidence_rating : int\n Indicates how confident the evaluator is in their evaluation.\n evaluator_familiarity_level: int\n Indicates how familiar the evaluator is with the paper content.\n notes : str\n Any additional notes from the evaluator regarding the miscellaneous evaluation.\n\n Returns\n -------\n MiscEval\n \"\"\"\n return_data: MiscEval = {\n \"human_domain_rating\": human_domain_rating,\n \"evaluator_confidence_rating\": evaluator_confidence_rating,\n \"evaluator_familiarity_level\": evaluator_familiarity_level,\n \"notes\": notes.strip(),\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_full_eval","title":"create_full_eval(score_eval, error_eval, reference_eval, general_eval, misc_eval) ","text":"Constructor for the EvalData TypedDict. 
Parameters: Name Type Description Default score_eval ScoreEval required error_eval ErrorEval required reference_eval RefereceEval required general_eval GeneralEval required misc_eval MiscEval required Source code in evaluator/backend/custom_types.py def create_full_eval(\n score_eval: ScoreEval,\n error_eval: ErrorEval,\n reference_eval: RefereceEval,\n general_eval: GeneralEval,\n misc_eval: MiscEval,\n) -> EvalData:\n \"\"\"Constructor for the EvalData TypedDict.\n\n Parameters\n ----------\n score_eval: ScoreEval\n error_eval: ErrorEval\n reference_eval: RefereceEval\n general_eval: GeneralEval\n misc_eval: MiscEval\n \"\"\"\n return_data: EvalData = {\n \"score_eval\": score_eval,\n \"error_eval\": error_eval,\n \"reference_eval\": reference_eval,\n \"general_eval\": general_eval,\n \"misc_eval\": misc_eval,\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.load_score_defaults","title":"load_score_defaults(filepath='./evaluator/backend/score_defaults.json') ","text":"Loads the score defaults JSON file. Parameters: Name Type Description Default filepath str The filepath to the score defaults JSON file. './evaluator/backend/score_defaults.json' Returns: Type Description EvalData | None The evaluation data with the default values or None on error. Source code in evaluator/backend/custom_types.py def load_score_defaults(\n filepath: str = \"./evaluator/backend/score_defaults.json\",\n) -> Optional[EvalData]:\n \"\"\"Loads the score defaults JSON file.\n\n Parameters\n ----------\n filepath : str, optional\n The filepath to the score defaults JSON file.\n\n Returns\n -------\n EvalData | None\n The evaluation data with the default values or None on error.\n \"\"\"\n naive_load_data = misc_fns.load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n eval_defaults = cast(EvalData, naive_load_data)\n return eval_defaults\n return None\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.default_eval","title":"default_eval() ","text":"Get a default EvalData. Returns: Type Description EvalData Source code in evaluator/backend/custom_types.py def default_eval() -> EvalData:\n \"\"\"Get a default EvalData.\n\n Returns\n -------\n EvalData\n \"\"\"\n eval_defaults = load_score_defaults()\n if eval_defaults is None:\n misc_fns.graceful_exit(1, \"Error loading score defaults.\")\n return eval_defaults\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.check_default_eval","title":"check_default_eval(val) ","text":"Checks if the EvalData is still the default. This helps to prevent saving erroneous save data. Parameters: Name Type Description Default val dict | EvalData The evaluation data to check. required Returns: Type Description bool True if still the default, False if different. Source code in evaluator/backend/custom_types.py def check_default_eval(val: dict | EvalData) -> bool:\n \"\"\"Checks if the EvalData is still the default. 
This\n helps to prevent saving erroneous save data.\n\n Parameters\n ----------\n val : dict | EvalData\n The evaluation data to check.\n\n Returns\n -------\n bool\n True if still the default, False if different.\n \"\"\"\n default_eval_dict = default_eval()\n diff = DeepDiff(\n default_eval_dict,\n val,\n ignore_order=True,\n ignore_string_case=True,\n ignore_nan_inequality=True,\n )\n if diff == {}:\n return True\n else:\n return False\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_run_state","title":"create_run_state(paper, domain, generated_domain, generated_file_path, human_curated_domain, param_set, reference_nodes, run_index, total_runs, already_evaluated, logger, eval_data) ","text":"Constructor for the RunState TypedDict. Parameters: Name Type Description Default paper str The paper for the current run state. required domain str The domain the current run is for. required generated_domain str | dict The generated domain for the current run. required generated_file_path str The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file). required human_curated_domain str The human curated domain string. required param_set str The parameter set string for the run. required reference_nodes str The retrieved reference node values. required run_index int The run index. required total_runs int The total number of runs to potentially evaluate. required already_evaluated bool Whether the user has already evaluated this run. required logger Logger The logger for the App. required eval_data EvalData The evaluation data for the run. required Source code in evaluator/backend/custom_types.py def create_run_state(\n paper: str,\n domain: str,\n generated_domain: str | dict,\n generated_file_path: str,\n human_curated_domain: str,\n param_set: str,\n reference_nodes: str,\n run_index: int,\n total_runs: int,\n already_evaluated: bool,\n logger: Logger,\n eval_data: EvalData,\n) -> RunState:\n \"\"\"Constructor for the RunState TypedDict.\n\n Parameters\n ----------\n paper: str\n The paper for the current run state.\n domain: str\n The domain the current run is for.\n generated_domain: str | dict\n The generated domain for the current run.\n generated_file_path: str\n The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file).\n human_curated_domain: str\n The human curated domain string.\n param_set: str\n The parameter set string for the run.\n reference_nodes: str\n The retrieved reference node values.\n run_index: int\n The run index.\n total_runs: int\n The total number of runs to potentially evaluate.\n already_evaluated: bool\n Whether the user has already evaluated this run.\n logger: Logger\n The logger for the App.\n eval_data: EvalData\n The evaluation data for the run.\n \"\"\"\n score = -1.0\n score_version = 0.0\n if isinstance(generated_domain, dict):\n # TODO : whenever the BCO score API endpoint is\n # created hit that here.\n generated_domain = json.dumps(generated_domain, indent=4)\n\n return_data: RunState = {\n \"paper\": paper,\n \"domain\": domain,\n \"generated_domain\": generated_domain,\n \"score\": score,\n \"score_version\": score_version,\n \"generated_file_path\": generated_file_path,\n \"human_curated_domain\": human_curated_domain,\n \"param_set\": param_set,\n \"reference_nodes\": reference_nodes,\n \"run_index\": run_index,\n \"total_runs\": total_runs,\n \"already_evaluated\": already_evaluated,\n \"logger\": logger,\n 
\"eval_data\": eval_data,\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_app_attributes","title":"create_app_attributes(logger, results_dir_path, bco_results_file_name, bco_results_data, user_results_file_name, user_results_data, users_file_name, users_data, generated_output_dir_root, generated_directory_paths, padding, font) ","text":"Constructor for the AppAttributes TypedDict. Parameters: Name Type Description Default logger Logger The App logger. required results_dir_path str The path to the directory to dump the evaluation results. required bco_results_file_name str The file name for the BCO results file. required bco_results_data dict The aggregates BCO results data. required user_results_file_name str The file name for the user evaluations results file. required user_results_data dict[str, dict[str, EvalData | None] | None] The user evaluation results. required users_file_name str The file name for the users file. required users_data dict The users data. required generated_output_dir_root str The root filepath to the generated domains directory to evaluate. required generated_directory_paths list[str] List of directory paths for all the papers. required padding int The default root padding to use for all the frontend components. required font str The default font to use for all the frontend components. required Source code in evaluator/backend/custom_types.py def create_app_attributes(\n logger: Logger,\n results_dir_path: str,\n bco_results_file_name: str,\n bco_results_data: dict,\n user_results_file_name: str,\n user_results_data: dict[str, dict[str, EvalData | None] | None],\n users_file_name: str,\n users_data: dict,\n generated_output_dir_root: str,\n generated_directory_paths: list[str],\n padding: int,\n font: str,\n) -> AppAttributes:\n \"\"\"Constructor for the AppAttributes TypedDict.\n\n Parameters\n ----------\n logger : Logger\n The App logger.\n results_dir_path : str\n The path to the directory to dump the evaluation results.\n bco_results_file_name : str\n The file name for the BCO results file.\n bco_results_data: dict\n The aggregates BCO results data.\n user_results_file_name: str\n The file name for the user evaluations results file.\n user_results_data: dict[str, dict[str, EvalData | None] | None]\n The user evaluation results.\n users_file_name: str\n The file name for the users file.\n users_data: dict\n The users data.\n generated_output_dir_root: str\n The root filepath to the generated domains directory to evaluate.\n generated_directory_paths: list[str]\n List of directory paths for all the papers.\n padding: int\n The default root padding to use for all the frontend components.\n font: str\n The default font to use for all the frontend components. 
\n \"\"\"\n return_data: AppAttributes = {\n \"logger\": logger,\n \"results_dir_path\": results_dir_path,\n \"bco_results_file_name\": bco_results_file_name,\n \"bco_results_data\": bco_results_data,\n \"user_results_file_name\": user_results_file_name,\n \"user_results_data\": user_results_data,\n \"users_file_name\": users_file_name,\n \"users_data\": users_data,\n \"generated_output_dir_root\": generated_output_dir_root,\n \"generated_directory_paths\": generated_directory_paths,\n \"padding\": padding,\n \"font\": font,\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_app_state","title":"create_app_state(attributes, user_hash, new_user, resume_session=False) ","text":"Constructor for the AppState TypedDict. Parameters: Name Type Description Default attributes AppAttributes The app attributes to base the state off of. required user_hash str The user hash. required new_user bool New user flag. required resume_session bool Resume session flag. False Returns: Type Description AppState Source code in evaluator/backend/custom_types.py def create_app_state(\n attributes: AppAttributes,\n user_hash: str,\n new_user: bool,\n resume_session: bool = False,\n) -> AppState:\n \"\"\"Constructor for the AppState TypedDict.\n\n Parameters\n ----------\n attributes : AppAttributes\n The app attributes to base the state off of.\n user_hash: str\n The user hash.\n new_user: bool\n New user flag.\n resume_session: bool, optional\n Resume session flag.\n\n Returns\n -------\n AppState\n \"\"\"\n return_data: AppState = {\n \"logger\": attributes[\"logger\"],\n \"results_dir_path\": attributes[\"results_dir_path\"],\n \"bco_results_file_name\": attributes[\"bco_results_file_name\"],\n \"bco_results_data\": attributes[\"bco_results_data\"],\n \"user_results_file_name\": attributes[\"user_results_file_name\"],\n \"user_results_data\": attributes[\"user_results_data\"],\n \"users_file_name\": attributes[\"users_file_name\"],\n \"users_data\": attributes[\"users_data\"],\n \"generated_output_dir_root\": attributes[\"generated_output_dir_root\"],\n \"generated_directory_paths\": attributes[\"generated_directory_paths\"],\n \"padding\": attributes[\"padding\"],\n \"font\": attributes[\"font\"],\n \"user_hash\": user_hash,\n \"new_user\": new_user,\n \"resume_session\": resume_session,\n }\n return return_data\n "},{"location":"general-frame/","title":"General Frame","text":""},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame","title":"GeneralFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the general evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/general_frame.py class GeneralFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the general evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_gen_label = ctk.CTkLabel(\n master=self,\n text=\"General Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_gen_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.relevancy_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.relevancy_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.relevancy_var\n )\n self.relevancy_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.readability_label = ctk.CTkLabel(\n master=self,\n text=\"How readable is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.readability_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.readability_var\n )\n self.readability_button.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_label = ctk.CTkLabel(\n master=self,\n text=\"How reproducible is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.reproducibility_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.reproducibility_var\n )\n self.reproducibility_button.grid(\n row=4,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence rating for the domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.conf_label.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", 
EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.conf_var\n )\n self.conf_button.grid(\n row=5,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.general_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.general_notes_label.grid(\n row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.general_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.general_notes.grid(\n row=7,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button.configure(variable=self.relevancy_var)\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button.configure(variable=self.readability_var)\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button.configure(variable=self.reproducibility_var)\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button.configure(variable=self.conf_var)\n\n self.general_notes.delete(0.0, \"end\")\n self.general_notes.insert(\n 0.0, self.general_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> GeneralEval:\n \"\"\"Returns the general evaluations.\n\n Returns\n -------\n GeneralEval\n The general evaluation results.\n \"\"\"\n relevancy_val = self.relevancy_var.get()\n readability_var = self.readability_var.get()\n reproducibility_var = self.reproducibility_var.get()\n conf_var = self.conf_var.get()\n general_val = create_general_eval(\n relevancy=relevancy_val,\n readability=readability_var,\n reproducibility=reproducibility_var,\n confidence_rating=conf_var,\n notes=self.general_notes.get(0.0, \"end\"),\n )\n return general_val\n "},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/evaluation_frames/general_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_gen_label = ctk.CTkLabel(\n master=self,\n text=\"General Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_gen_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.relevancy_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.relevancy_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.relevancy_var\n )\n self.relevancy_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.readability_label = ctk.CTkLabel(\n master=self,\n text=\"How readable is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.readability_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.readability_var\n )\n self.readability_button.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_label = ctk.CTkLabel(\n master=self,\n text=\"How reproducible is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.reproducibility_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.reproducibility_var\n )\n self.reproducibility_button.grid(\n row=4,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence rating for the domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.conf_label.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 
2], variable=self.conf_var\n )\n self.conf_button.grid(\n row=5,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.general_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.general_notes_label.grid(\n row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.general_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.general_notes.grid(\n row=7,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/general_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button.configure(variable=self.relevancy_var)\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button.configure(variable=self.readability_var)\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button.configure(variable=self.reproducibility_var)\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button.configure(variable=self.conf_var)\n\n self.general_notes.delete(0.0, \"end\")\n self.general_notes.insert(\n 0.0, self.general_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame.get_results","title":"get_results() ","text":"Returns the general evaluations. Returns: Type Description GeneralEval The general evaluation results. Source code in evaluator/frontend/components/evaluation_frames/general_frame.py def get_results(self) -> GeneralEval:\n \"\"\"Returns the general evaluations.\n\n Returns\n -------\n GeneralEval\n The general evaluation results.\n \"\"\"\n relevancy_val = self.relevancy_var.get()\n readability_var = self.readability_var.get()\n reproducibility_var = self.reproducibility_var.get()\n conf_var = self.conf_var.get()\n general_val = create_general_eval(\n relevancy=relevancy_val,\n readability=readability_var,\n reproducibility=reproducibility_var,\n confidence_rating=conf_var,\n notes=self.general_notes.get(0.0, \"end\"),\n )\n return general_val\n "},{"location":"grid-search/","title":"Grid Search","text":"Grid search class implementation. 
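Before the class reference below, a minimal illustration of the underlying idea may help (toy parameter values only, not the project's API): grid search enumerates the full cartesian product of the parameter space, whereas the random search counterpart draws a fixed-size random subset of those same combinations.
from itertools import product
import random

# Toy parameter space; the names mirror the project's parameters but the values are illustrative.
llms = ["gpt-4-turbo", "gpt-4o-mini"]
chunking_configs = ["1024 chunk size/20 chunk overlap", "2048 chunk size/50 chunk overlap"]
similarity_top_k = [3, 5]

# Grid search: every combination of the parameter space.
grid = list(product(llms, chunking_configs, similarity_top_k))
print(len(grid))  # 2 * 2 * 2 = 8 parameter sets

# Random search: a random subset of the same combinations.
subset = random.sample(grid, k=min(4, len(grid)))
print(len(subset))  # 4 parameter sets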
"},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch","title":"BcoGridSearch ","text":" Bases: BcoParameterSearch BCO grid search class. Subclass of BcoParameterSearch . Source code in parameter_search/grid_search.py class BcoGridSearch(BcoParameterSearch):\n \"\"\"BCO grid search class. Subclass of `BcoParameterSearch`.\n \"\"\"\n\n def __init__(self, search_space: SearchSpace):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n \"\"\"\n super().__init__(search_space)\n\n def _setup_logger(self, path: str = \"./logs\", name: str = \"grid-search\") -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n\n def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a cartesian product of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n Every comination of the parameter search space.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n )\n param_sets.append(user_selections)\n\n return param_sets\n "},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch.__init__","title":"__init__(search_space) ","text":"Constructor. Parameters: Name Type Description Default search_space SearchSpace The parameter search space. required Source code in parameter_search/grid_search.py def __init__(self, search_space: SearchSpace):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n \"\"\"\n super().__init__(search_space)\n "},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch._setup_logger","title":"_setup_logger(path='./logs', name='grid-search') ","text":"Sets up the logger. Parameters: Name Type Description Default path str File path for the logger. './logs' name str Name for the logger output. 
'grid-search' Returns: Type Description Logger The grid search logger. Source code in parameter_search/grid_search.py def _setup_logger(self, path: str = \"./logs\", name: str = \"grid-search\") -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n "},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch._create_param_sets","title":"_create_param_sets() ","text":"Creates a cartesian product of the parameter space. Returns: Type Description list[UserSelections] Every comination of the parameter search space. Source code in parameter_search/grid_search.py def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a cartesian product of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n Every comination of the parameter search space.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n )\n param_sets.append(user_selections)\n\n return param_sets\n "},{"location":"in-progress/","title":"In-Progress Report Documentation","text":"The BCO standard describes comprehensive documentation on the complete specifications of a bioinformatics workflow. Unfortunately, this makes it difficult to create BCOs while work is still in progress. If a full paper describing the complete workflow for the project has not yet been completed, the in-progress mode can be used to create in progress documentation using the Aggregator tool. The Aggregator tool leverages the OpenAI gpt-4o-mini model to generate a plain text summary that follows a similar structure to the domains of a BioCompute Object (BCO). The in progress documentation aggregator can be run from the main.py entrypoint using the in-progress positional argument and the --path option. 
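For example, assuming the project to be documented lives under ./my_project/ (an illustrative path, not one shipped with this repository), the aggregator could be invoked from the activated virtual environment as: (env) python main.py in-progress --path ./my_project/ Additional comma-delimited include/exclude glob filters can be supplied with the options listed below.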
The available options for the in-progress mode are as follows: --path : The path to the directory to process (required). --include : Comma-delimited list of glob patterns to include (optional). --exclude : Comma-delimited list of glob patterns to exclude (optional). --exclude-from-tree : Whether to exclude non-included files in the source tree (optional, store-true flag). --include-priority : Whether to prioritize include or exclude patterns in the case of a conflict (optional, store-false flag). Here's an example output from the Aggregator tool when run on this project: "},{"location":"in-progress/#biocompute-object-documentation-for-the-bco-rag-project","title":"BioCompute Object Documentation for the BCO-RAG Project","text":""},{"location":"in-progress/#usability-domain","title":"Usability Domain","text":"The BCO-RAG project aims to provide an automated assistant for generating BioCompute Objects (BCOs) from existing biological research publications. This tool allows researchers to easily convert their publications into a standardized format, thus enhancing reproducibility and transparency in biological data analysis workflows. The primary use case is to reduce the overhead of retroactively documenting existing workflows used in research, making it easier for users to adhere to BCO standards while leveraging advanced language models for generation. "},{"location":"in-progress/#io-domain","title":"IO Domain","text":""},{"location":"in-progress/#input-files","title":"Input Files:","text":" - High resolution measurement PDF file located in
bco-rag/test_papers/High resolution measurement.pdf . "},{"location":"in-progress/#output-files","title":"Output Files:","text":" - Output directory structure will be created under
output/high_resolution_measurement/ containing: generated_domains/ subdirectory with generated domain files. - JSON and TXT files for each domain generated (e.g.,
usability-1-{hash}.json , io-1-{hash}.txt ). reference_sources/ subdirectory for tracking source references. output_map.json and output_map.tsv files that track generated domains and parameter sets. "},{"location":"in-progress/#description-domain","title":"Description Domain","text":""},{"location":"in-progress/#keywords","title":"Keywords:","text":" - BCO-RAG, BioCompute Object, automation, reproducibility, biological data analysis, retrieval-augmented generation, documentation standardization.
"},{"location":"in-progress/#workflow-steps","title":"Workflow Steps:","text":" - Load the PDF: Use PDF or directory reader to ingest the publication.
- Generate Domain: Execute
perform_query for each BCO domain including usability, IO, description, execution, parametric, and error domains. - Store Outputs: Save generated outputs to the specified output directory.
- Log Data: Keep track of input/output files and their relationships in
output_map.json . "},{"location":"in-progress/#execution-domain","title":"Execution Domain","text":"The BCO-RAG requires the following setup for execution: - Dependencies: Users must have Python 3.10 or higher installed. - Required Packages: Install dependencies specified in requirements.txt using pip install -r requirements.txt . - Environment Configuration: - Set the OpenAI API key in a .env file. - Set the Github personal access token if using Github options. "},{"location":"in-progress/#run-instructions","title":"Run Instructions:","text":""},{"location":"in-progress/#parametric-domain","title":"Parametric Domain","text":"The following parameters affect the computational workflow: - loader (str): Data loader used (e.g., PDFReader ). - chunking_config (str): Configuration for chunking strategy (e.g., 1024 chunk size/20 chunk overlap ). - embedding_model (str): Model used for embeddings (e.g., text-embedding-3-large ). - vector_store (str): Name of the vector store used (e.g., VectorStoreIndex ). - similarity_top_k (int): Number of top entries to retrieve during similarity search. - llm (str): Language model choice (e.g., gpt-4-turbo ). - git_data (Optional[GitData]): Includes repository info if GitHub is used. "},{"location":"in-progress/#examples-of-parameters","title":"Examples of Parameters:","text":" - Sample settings might include: LLM as
gpt-4 , embedding model as text-embedding-3-large , similarity_top_k as 3 , etc. "},{"location":"in-progress/#error-domain","title":"Error Domain","text":"The project tracks potential errors in the generated domains: - Inferred Knowledge Errors: Errors related to information that require inference based on external conditions not stated in the source material. - External Knowledge Errors: Errors arising from insufficient context provided for the domain's connections to external references. - JSON Formatting Errors: Issues arising if the generated output is not valid JSON. - Miscellaneous Errors: Any other discrepancies consistently tracked for documentation purposes. "},{"location":"in-progress/#evaluation","title":"Evaluation:","text":"For each output generated, the tool logs potential errors and evaluations of the output quality, ensuring that all relevant data is captured in the final documentation. "},{"location":"in-progress/#overall-functionality","title":"Overall Functionality:","text":"The BCO-RAG project automates the generation of a structured and standardized representation of computational research workflows, significantly aiding in data sharing and reproducibility within the biological research community. "},{"location":"installation/","title":"Installation and Setup","text":" - Prerequisites
- Virtual Environment
- Create Log Directory
- OpenAI API Key
"},{"location":"installation/#prerequisites","title":"Prerequisites","text":"This directory requires at least Python 3.10 to setup. The code in this directory makes extensive use of an alternate way to indicate union type annotations as X | Y instead of Union[X, Y] from the Typing library. "},{"location":"installation/#clone-the-repository","title":"Clone the repository","text":"First, clone the repository to your machine: git clone git@github.com:biocompute-objects/bco-rag.git\n This example uses the ssh method, replace with HTTPS URL as needed. "},{"location":"installation/#virtual-environment","title":"Virtual Environment","text":"Create a virtual environment from with the bco-rag/ root directory: virtualenv env\n To activate the virtual environment on Windows: env/Scripts/activate\n To activate the virtual environment on MacOS/Linux: source env/bin/activate \n Then install the project dependencies: (env) pip install -r requirements.txt\n "},{"location":"installation/#openai-api-key","title":"OpenAI API Key","text":"Create your .env file and add your OpenAI API key and Github personal access token (if using Github option). For example: OPENAI_API_KEY=<KEY>\nGITHUB_TOKEN=<TOKEN>\n References: - OpenAI API Key - Github Personal Access Token "},{"location":"installation/#create-log-directory","title":"Create log Directory","text":"Within the root of the project, create the log directory: mkdir logs/\n "},{"location":"installation/#basic-usage","title":"Basic Usage","text":"The base one-shot approach can be run like so: (env) python main.py\n or (env) python main.py one-shot\n "},{"location":"intermediate-screen/","title":"Intermediate Screen","text":""},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen","title":"IntermediateScreen ","text":" Bases: CTkFrame Class for the intermediate screen for the user to choose to start from the beginning or to continue from last session. 
Source code in evaluator/frontend/components/intermediate_screen.py class IntermediateScreen(ctk.CTkFrame):\n \"\"\"Class for the intermediate screen for the user to choose\n to start from the beginning or to continue from last session.\n \"\"\"\n\n def __init__(\n self,\n master: ctk.CTk,\n on_start: Callable[[AppState], None],\n app_state: AppState,\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.on_start = on_start\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"] + 10,\n pady=self.state[\"padding\"] + 10,\n )\n\n self.welcome_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.state[\"font\"], 32, \"bold\")\n )\n self.welcome_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n self.start_new_button = ctk.CTkButton(\n master=self,\n text=\"Start From Beginning\",\n command=self._start_new,\n font=(self.state[\"font\"], 16),\n )\n self.start_new_button.grid(\n row=1, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n if self.state[\"new_user\"]:\n welcome_text = \"New User\"\n else:\n welcome_text = \"Welcome Back\"\n self.continue_button = ctk.CTkButton(\n master=self,\n text=\"Continue Last Session\",\n command=self._continue_last,\n font=(self.state[\"font\"], 16),\n )\n self.continue_button.grid(\n row=2,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] + 10,\n )\n self.welcome_label.configure(text=welcome_text)\n\n def _start_new(self) -> None:\n \"\"\"User chose to start a new session.\"\"\"\n self.state = set_resume_session(self.state, False)\n self.on_start(self.state)\n\n def _continue_last(self) -> None:\n \"\"\"User chose to continue from last session.\"\"\"\n self.state = set_resume_session(self.state, True)\n self.on_start(self.state)\n "},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen.__init__","title":"__init__(master, on_start, app_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/intermediate_screen.py def __init__(\n self,\n master: ctk.CTk,\n on_start: Callable[[AppState], None],\n app_state: AppState,\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.on_start = on_start\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"] + 10,\n pady=self.state[\"padding\"] + 10,\n )\n\n self.welcome_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.state[\"font\"], 32, \"bold\")\n )\n self.welcome_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n self.start_new_button = ctk.CTkButton(\n master=self,\n text=\"Start From Beginning\",\n command=self._start_new,\n font=(self.state[\"font\"], 16),\n )\n self.start_new_button.grid(\n row=1, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n if self.state[\"new_user\"]:\n welcome_text = \"New User\"\n else:\n welcome_text = \"Welcome Back\"\n self.continue_button = ctk.CTkButton(\n master=self,\n text=\"Continue Last Session\",\n command=self._continue_last,\n font=(self.state[\"font\"], 16),\n )\n self.continue_button.grid(\n row=2,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] + 10,\n )\n self.welcome_label.configure(text=welcome_text)\n "},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen._start_new","title":"_start_new() ","text":"User chose to start a new session. Source code in evaluator/frontend/components/intermediate_screen.py def _start_new(self) -> None:\n \"\"\"User chose to start a new session.\"\"\"\n self.state = set_resume_session(self.state, False)\n self.on_start(self.state)\n "},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen._continue_last","title":"_continue_last() ","text":"User chose to continue from last session. Source code in evaluator/frontend/components/intermediate_screen.py def _continue_last(self) -> None:\n \"\"\"User chose to continue from last session.\"\"\"\n self.state = set_resume_session(self.state, True)\n self.on_start(self.state)\n "},{"location":"login-screen/","title":"Login Screen","text":""},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen","title":"LoginScreen ","text":" Bases: CTkFrame Class for the login screen. 
Source code in evaluator/frontend/components/login_screen.py class LoginScreen(ctk.CTkFrame):\n \"\"\"Class for the login screen.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTk,\n on_login: Callable[[str, str, AppAttributes], tuple[str, Optional[AppState]]],\n on_login_success: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n attributes: AppAttributes,\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.on_login = on_login\n self.on_login_success = on_login_success\n self.on_exit = on_exit\n self.attributes = attributes\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.attributes[\"padding\"] + 10,\n pady=self.attributes[\"padding\"] + 10,\n )\n\n self.login_label = ctk.CTkLabel(\n master=self, text=\"Login\", font=(self.attributes[\"font\"], 32, \"bold\")\n )\n self.login_label.grid(\n row=0,\n column=0,\n columnspan=2,\n pady=(self.attributes[\"padding\"], self.attributes[\"padding\"] + 10),\n )\n\n self.first_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"First name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.first_name_entry.grid(\n row=1,\n column=0,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.last_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"Last name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.last_name_entry.grid(\n row=1,\n column=1,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.login_button = ctk.CTkButton(\n master=self,\n text=\"Login\",\n command=self._login,\n font=(self.attributes[\"font\"], 16),\n )\n self.login_button.grid(\n row=2, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.exit_button = ctk.CTkButton(\n master=self,\n text=\"Exit\",\n command=self._exit_app,\n font=(self.attributes[\"font\"], 16),\n )\n self.exit_button.grid(\n row=3, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.error_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.attributes[\"font\"], 14), text_color=\"red\"\n )\n self.error_label.grid(\n row=4, column=0, columnspan=2, pady=(self.attributes[\"padding\"], 0)\n )\n\n def _login(self) -> None:\n \"\"\"Intermediate callback for the login button.\"\"\"\n return_str, state = self.on_login(\n self.first_name_entry.get(), self.last_name_entry.get(), self.attributes\n )\n if state is None:\n self.error_label.configure(text=return_str)\n return\n self.on_login_success(state)\n\n def _exit_app(self) -> NoReturn:\n \"\"\"Intermediate callback for the exit button.\"\"\"\n self.on_exit()\n "},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen.__init__","title":"__init__(master, on_login, on_login_success, on_exit, attributes, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/login_screen.py def __init__(\n self,\n master: ctk.CTk,\n on_login: Callable[[str, str, AppAttributes], tuple[str, Optional[AppState]]],\n on_login_success: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n attributes: AppAttributes,\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.on_login = on_login\n self.on_login_success = on_login_success\n self.on_exit = on_exit\n self.attributes = attributes\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.attributes[\"padding\"] + 10,\n pady=self.attributes[\"padding\"] + 10,\n )\n\n self.login_label = ctk.CTkLabel(\n master=self, text=\"Login\", font=(self.attributes[\"font\"], 32, \"bold\")\n )\n self.login_label.grid(\n row=0,\n column=0,\n columnspan=2,\n pady=(self.attributes[\"padding\"], self.attributes[\"padding\"] + 10),\n )\n\n self.first_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"First name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.first_name_entry.grid(\n row=1,\n column=0,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.last_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"Last name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.last_name_entry.grid(\n row=1,\n column=1,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.login_button = ctk.CTkButton(\n master=self,\n text=\"Login\",\n command=self._login,\n font=(self.attributes[\"font\"], 16),\n )\n self.login_button.grid(\n row=2, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.exit_button = ctk.CTkButton(\n master=self,\n text=\"Exit\",\n command=self._exit_app,\n font=(self.attributes[\"font\"], 16),\n )\n self.exit_button.grid(\n row=3, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.error_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.attributes[\"font\"], 14), text_color=\"red\"\n )\n self.error_label.grid(\n row=4, column=0, columnspan=2, pady=(self.attributes[\"padding\"], 0)\n )\n "},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen._login","title":"_login() ","text":"Intermediate callback for the login button. Source code in evaluator/frontend/components/login_screen.py def _login(self) -> None:\n \"\"\"Intermediate callback for the login button.\"\"\"\n return_str, state = self.on_login(\n self.first_name_entry.get(), self.last_name_entry.get(), self.attributes\n )\n if state is None:\n self.error_label.configure(text=return_str)\n return\n self.on_login_success(state)\n "},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen._exit_app","title":"_exit_app() ","text":"Intermediate callback for the exit button. Source code in evaluator/frontend/components/login_screen.py def _exit_app(self) -> NoReturn:\n \"\"\"Intermediate callback for the exit button.\"\"\"\n self.on_exit()\n "},{"location":"login/","title":"Login","text":"Handles the backend for the login process. "},{"location":"login/#evaluator.backend.login.login","title":"login(first_name, last_name, attributes) ","text":"Login entry point. Parameters: Name Type Description Default first_name str First name entered by the user. required last_name str Last name entered by the user. required attributes AppAttributes The current app attributes. 
required Returns: Type Description (str, AppState | None) A string containing the user hash on success or an error message on errror and the current app state on success or None on error. Source code in evaluator/backend/login.py def login(\n first_name: str, last_name: str, attributes: AppAttributes\n) -> tuple[str, Optional[AppState]]:\n \"\"\"Login entry point.\n\n Parameters\n ----------\n first_name : str\n First name entered by the user.\n last_name : str\n Last name entered by the user.\n attributes : AppAttributes\n The current app attributes.\n\n Returns\n -------\n (str, AppState | None)\n A string containing the user hash on success or an\n error message on errror and the current app state\n on success or None on error.\n \"\"\"\n if not first_name and not last_name:\n return \"Error: First and last name are required.\", None\n elif not first_name:\n return \"Error: First name is required.\", None\n elif not last_name:\n return \"Error: Last name is required.\", None\n\n first_name = first_name.strip().lower()\n last_name = last_name.strip().lower()\n\n user_hash = _generate_user_hash(first_name, last_name)\n\n if _check_user_existence(user_hash, attributes):\n attributes[\"logger\"].info(f\"Found existing user for {last_name}, {first_name}\")\n new_user = False\n else:\n new_user = True\n\n app_state = create_app_state(\n attributes=attributes, user_hash=user_hash, new_user=new_user\n )\n if new_user:\n app_state = create_new_user(\n app_state=app_state, first_name=first_name, last_name=last_name\n )\n\n log_state(app_state, \"app\")\n\n return user_hash, app_state\n "},{"location":"login/#evaluator.backend.login._check_user_existence","title":"_check_user_existence(user_hash, attributes) ","text":"Checks if the user already exists or not. user_hash : str The user's md5 hash. attributes : AppAttributes The current app state. Returns: Type Description bool True if the user exists, False otherwise. Source code in evaluator/backend/login.py def _check_user_existence(user_hash: str, attributes: AppAttributes) -> bool:\n \"\"\"Checks if the user already exists or not.\n\n user_hash : str\n The user's md5 hash.\n attributes : AppAttributes\n The current app state.\n\n Returns\n -------\n bool\n True if the user exists, False otherwise.\n \"\"\"\n if user_hash in attributes[\"users_data\"]:\n return True\n else:\n return False\n "},{"location":"login/#evaluator.backend.login._generate_user_hash","title":"_generate_user_hash(first_name, last_name) ","text":"Generates the user's MD5 hash. Parameters: Name Type Description Default first_name str The user's first name. required last_name str The user's last name. required Returns: Type Description str The user hash. Source code in evaluator/backend/login.py def _generate_user_hash(first_name: str, last_name: str) -> str:\n \"\"\"Generates the user's MD5 hash.\n\n Parameters\n ----------\n first_name : str\n The user's first name.\n last_name : str\n The user's last name.\n\n Returns\n -------\n str\n The user hash.\n \"\"\"\n name_list = [first_name, last_name]\n sorted(name_list)\n name_str = \"_\".join(name_list)\n hash_hex = md5(name_str.encode(\"utf-8\")).hexdigest()\n return hash_hex\n "},{"location":"misc_functions/","title":"Utils","text":"Miscellaneous util functions. "},{"location":"misc_functions/#bcorag.misc_functions.graceful_exit","title":"graceful_exit(exit_code=0, error_msg=None) ","text":"Gracefully exits the program with an exit code. Parameters: Name Type Description Default exit_code int The exit code. 
0 error_msg str | None The error message to print before exiting. None Source code in bcorag/misc_functions.py def graceful_exit(exit_code: int = 0, error_msg: Optional[str] = None) -> NoReturn:\n \"\"\"Gracefully exits the program with an exit code.\n\n Parameters\n ----------\n exit_code : int, optional\n The exit code.\n error_msg : str | None, optional\n The error message to print before exiting.\n \"\"\"\n if exit_code != 0:\n if error_msg is not None:\n print(f\"{error_msg}\")\n print(f\"exit code: {exit_code}\")\n print(\"Exiting...\")\n logging.info(f\"Exiting with status code {exit_code}.\")\n logging.info(\n \"---------------------------------- RUN END ----------------------------------\"\n )\n sys.exit(exit_code)\n "},{"location":"misc_functions/#bcorag.misc_functions.load_json","title":"load_json(filepath) ","text":"Loads a JSON file and returns the deserialized data (or an empty dict if the file doesn't exist). Parameters: Name Type Description Default filepath str File path to the JSON file to load. required Returns: Type Description dict | None The deserialized JSON data or None if the file doesn't exist. Source code in bcorag/misc_functions.py def load_json(filepath: str) -> Optional[dict]:\n \"\"\"Loads a JSON file and returns the deserialized data (or\n an empty dict if the file doesn't exist).\n\n Parameters\n ----------\n filepath : str\n File path to the JSON file to load.\n\n Returns\n -------\n dict | None\n The deserialized JSON data or None if the file doesn't exist.\n \"\"\"\n if not os.path.isfile(filepath):\n return None\n with open(filepath, \"r\") as f:\n data = json.load(f)\n return data\n "},{"location":"misc_functions/#bcorag.misc_functions.load_config_data","title":"load_config_data(filepath='./conf.json') ","text":"Loads the config JSON object file. Parameters: Name Type Description Default filepath str File path to the config JSON file. './conf.json' Returns: Type Description ConfigObject | None Casted ConfigObject or None on some type of error. Source code in bcorag/misc_functions.py def load_config_data(filepath: str = \"./conf.json\") -> Optional[ConfigObject]:\n \"\"\"Loads the config JSON object file.\n\n Parameters\n ----------\n filepath : str, optional\n File path to the config JSON file.\n\n Returns\n -------\n ConfigObject | None\n Casted ConfigObject or None on some type of error.\n \"\"\"\n naive_load_data = load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n config_object = cast(ConfigObject, naive_load_data)\n return config_object\n return None\n "},{"location":"misc_functions/#bcorag.misc_functions.load_output_tracker","title":"load_output_tracker(filepath) ","text":"Loads the JSON output tracker file. Parameters: Name Type Description Default filepath str File path to the JSON file to load. required Returns: Type Description OutputTrackerFile | None Casted OutputTrackerFile or None on some type of error. 
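A short usage sketch; the import path and tracker file location are assumptions based on the module layout and output structure described elsewhere in these docs:
from bcorag.misc_functions import graceful_exit, load_output_tracker

# Load the run tracker for a previously processed paper (path is illustrative).
tracker = load_output_tracker("./output/high_resolution_measurement/output_map.json")
if tracker is None:
    # The file does not exist or could not be read as an OutputTrackerFile.
    graceful_exit(1, "Unable to load the output tracker file.")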
Source code in bcorag/misc_functions.py def load_output_tracker(filepath: str) -> Optional[OutputTrackerFile]:\n \"\"\"Loads the JSON output tracker file.\n\n Parameters\n ----------\n filepath : str\n File path to the JSON file to load.\n\n Returns\n -------\n OutputTrackerFile | None\n Casted OutputTrackerFile or None on some type of error.\n \"\"\"\n naive_load_data = load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n output_tracker_data = cast(OutputTrackerFile, naive_load_data)\n return output_tracker_data\n return None\n "},{"location":"misc_functions/#bcorag.misc_functions.write_json","title":"write_json(output_path, data) ","text":"Writes JSON out to the output path. Will create the file if it doesn't exist. Parameters: Name Type Description Default output_path str The output file path. required data dict | list | OutputTrackerFile The data to dump. required Returns: Type Description bool Whether the process was successful. Source code in bcorag/misc_functions.py def write_json(output_path: str, data: dict | list | OutputTrackerFile) -> bool:\n \"\"\"Writes JSON out to the output path. Will create the file if it doesn't exist.\n\n Parameters\n ----------\n output_path : str\n The output file path.\n data : dict | list | OutputTrackerFile\n The data to dump.\n\n Returns\n -------\n bool\n Whether the process was successful.\n \"\"\"\n try:\n with open(output_path, \"w\") as f:\n json.dump(data, f, indent=4)\n return True\n except Exception as e:\n logging.error(f\"Failed to dump JSON to output path '{output_path}'.\\n{e}\")\n return False\n "},{"location":"misc_functions/#bcorag.misc_functions.dump_output_file_map_tsv","title":"dump_output_file_map_tsv(output_path, data) ","text":"Dumps the OutputTrackerFile object into a TSV table for better human readability. Parameters: Name Type Description Default output_path str The output file path. required data OutputTrackerFile The OutputTrackerFile object to format for a TSV file. 
required Source code in bcorag/misc_functions.py def dump_output_file_map_tsv(output_path: str, data: OutputTrackerFile):\n \"\"\"Dumps the OutputTrackerFile object into a TSV table for better\n human readability.\n\n Parameters\n ----------\n output_path : str\n The output file path.\n data: OutputTrackerFile\n The OutputTrackerFile object to format for a TSV file.\n \"\"\"\n with open(output_path, mode=\"w\", newline=\"\") as out_file:\n tsv_writer = csv.writer(out_file, delimiter=\"\\t\")\n tsv_writer.writerow(\n [\n \"timestamp\",\n \"domain\",\n \"txt_file\",\n \"json_file\",\n \"node_source_file\",\n \"hash_string\",\n \"index\",\n \"loader\",\n \"vector_store\",\n \"llm\",\n \"embedding_model\",\n \"similarity_top_k\",\n \"chunking_config\",\n \"git_user\",\n \"git_repo\",\n \"git_branch\",\n \"directory_filter\",\n \"file_ext_filter\",\n \"elapsed_time\",\n \"version\",\n ]\n )\n domain: DomainKey\n for domain in get_args(DomainKey):\n domain_entry_list = data[domain]\n for entry_set in domain_entry_list:\n for entry in entry_set[\"entries\"][\"runs\"]:\n row = [\n entry[\"timestamp\"],\n domain,\n os.path.basename(entry[\"txt_file\"]),\n os.path.basename(entry[\"json_file\"]),\n os.path.basename(entry[\"source_node_file\"]),\n entry_set[\"hash_str\"],\n entry[\"index\"],\n entry_set[\"entries\"][\"params\"][\"loader\"],\n entry_set[\"entries\"][\"params\"][\"vector_store\"],\n entry_set[\"entries\"][\"params\"][\"llm\"],\n entry_set[\"entries\"][\"params\"][\"embedding_model\"],\n entry_set[\"entries\"][\"params\"][\"similarity_top_k\"],\n entry_set[\"entries\"][\"params\"][\"chunking_config\"],\n entry_set[\"entries\"][\"params\"][\"git_user\"],\n entry_set[\"entries\"][\"params\"][\"git_repo\"],\n entry_set[\"entries\"][\"params\"][\"git_branch\"],\n entry_set[\"entries\"][\"params\"][\"directory_git_filter\"],\n entry_set[\"entries\"][\"params\"][\"file_ext_git_filter\"],\n entry[\"elapsed_time\"],\n entry[\"version\"],\n ]\n tsv_writer.writerow(row)\n "},{"location":"misc_functions/#bcorag.misc_functions.dump_string","title":"dump_string(output_path, data) ","text":"Dumps a string to a text file. Parameters: Name Type Description Default output_path str The output file path. required data str The string to dump. required Source code in bcorag/misc_functions.py def dump_string(output_path: str, data: str):\n \"\"\"Dumps a string to a text file.\n\n Parameters\n ----------\n output_path : str\n The output file path.\n data: str\n The string to dump.\n \"\"\"\n check_dir(os.path.split(output_path)[0])\n with open(output_path, \"w\") as f:\n f.write(data)\n "},{"location":"misc_functions/#bcorag.misc_functions.check_dir","title":"check_dir(path) ","text":"Checks whether a directory creates and if it doesn't, create it. Note, this really only works for checking/creating the last level direcotry. Will fail if there are issues in the parent level directories in the path. Parameters: Name Type Description Default path str Directory filepath to check. required Source code in bcorag/misc_functions.py def check_dir(path: str):\n \"\"\"Checks whether a directory creates and if it doesn't, create it. Note, this\n really only works for checking/creating the last level direcotry. 
Will fail if\n there are issues in the parent level directories in the path.\n\n Parameters\n ----------\n path : str\n Directory filepath to check.\n \"\"\"\n if not os.path.isdir(path):\n os.mkdir(path)\n "},{"location":"misc_functions/#bcorag.misc_functions.setup_root_logger","title":"setup_root_logger(log_path, name='bcorag') ","text":"Configures the root logger. Parameters: Name Type Description Default log_path str The filepath to the log handler. required name str The name of the root logger. 'bcorag' Returns: Type Description Logger The root logger. Source code in bcorag/misc_functions.py def setup_root_logger(log_path: str, name: str = \"bcorag\") -> logging.Logger:\n \"\"\"Configures the root logger.\n\n Parameters\n ----------\n log_path : str\n The filepath to the log handler.\n name : str, optional\n The name of the root logger.\n\n Returns\n -------\n logging.Logger\n The root logger.\n \"\"\"\n logger = logging.getLogger(name)\n logger.setLevel(logging.DEBUG)\n handler = logging.FileHandler(filename=log_path, encoding=\"utf-8\", mode=\"w\")\n formatter = logging.Formatter(\n \"%(asctime)s - %(levelname)s - %(name)s - %(message)s\"\n )\n handler.setFormatter(formatter)\n logger.addHandler(handler)\n return logger\n "},{"location":"misc_functions/#bcorag.misc_functions.setup_document_logger","title":"setup_document_logger(name, parent_logger='bcorag') ","text":"Configures a document specific logger. Parameters: Name Type Description Default name str The name of the document to setup the logger for. required parent_logger str Name of the parent logger to setup under. 'bcorag' Returns: Type Description Logger The document logger. Source code in bcorag/misc_functions.py def setup_document_logger(name: str, parent_logger: str = \"bcorag\") -> logging.Logger:\n \"\"\"Configures a document specific logger.\n\n Parameters\n ----------\n name : str\n The name of the document to setup the logger for.\n parent_logger : str, optional\n Name of the parent logger to setup under.\n\n Returns\n -------\n logging.Logger\n The document logger.\n \"\"\"\n document_logger_name = f\"{parent_logger}.{name}\"\n return logging.getLogger(document_logger_name)\n "},{"location":"misc_functions/#bcorag.misc_functions.create_timestamp","title":"create_timestamp() ","text":"Creates a current timestamp. Returns: Type Description str The current timestamp as a string. Source code in bcorag/misc_functions.py def create_timestamp() -> str:\n \"\"\"Creates a current timestamp.\n\n Returns\n -------\n str\n The current timestamp as a string.\n \"\"\"\n timestamp = datetime.datetime.now(pytz.timezone(TIMEZONE)).strftime(\n TIMESTAMP_FORMAT\n )\n return timestamp\n "},{"location":"misc_functions/#bcorag.misc_functions.extract_repo_data","title":"extract_repo_data(url) ","text":"Extracts the repository information from the repo URL. Parameters: Name Type Description Default url str The Github repository URL. required Returns: Type Description (str, str) | None Returns the tuple containing the extracted github user and repo or None on failure to parse the URL. 
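A quick usage sketch (the import path is an assumption based on the module layout; the URL is this project's own repository, used purely as an example):
from bcorag.misc_functions import extract_repo_data

repo_info = extract_repo_data("https://github.com/biocompute-objects/bco-rag")
if repo_info is not None:
    user, repo = repo_info  # ("biocompute-objects", "bco-rag")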
Source code in bcorag/misc_functions.py def extract_repo_data(url: str) -> Optional[tuple[str, str]]:\n \"\"\"Extracts the repository information from the repo URL.\n\n Parameters\n ----------\n url : str\n The Github repository URL.\n\n Returns\n -------\n (str, str) | None\n Returns the tuple containing the extracted github user\n and repo or None on failure to parse the URL.\n \"\"\"\n url = url.strip().lower()\n pattern = r\"https://github\\.com/([^/]+)/([^/]+)\"\n match = re.match(pattern, url)\n if match is None:\n return None\n user = str(match.groups()[0])\n repo = str(match.groups()[1])\n return user, repo\n "},{"location":"misc_functions/#bcorag.misc_functions.get_file_list","title":"get_file_list(path, filetype='pdf') ","text":"Gets the files from a glob pattern. Parameters: Name Type Description Default path str The file path to the target directory. required filetype str The file type to capture. 'pdf' Returns: Type Description list[str] List of the file paths found from the glob pattern. Source code in bcorag/misc_functions.py def get_file_list(path: str, filetype: str = \"pdf\") -> list[str]:\n \"\"\"Gets the files from a glob pattern.\n\n Parameters\n ----------\n path : str\n The file path to the target directory.\n filetype : str, optional\n The file type to capture.\n\n Returns\n -------\n list[str]\n List of the file paths found from the glob pattern.\n \"\"\"\n target_files = glob.glob(os.path.join(path, f\"*.{filetype}\"))\n return target_files\n "},{"location":"miscellaneous-frame/","title":"Miscellaneous Frame","text":""},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame","title":"MiscFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the miscellaneous evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py class MiscFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the miscellaneous evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=20)\n self.grid_rowconfigure(8, weight=1)\n\n self.main_misc_label = ctk.CTkLabel(\n master=self,\n text=\"Miscellaneous Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_misc_label.grid(\n row=0,\n column=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"n\",\n )\n\n self.human_domain_rating_label = ctk.CTkLabel(\n master=self,\n text=\"What would you rate the human curated domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.human_domain_rating_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.human_domain_rating_var\n )\n self.human_domain_rating_button.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence in your evaluation?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_conf_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_conf_button.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_fam_label = ctk.CTkLabel(\n master=self,\n text=\"What is your familiarity with the paper content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_fam_label.grid(\n row=6,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_fam_button.grid(\n row=7,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.misc_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.misc_notes_label.grid(\n row=2,\n column=1,\n padx=(0, 
self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.misc_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.misc_notes.grid(\n row=3,\n rowspan=6,\n column=1,\n padx=(0, self.state[\"padding\"] // 2),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button.configure(variable=self.human_domain_rating_var)\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button.configure(variable=self.evaluator_conf_var)\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button.configure(variable=self.evaluator_fam_var)\n\n self.misc_notes.delete(0.0, \"end\")\n self.misc_notes.insert(0.0, self.misc_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"]))\n\n def get_results(self) -> MiscEval:\n \"\"\"Returns the miscellaneous evaluations.\n\n Returns\n -------\n MiscEval\n The miscellaneous evaluation results.\n \"\"\"\n human_domain_rating = self.human_domain_rating_var.get()\n evaluator_conf_rating = self.evaluator_conf_var.get()\n evaluator_familiarity_level = self.evaluator_fam_var.get()\n misc_eval = create_misc_eval(\n human_domain_rating=human_domain_rating,\n evaluator_confidence_rating=evaluator_conf_rating,\n evaluator_familiarity_level=evaluator_familiarity_level,\n notes=self.misc_notes.get(0.0, \"end\"),\n )\n return misc_eval\n "},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=20)\n self.grid_rowconfigure(8, weight=1)\n\n self.main_misc_label = ctk.CTkLabel(\n master=self,\n text=\"Miscellaneous Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_misc_label.grid(\n row=0,\n column=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"n\",\n )\n\n self.human_domain_rating_label = ctk.CTkLabel(\n master=self,\n text=\"What would you rate the human curated domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.human_domain_rating_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.human_domain_rating_var\n )\n self.human_domain_rating_button.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence in your evaluation?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_conf_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_conf_button.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_fam_label = ctk.CTkLabel(\n master=self,\n text=\"What is your familiarity with the paper content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_fam_label.grid(\n row=6,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_fam_button.grid(\n row=7,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.misc_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.misc_notes_label.grid(\n row=2,\n column=1,\n padx=(0, self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n 
self.misc_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.misc_notes.grid(\n row=3,\n rowspan=6,\n column=1,\n padx=(0, self.state[\"padding\"] // 2),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button.configure(variable=self.human_domain_rating_var)\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button.configure(variable=self.evaluator_conf_var)\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button.configure(variable=self.evaluator_fam_var)\n\n self.misc_notes.delete(0.0, \"end\")\n self.misc_notes.insert(0.0, self.misc_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"]))\n "},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame.get_results","title":"get_results() ","text":"Returns the miscellaneous evaluations. Returns: Type Description MiscEval The miscellaneous evaluation results. Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py def get_results(self) -> MiscEval:\n \"\"\"Returns the miscellaneous evaluations.\n\n Returns\n -------\n MiscEval\n The miscellaneous evaluation results.\n \"\"\"\n human_domain_rating = self.human_domain_rating_var.get()\n evaluator_conf_rating = self.evaluator_conf_var.get()\n evaluator_familiarity_level = self.evaluator_fam_var.get()\n misc_eval = create_misc_eval(\n human_domain_rating=human_domain_rating,\n evaluator_confidence_rating=evaluator_conf_rating,\n evaluator_familiarity_level=evaluator_familiarity_level,\n notes=self.misc_notes.get(0.0, \"end\"),\n )\n return misc_eval\n "},{"location":"miscellaneous/","title":"Utils","text":""},{"location":"miscellaneous/#evaluator.backend.miscellaneous.exit_app","title":"exit_app() ","text":"Gracefully exits the app. Source code in evaluator/backend/miscellaneous.py def exit_app() -> NoReturn:\n \"\"\"Gracefully exits the app.\"\"\"\n misc_functions.graceful_exit(0)\n "},{"location":"miscellaneous/#evaluator.backend.miscellaneous.log_state","title":"log_state(state, state_type) ","text":"Logs the app state. Parameters: Name Type Description Default state AppState or RunState The state to log. required state_type app or run The type of state being logged. 
required Source code in evaluator/backend/miscellaneous.py def log_state(state: AppState | RunState, state_type: Literal[\"app\", \"run\"]) -> None:\n \"\"\"Logs the app state.\n\n Parameters\n ----------\n state : AppState or RunState\n The state to log.\n state_type : \"app\" or \"run\"\n The type of state being logged.\n \"\"\"\n app_state_flag = True if state_type.lower().strip() == \"app\" else False\n log_str = \"App state:\\n\" if app_state_flag else \"Run state:\\n\"\n\n if app_state_flag:\n app_state = cast(AppState, state)\n app_key: AppStateKey\n for app_key in get_args(AppStateKey):\n if app_key == \"logger\":\n continue\n log_str += f\"\\t{app_key}: {app_state[app_key]}\\n\"\n else:\n run_state = cast(RunState, state)\n run_key: RunStateKey\n for run_key in get_args(RunStateKey):\n if run_key in {\n \"generated_domain\",\n \"human_curated_domain\",\n \"reference_nodes\",\n \"param_set\",\n \"logger\",\n \"eval_data\"\n }:\n continue\n log_str += f\"\\t{run_key}: {run_state[run_key]}\\n\"\n\n state[\"logger\"].info(log_str)\n "},{"location":"option-picker/","title":"Option Picker","text":"Simple CLI interface for choosing one of the pre-selected baseline testing paper. Will automatically grab any PDF file in the ../../papers/ directory. "},{"location":"option-picker/#bcorag.option_picker.initialize_picker","title":"initialize_picker(filetype='pdf') ","text":"Kicks off the initial pipeline step where the user picks their PDF file to index and chooser the data loader from a pre-set list. Parameters: Name Type Description Default filetype str The filetype to filter on, this project was build to handle PDF files so it is highly unlikely you will want to override this default. 'pdf' Returns: Type Description UserSelections | None The user selections or None indicating user chose to exit or error. Source code in bcorag/option_picker.py def initialize_picker(filetype: str = \"pdf\") -> Optional[UserSelections]:\n \"\"\"Kicks off the initial pipeline step where the user picks their\n PDF file to index and chooser the data loader from a pre-set list.\n\n Parameters\n ----------\n filetype : str, optional\n The filetype to filter on, this project was build to handle PDF\n files so it is highly unlikely you will want to override this default.\n\n Returns\n -------\n UserSelections | None\n The user selections or None indicating user chose to exit or error.\n \"\"\"\n\n presets = misc_fns.load_config_data(\"./bcorag/conf.json\")\n if presets is None or isinstance(presets, list):\n print(f\"Error reading config file. 
Got type `{type(presets)}` for `presets`.\")\n misc_fns.graceful_exit()\n\n # set base keys\n return_data: UserSelections = { # type: ignore\n f\"{option}\": None for option in presets[\"options\"].keys()\n }\n\n target_file_information = _file_picker(presets[\"paper_directory\"], filetype)\n if target_file_information is None:\n return None\n return_data[\"filename\"] = target_file_information[0]\n return_data[\"filepath\"] = target_file_information[1]\n\n option: OptionKey\n for option in get_args(OptionKey):\n target_option = _create_picker(\n option,\n presets[\"options\"][option][\"documentation\"],\n presets[\"options\"][option][\"list\"],\n presets[\"options\"][option].get(\"default\", None),\n )\n if target_option is None:\n return None\n return_data[option] = int(target_option) if option in {\"similarity_top_k\"} else target_option # type: ignore\n\n repo_data = _repo_picker()\n if repo_data == 0:\n return None\n if repo_data is None:\n return_data[\"git_data\"] = None\n else:\n return_data[\"git_data\"] = repo_data\n\n in_progress_docs_path = _in_progress_docs()\n if in_progress_docs_path:\n return_data[\"other_docs\"] = [in_progress_docs_path]\n\n return return_data\n "},{"location":"option-picker/#bcorag.option_picker._file_picker","title":"_file_picker(path, filetype='pdf') ","text":"Create the CLI menu to pick the PDF file from the papers directory. Parameters: Name Type Description Default path str The path to the directory to display the CLI menu for. required filetype str The filetype to filter on, this project was build to handle PDF files so it is highly unlikely you will want to override this default. 'pdf' Returns: Type Description (str, str) | None Returns the name and path of the selected file or None if the user selects exit. Source code in bcorag/option_picker.py def _file_picker(path: str, filetype: str = \"pdf\") -> Optional[Tuple[str, str]]:\n \"\"\"Create the CLI menu to pick the PDF file from the papers directory.\n\n Parameters\n ----------\n path : str\n The path to the directory to display the CLI menu for.\n filetype : str, optional\n The filetype to filter on, this project was build to handle PDF\n files so it is highly unlikely you will want to override this default.\n\n Returns\n -------\n (str, str) | None\n Returns the name and path of the selected file or None if the user selects exit.\n \"\"\"\n target_files = misc_fns.get_file_list(path, filetype)\n pick_options = [os.path.basename(filename) for filename in target_files]\n pick_options.append(EXIT_OPTION)\n pick_title = \"Please choose the PDF file to index:\"\n option, _ = pick(pick_options, pick_title, indicator=\"->\")\n option = str(option)\n if option == EXIT_OPTION:\n return None\n return str(option), f\"{path}{option}\"\n "},{"location":"option-picker/#bcorag.option_picker._repo_picker","title":"_repo_picker() ","text":"Allows the user to input a github repository link to be included in the indexing. Returns: Type Description GitData | None | 0 Returns parsed repo information from the link, None if the user skips this step, or 0 (exit status) if the user chooses to exit. 
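The interactive flow above ultimately just assembles a GitData object from a user/repo/branch plus optional GitFilters. For non-interactive use (for example, scripted runs), the same structures can be built directly with the create_git_filters and create_git_data constructors that appear in the listing below; the import paths and repository values in this sketch are assumptions for illustration and may differ from the actual module layout.

```python
# Hypothetical non-interactive construction of the same GitData that _repo_picker
# builds from user input; import paths and repo values are illustrative assumptions.
from llama_index.readers.github import GithubRepositoryReader

from bcorag.custom_types.core_types import GitFilter, create_git_data, create_git_filters

# Only index the documentation and workflow directories.
directory_filter = create_git_filters(
    GithubRepositoryReader.FilterType.INCLUDE,
    GitFilter.DIRECTORY,
    value=["docs", "workflow"],
)
# Exclude large log and data files that would pollute retrieval.
file_ext_filter = create_git_filters(
    GithubRepositoryReader.FilterType.EXCLUDE,
    GitFilter.FILE_EXTENSION,
    value=[".log", ".csv"],
)

git_data = create_git_data(
    user="example-user",   # repository owner (hypothetical)
    repo="example-repo",   # repository name (hypothetical)
    branch="main",
    filters=[directory_filter, file_ext_filter],
)
```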
Source code in bcorag/option_picker.py def _repo_picker() -> Optional[GitData] | Literal[0]:\n \"\"\"Allows the user to input a github repository link to be included in the indexing.\n\n Returns\n -------\n GitData | None | 0\n Returns parsed repo information from the link, None if the user skips this step,\n or 0 (exit status) if the user chooses to exit.\n \"\"\"\n\n while True:\n\n url_prompt = 'If you would like to include a Github repository enter the URL below. Enter \"x\" to exit or leave blank to skip.\\n> '\n url = input(url_prompt)\n if not url or url is None:\n print(\"Skipping Github repo...\")\n return None\n elif url == \"x\":\n return 0\n\n match = misc_fns.extract_repo_data(url)\n if match is None:\n print(\"Error parsing repository URL.\")\n continue\n user = match[0]\n repo = match[1]\n\n branch = input(\"Repo branch to index (case sensitive):\\n> \")\n if not branch:\n branch = \"main\"\n\n git_filters: list[GitFilters] = []\n\n directory_filter_prompt = \"Would you like to include a directory filter?\"\n directory_filter_prompt += \"\\nEnter a list of comma-delimited directories to either conditionally exclude or inclusively include. \"\n directory_filter_prompt += \"Or leave blank to skip.\\n> \"\n directory_filter_val = input(directory_filter_prompt)\n if directory_filter_val and directory_filter_val is not None:\n directories = [\n dir.strip() for dir in directory_filter_val.split(\",\") if dir.strip()\n ]\n directory_filter_condition_prompt = (\n 'Enter \"include\" or \"exclude\" for the directory filter.\\n> '\n )\n directory_filter_condition_val = input(directory_filter_condition_prompt)\n directory_filter_type = (\n GithubRepositoryReader.FilterType.INCLUDE\n if directory_filter_condition_val.lower().strip() == \"include\"\n else GithubRepositoryReader.FilterType.EXCLUDE\n )\n directory_filter = create_git_filters(directory_filter_type, GitFilter.DIRECTORY, value=directories)\n git_filters.append(directory_filter)\n\n file_ext_filter_prompt = \"Would you like to include a file extension filter?\"\n file_ext_filter_prompt += \"\\nEnter a list of comma-delimited file extensions to either conditionally exclude or inclusively include. \"\n file_ext_filter_prompt += \"Or leave blank to skip.\\n> \"\n file_ext_filter_val = input(file_ext_filter_prompt)\n if file_ext_filter_val and file_ext_filter_val is not None:\n file_exts = [\n ext.strip() for ext in file_ext_filter_val.split(\",\") if ext.strip()\n ]\n file_ext_filter_condition_prompt = (\n 'Enter \"include\" or \"exclude\" for the file extension filter.\\n> '\n )\n file_ext_filter_condition_val = input(file_ext_filter_condition_prompt)\n file_ext_filter_type = (\n GithubRepositoryReader.FilterType.INCLUDE\n if file_ext_filter_condition_val.lower().strip() == \"include\"\n else GithubRepositoryReader.FilterType.EXCLUDE\n )\n file_ext_filter = create_git_filters(file_ext_filter_type, GitFilter.FILE_EXTENSION, value=file_exts)\n git_filters.append(file_ext_filter)\n\n return_data = create_git_data(user, repo, branch, git_filters)\n return return_data\n "},{"location":"option-picker/#bcorag.option_picker._create_picker","title":"_create_picker(title_keyword, documentation, option_list, default=None) ","text":"Creates a general picker CLI based on a list of options and the functionality to optionally mark one option as the default. Parameters: Name Type Description Default title_keyword str The keyword to use for the picker title. required documentation str Link to the documentation for the option. 
required option_list list[str] The list of options to display in the picker menu. required default str | None The option to mark one option as the default. None Returns: Type Description str | None The chosen option of None if the user selected to exit. Source code in bcorag/option_picker.py def _create_picker(\n title_keyword: str,\n documentation: str,\n option_list: list[str],\n default: Optional[str] = None,\n) -> Optional[str]:\n \"\"\"Creates a general picker CLI based on a list of options and the\n functionality to optionally mark one option as the default.\n\n Parameters\n ----------\n title_keyword : str\n The keyword to use for the picker title.\n documentation : str\n Link to the documentation for the option.\n option_list : list[str]\n The list of options to display in the picker menu.\n default : str | None, optional\n The option to mark one option as the default.\n\n Returns\n -------\n str | None\n The chosen option of None if the user selected to exit.\n \"\"\"\n pick_title = f\"Please choose one of the following {title_keyword.replace('_', ' ').title()}s.\\nDocumentation can be found at:\\n{documentation}.\"\n pick_options = [\n f\"{option} (default)\" if option == default else option for option in option_list\n ]\n pick_options.append(EXIT_OPTION)\n option, _ = pick(pick_options, pick_title, indicator=\"->\")\n option = str(option)\n if option == EXIT_OPTION:\n return None\n if \" (default)\" in option:\n option = option.replace(\" (default)\", \"\")\n return option\n "},{"location":"option-picker/#bcorag.option_picker._in_progress_docs","title":"_in_progress_docs() ","text":"Checks if in progress documentation is found. Returns: Type Description str or None The file path to the in progress documentation to include or None if the user chose not to include or no documentation was found. Source code in bcorag/option_picker.py def _in_progress_docs() -> Optional[str]:\n \"\"\"Checks if in progress documentation is found.\n\n Returns\n -------\n str or None\n The file path to the in progress documentation to include or None\n if the user chose not to include or no documentation was found.\n \"\"\"\n in_progress_docs_path = os.path.join(os.getcwd(), \"aggregator\", \"summary.md\")\n if os.path.isfile(in_progress_docs_path):\n prompt = \"Found summary.md, include it in the vector store? (y/n)\\n> \"\n answer = input(prompt)\n answer = answer.strip().lower()\n if answer == \"y\":\n return in_progress_docs_path\n return None\n "},{"location":"options/","title":"Usage","text":" - Preliminary Steps
- Startup
- Generate Domains
- Options
- Data Loader
- Chunking Strategy
- Embedding Model
- Vector Store
- Similarity Top K
- LLM Model
- Mode
- Github Repository
"},{"location":"options/#preliminary-steps","title":"Preliminary Steps","text":"Make sure the setup steps in the Installation and Setup documentation are complete. "},{"location":"options/#startup","title":"Startup","text":"From within the rag/ directory, start the project like so: (env) python main.py\n On startup, you will be prompted to choose some configuration options. More details on the specifics of each option are documented in the Options section. "},{"location":"options/#generate-domains","title":"Generate Domains","text":"After your configurations selections are confirmed, you'll be asked which domain you would like to generate. You can enter either the one letter shortcode for each domain or the full domain name. A new output subdirectory will be created in the output/ directory named after the PDF file. Each domain will have at least one output file on each generation. The code will attempt to serialize the return response into a valid JSON object. Regardless if the JSON serialization succeeds, the raw return response will be dumped in a text file. More detailed information about the output behaviour and structure can be found in the output structure documentation. "},{"location":"options/#options","title":"Options","text":"The option picker interface can be navigated with the n or down arrow keys for the next option, p or up arrow key for the previous option, and the Enter key to choose the option. If you choose the Exit option at any step in the process the program will exit with a status code of 0 . "},{"location":"options/#data-loader","title":"Data Loader","text":"The data loader (or Reader) is one of the key abstraction concepts in the LlamaIndex library. Data loaders handle the data ingestion and formatting into Document objects, which will eventually be chunked into Node objects by the vector store. At a high level, Documents are a generic container for the data source. By default, Documents store the text (and/or images) from the data source, a dictionary of annotations containing the metadata, and a dictionary of relationships to other Documents and Nodes. A Node represents a \"chunk\" of a source Document. Aside from the built in generic data loaders, LLamaIndex hosts an open source hub for various community built data loaders for a variety of data sources. Different data loaders differ in how they create and structure the resulting Documents. Depending on the specialization of the data loader in relation to the structure of the raw data source, this can have a significant impact on the overall performance of the downstream pipeline steps. The currently supported data loaders are: SimpleDirectoryReader (default): This is a built-in data loader provided directly by the LlamaIndex library. It is the most generic option and is not specialized in any specific file type. PDFReader : This is an external data loader from LlamaHub that is specialized to PDF files. PDFMarker : The PDF marker converts the PDF file to clean markdown before ingesting. "},{"location":"options/#chunking-strategy","title":"Chunking Strategy","text":"The chunking strategy is the specific technique to split the Documents into Nodes. The chunking strategy chosen should influence downstream configuration choices, specifically the embedding model and similarity top k parameter selections. Recent research has shown that chunking optimization in RAG systems can have more of an impact on performance then most other parameter configurations, making it one of the most important configuration options. 
There are two general chunking strategies that this tool currently supports: fixed size chunking and semantic chunking. Fixed size chunking strategies involve pre-setting the chunk_size and chunk_overlap parameters. The chunk_size controls the granularity of the chunks (or Nodes) by setting the token limit per chunk. For example, a chunk size of 256 will create more granular chunks, and as a result, more Nodes. However, vital information might not be among the top retrieved chunks, especially if the similarity-top-k parameter is not scaled accordingly. Conversely, a chunk size of 2048 is more likely to encompass relevant information at the cost of increased noise and a loss of specificity. With fixed size chunking strategies, it is important to scale the similarity-top-k parameter appropriately and to choose an embedding model that both supports (and performs well on) the chosen chunk size. The semantic chunking supported by this tool involves using a semantic splitter to adaptively pick the breakpoint in-between sentences using embedding similarity. This ensures that a chunk contains sentences that are semantically related to each other. Note, semantic chunking introduces non-trivial overhead in terms of computational resources and API calls. Especially for very large documents, expect worse runtime performance. There is also a possibility that the semantic splitter creates chunks that are too large for your chosen embedding model. While this bug is not specifically addressed right now, it will probably have to be addressed with a custom second level safety net splitter eventually. The currently supported chunking strategies are: 256 chunk size/20 chunk overlap : Fixed chunking strategy with 256 tokens and a 20 token overlap between chunks. 512 chunk size/50 chunk overlap : Fixed chunking strategy with 512 tokens and a 50 token overlap between chunks. 1024 chunk size/20 chunk overlap (default): Fixed chunking strategy with 1024 tokens and a 20 token overlap between chunks. 2048 chunk size/50 chunk overlap : Fixed chunking strategy with 2048 tokens and a 50 token overlap between chunks. semantic : Semantic chunking based on adaptive chunk splitting. Note: There are known bugs with the semantic chunker, see here. "},{"location":"options/#embedding-model","title":"Embedding Model","text":"The embedding model is responsible for converting the text into a numerical representation, or embedding. The embedding model is used to transform both the query and the chunked nodes into embeddings which are then compared to find the most similar nodes relating to the query during the information retrieval process. Different embedding models can significantly impact the performance of the RAG pipeline. Additionally, different embedding models perform optimally on different chunk sizes, so the embedding model choice should ideally be harmonized with the chosen chunking strategy. The currently supported embedding models are: text-embedding-3-small (default): This is one of OpenAI's newest embedding models, designed for highly efficient embedding. text-embedding-3-large : This is the other new OpenAI embedding model, designed for maximum performance with support for embeddings up to 3,072 dimensions. text-embedding-ada-002 : This is an older OpenAI embedding model, generally not recommended outside testing purposes as it is less efficient and less powerful than both the text-embedding-3-small and text-embedding-3-large models. Currently, only OpenAI embedding models are supported. 
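Because the chunking configuration and the embedding model interact (a 1024/20 split only pays off if the chosen embedding model handles 1024-token inputs well), it can help to see how such a pairing is typically expressed. The sketch below uses LlamaIndex's standard node parsers and OpenAI embedding wrapper; it is an illustration of the concepts above, not the tool's internal implementation, and it assumes the llama-index 0.10-style package layout and a hypothetical ./papers directory.

```python
# Illustration of pairing a chunking strategy with an embedding model in LlamaIndex
# (not the tool's internal code). Assumes llama-index >= 0.10 style packages.
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SemanticSplitterNodeParser, SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# "1024 chunk size/20 chunk overlap" style fixed-size chunking.
fixed_splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

# "semantic" style chunking: breakpoints are chosen via embedding similarity,
# at the cost of extra embedding calls per document.
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
)

documents = SimpleDirectoryReader("./papers").load_data()  # hypothetical paper directory
Settings.embed_model = embed_model

# Swap in semantic_splitter.get_nodes_from_documents(...) to mirror the "semantic" option.
index = VectorStoreIndex(fixed_splitter.get_nodes_from_documents(documents))
```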
Further documentation on the embedding models can be found here and information on pricing can be found here. "},{"location":"options/#vector-store","title":"Vector Store","text":"The vector store handles the indexing, retrieval, and storage process. The indexing process is the method by which a vector store chunks, embeds, organizes, and stores the resulting embeddings of the chunked documents in the vector store. This process can vary depending on the specific implementation of the vector store chosen. The specific chunking strategy chosen can have a significant impact on the retrieval process and affects your embedding model choice (different embedding models perform optimally on different chunk sizes). The retrieval process first converts the query into a vector embedding and then performs a dense search operation to rank all the embeddings by how semantically similar they are to the query. Once the ranking is complete, the vector store returns, or retrieves, the most similar embeddings. The number of chosen retrievals to send to the LLM is controlled by the similarity_top_k parameter. Different vector stores also support different metadata filtering methods that allow for filtering the candidate set of documents based on certain metadata before performing the semantic search. Aside from the built-in generic vector stores, LlamaIndex hosts an open source hub for various other vector store options. The currently supported vector stores are: VectorStoreIndex (default): This is the default built-in vector store provided directly by the LlamaIndex library. While it does support metadata filtering, by default it does not perform any metadata filtering. "},{"location":"options/#similarity-top-k","title":"Similarity Top K","text":"The similarity_top_k parameter in the similarity search process refers to the number of nodes to return as a result of the semantic retrieval process. When the semantic search process is performed, the node embeddings are ranked by how semantically similar they are to the query embedding. After the ranking process is completed, the top k most similar embeddings are sent to the LLM along with the query. Larger values will result in more input tokens. Note: The similarity_top_k parameter here is unrelated to the top k parameter for large language models, which limits the model's vocabulary sampling set when considering the next word to generate. "},{"location":"options/#llm-model","title":"LLM Model","text":"The currently supported LLM models are: gpt-3.5-turbo : This is the least powerful model, offering the fastest performance at a low cost with the caveat of being the least powerful of the OpenAI offerings. gpt-4-turbo (default): This is the default model and is OpenAI's newest offering. As of writing, this model currently points to the gpt-4-turbo-2024-04-09 model. gpt-4-turbo-preview : As of writing, this model currently points to the gpt-4-0125-preview model. Generally not recommended outside of testing purposes as gpt-4-turbo offers better performance at the same cost. gpt-4 : This is the most powerful model, but also the most expensive. Currently, only OpenAI LLM models are supported. Further documentation on the specific LLM models can be found here and information on pricing can be found here. "},{"location":"options/#mode","title":"Mode","text":"The mode option has no effect on the RAG performance, but controls how much extra information is included in the run log. 
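As a rough illustration, the two modes can be thought of as toggling the logger's level, in the spirit of the standard Python logging module; this is an analogy, not the tool's exact logger setup.

```python
# Rough analogy for the debug vs. production modes using the standard logging module;
# the tool's actual logger configuration may differ.
import logging


def get_run_logger(mode: str) -> logging.Logger:
    logger = logging.getLogger("bcorag-run")
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
    logger.addHandler(handler)
    # "debug" logs everything that happens during a run; "production" keeps only
    # the essentials such as the user options and return responses.
    logger.setLevel(logging.DEBUG if mode == "debug" else logging.INFO)
    return logger
```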
Choosing the debug mode will include an extensive logging of everything that is happening during each run. Choosing the production mode will only include the necessary logging, such as the user options and return responses. "},{"location":"options/#github-repository","title":"Github Repository","text":"After choosing the configuration options, you have the choice to also provide a Github repository URL to include in the indexing process. The URL provided will automatically be parsed for the repository owner and repository name information. This will supplement the PDF data ingestion to provide more specific output for workflow specific steps in the description and parametric domains. If a github URL is entered, you'll be asked to confirm the branch of the repo to index (if none is entered, it will default to main ). You will also have the choice to specify directory and file extension filters. For each filter, you will have the option to specify whether to conditionally exclude certain directories and file types or to inclusively include certain directories and file types. Specify the directory path for directories to include in the filter. For file types, include the file extension, for example, \".txt\", \".md\" with the \"include\" filter type will only include files that are of type text and markdown. Note, the filters are potentially important for large repositories. Indexing repositories with large output, log, or data files can incur significant performance overhead and additionally can lower output quality by polluting the retrieval step with noise. "},{"location":"output-map-types/","title":"Output Map Types","text":"The output map custom types. "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerGitFilter","title":"OutputTrackerGitFilter ","text":" Bases: TypedDict Parsed git filter TypedDict used for output map formatting. Attributes: Name Type Description filter tuple[str, list[str]] Tuple representing the filter type (include or exclude) and the filter values. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerGitFilter(TypedDict):\n    \"\"\"Parsed git filter TypedDict used for output map formatting.\n\n    Attributes\n    ----------\n    filter : tuple[str, list[str]]\n        Tuple representing the filter type (include or exclude) and the filter values.\n    \"\"\"\n\n    filter: tuple[str, list[str]]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerParamSet","title":"OutputTrackerParamSet ","text":" Bases: TypedDict Parameter set for a run. Attributes: Name Type Description loader str The data loader used for the run. vector_store str The vector store used for the run. llm str The LLM name used for the run. embedding_model str The embedding model used for the run. similarity_top_k int The similarity top k value used for the run. chunking_config str The chunking strategy used for the run. git_user Optional[str] The user who owns the github repository included in the document ingestion for the run (if applicable). git_repo Optional[str] The github repository included in the document ingestion for the run (if applicable). git_branch Optional[str] The github repository branch indexed during the document ingestion for the run (if applicable). directory_git_filter Optional[OutputTrackerGitFilter] The directory filter used for indexing the github repository (if applicable). file_ext_git_filter Optional[OutputTrackerGitFilter] The file extension filter used for indexing the github repository (if applicable). 
other_docs Optional[list[str]] The file path to any additional documentation included in the documents. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerParamSet(TypedDict):\n \"\"\"Parameter set for a run.\n\n Attributes\n ----------\n loader : str\n The data loader used for the run.\n vector_store : str\n The vector store used for the run.\n llm : str\n The LLM name used for the run.\n embedding_model : str\n The embedding model used for the run.\n similarity_top_k : int\n The similarity top k value used for the run.\n chunking_config : str\n The chunking strategy used for the run.\n git_user : Optional[str]\n The user who owns the github repository included in the document ingestion for the run (if applicable).\n git_repo : Optional[str]\n The github repository included in the document ingestion for the run (if applicable).\n git_branch : Optional[str]\n The github repository branch indexed during the document ingestion for the run (if applicable).\n directory_git_filter : Optional[OutputTrackerGitFilter]\n The directory filter used for indexing the github repository (if applicable).\n file_ext_git_filter : Optional[OutputTrackerGitFilter]\n The file extension filter used for indexing the github repository (if applicable).\n other_docs : Optional[list[str]]\n The file path to any additional documentation included in the documents.\n \"\"\"\n\n loader: str\n vector_store: str\n llm: str\n embedding_model: str\n similarity_top_k: int\n chunking_config: str\n git_user: Optional[str]\n git_repo: Optional[str]\n git_branch: Optional[str]\n directory_git_filter: Optional[OutputTrackerGitFilter]\n file_ext_git_filter: Optional[OutputTrackerGitFilter]\n other_docs: Optional[list[str]]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerRunsEntry","title":"OutputTrackerRunsEntry ","text":" Bases: TypedDict Specific file data under a parameter set. Attributes: Name Type Description index int The index for the run (the index represents the run number for that specific domain parameter set). timestamp str The timestamp for the run. txt_file str File path to the raw output dump text file. json_file str File path to the JSON output file. source_node_file str File path to the source node text file. elapsed_time float The elapsed time (in seconds) for how long the domain generation took. version str The version of the bcorag tool used. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerRunsEntry(TypedDict):\n \"\"\"Specific file data under a parameter set.\n\n Attributes\n ----------\n index : int\n The index for the run (the index represents the run number for that specific domain parameter set).\n timestamp : str\n The timestamp for the run.\n txt_file : str\n File path to the raw output dump text file.\n json_file : str\n File path to the JSON output file.\n source_node_file : str\n File path to the source node text file.\n elapsed_time : float\n The elapsed time (in seconds) for how long the domain generation took.\n version : str\n The version of the bcorag tool used.\n \"\"\"\n\n index: int\n timestamp: str\n txt_file: str\n json_file: str\n source_node_file: str\n elapsed_time: float\n version: str\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerEntry","title":"OutputTrackerEntry ","text":" Bases: TypedDict Entry in the output map under a specific domain hash string. Attributes: Name Type Description curr_index int The most recent run index. 
params OutputTrackerParamSet The parameter set for the run. runs list[OutputTrackerRunsEntry] The list of runs for this parameter set. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerEntry(TypedDict):\n \"\"\"Entry in the output map under a specific domain hash string.\n\n Attributes\n ----------\n curr_index : int\n The most recent run index.\n params : OutputTrackerParamSet\n The parameter set for the run.\n runs : list[OutputTrackerRunsEntry]\n The list of runs for this parameter set.\n \"\"\"\n\n curr_index: int\n params: OutputTrackerParamSet\n runs: list[OutputTrackerRunsEntry]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerDomainEntry","title":"OutputTrackerDomainEntry ","text":" Bases: TypedDict Entry for a specific domain. Note: this isn't the most ideal way to do this. Ideally the hash string itself for the parameter set would be the key instead of forcing the OutputTrackerDomainField to be kept as a list of objects. However, there doesn't seem to be a good way to do this in a pythonic way while enforcing type safety with static type checkers. As they currently exist, TypedDict's require all keys are specified at the time of creating the definition. I would rather not specify regular dictionaries with extensive and verbose type annotations and I expect these map output files are likely to be small enough that serious linear runtime complexity won't cause issues. Attributes: Name Type Description hash_str str The hash of the parameter set used for run collision identification. entries OutputTrackerEntry The run objects. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerDomainEntry(TypedDict):\n \"\"\"Entry for a specific domain.\n\n *Note*: this isn't the most ideal way to do this. Ideally\n the hash string itself for the parameter set would be the\n key instead of forcing the OutputTrackerDomainField to be\n kept as a list of objects. However, there doesn't seem to\n be a good way to do this in a pythonic way while enforcing\n type safety with static type checkers. As they currently\n exist, TypedDict's require all keys are specified at the\n time of creating the definition. I would rather not specify\n regular dictionaries with extensive and verbose type annotations\n and I expect these map output files are likely to be small enough\n that serious linear runtime complexity won't cause issues.\n\n Attributes\n ----------\n hash_str : str\n The hash of the parameter set used for run collision identification.\n entries : OutputTrackerEntry\n The run objects.\n \"\"\"\n\n hash_str: str\n entries: OutputTrackerEntry\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerFile","title":"OutputTrackerFile ","text":" Bases: TypedDict Top level schema for the output file. Attributes: Name Type Description usability list[OutputTrackerDomainEntry] The output map for the usability domain. io list[OutputTrackerDomainEntry] The output map for the io domain. description list[OutputTrackerDomainEntry] The output map for the description domain. execution list[OutputTrackerDomainEntry] The output map for the execution domain. parametric list[OutputTrackerDomainEntry] The output map for the parametric domain. error list[OutputTrackerDomainEntry] The output map for the error domain. 
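The relationship between these TypedDicts is easiest to see by assembling one end-to-end: a parameter set and a runs entry are wrapped in an OutputTrackerEntry, keyed by the parameter-set hash in an OutputTrackerDomainEntry, and appended to the relevant domain list of an OutputTrackerFile. The sketch below uses the constructor functions documented later on this page; the hash string, timestamp, file paths, and similarity top k value are placeholders, not values produced by the tool.

```python
# Minimal sketch of assembling an output tracker file with the constructors documented
# on this page; the hash, timestamp, paths, and parameter values are placeholders.
from bcorag.custom_types.output_map_types import (
    create_output_tracker_domain_entry,
    create_output_tracker_entry,
    create_output_tracker_param_set,
    create_output_tracker_runs_entry,
    default_output_tracker_file,
)

params = create_output_tracker_param_set(
    loader="SimpleDirectoryReader",
    vector_store="VectorStoreIndex",
    llm="gpt-4-turbo",
    embedding_model="text-embedding-3-small",
    similarity_top_k=5,
    chunking_config="1024 chunk size/20 chunk overlap",
    git_user=None,
    git_repo=None,
    git_branch=None,
)

run = create_output_tracker_runs_entry(
    index=0,
    timestamp="2024-01-01_12-00-00",
    txt_file="generated_domains/usability-0-abc123.txt",
    json_file="generated_domains/usability-0-abc123.json",
    source_node_file="reference_sources/usability-0-abc123.txt",
    elapsed_time=42.0,
)

tracker = default_output_tracker_file()
entry = create_output_tracker_entry(curr_index=0, params=params, runs=[run])
tracker["usability"].append(create_output_tracker_domain_entry("abc123", entry))
```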
Source code in bcorag/custom_types/output_map_types.py class OutputTrackerFile(TypedDict):\n \"\"\"Top level schema for the output file.\n\n Attributes\n ----------\n usability : list[OutputTrackerDomainEntry]\n The output map for the usability domain.\n io : list[OutputTrackerDomainEntry]\n The output map for the io domain.\n description : list[OutputTrackerDomainEntry]\n The output map for the description domain.\n execution : list[OutputTrackerDomainEntry]\n The output map for the execution domain.\n parametric : list[OutputTrackerDomainEntry]\n The output map for the parametric domain.\n error : list[OutputTrackerDomainEntry]\n The output map for the error domain.\n \"\"\"\n\n usability: list[OutputTrackerDomainEntry]\n io: list[OutputTrackerDomainEntry]\n description: list[OutputTrackerDomainEntry]\n execution: list[OutputTrackerDomainEntry]\n parametric: list[OutputTrackerDomainEntry]\n error: list[OutputTrackerDomainEntry]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_git_filter","title":"create_output_tracker_git_filter(filter) ","text":"Constructor for the OutputTrackerGitFilter TypedDict. Parameters: Name Type Description Default filter tuple[str, list[str]] required Returns: Type Description OutputTrackerGitFilter Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_git_filter(\n filter: tuple[str, list[str]]\n) -> OutputTrackerGitFilter:\n \"\"\"Constructor for the `OutputTrackerGitFilter` TypedDict.\n\n Parameters\n ----------\n filter : tuple[str, list[str]]\n\n Returns\n -------\n OutputTrackerGitFilter\n \"\"\"\n return_data: OutputTrackerGitFilter = {\"filter\": filter}\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_param_set","title":"create_output_tracker_param_set(loader, vector_store, llm, embedding_model, similarity_top_k, chunking_config, git_user, git_repo, git_branch, directory_git_filter=None, file_ext_git_filter=None, other_docs=None) ","text":"Constructor for the OutputTrackerParamSet TypedDict. Parameters: Name Type Description Default loader str The data loader used for the run. required vector_store str The vector store used for the run. required llm str The LLM name used for the run. required embedding_model str The embedding model used for the run. required similarity_top_k int The similarity top k value used for the run. required chunking_config str The chunking strategy used for the run. required git_user Optional[str] The user who owns the github repository included in the document ingestion for the run (if applicable). required git_repo Optional[str] The github repository included in the document ingestion for the run (if applicable). required git_branch Optional[str] The github repository branch indexed during the document ingestion for the run (if applicable). required directory_git_filter Optional[OutputTrackerGitFilter] The directory filter used for indexing the github repository (if applicable). None file_ext_git_filter Optional[OutputTrackerGitFilter] The file extension filter used for indexing the github repository (if applicable). None other_docs Optional[list[str]] The file path to any additional documentation included in the documents. 
None Returns: Type Description OutputTrackerParamSet Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_param_set(\n loader: str,\n vector_store: str,\n llm: str,\n embedding_model: str,\n similarity_top_k: int,\n chunking_config: str,\n git_user: Optional[str],\n git_repo: Optional[str],\n git_branch: Optional[str],\n directory_git_filter: Optional[OutputTrackerGitFilter] = None,\n file_ext_git_filter: Optional[OutputTrackerGitFilter] = None,\n other_docs: Optional[list[str]] = None\n) -> OutputTrackerParamSet:\n \"\"\"Constructor for the `OutputTrackerParamSet` TypedDict.\n\n Parameters\n ----------\n loader : str\n The data loader used for the run.\n vector_store : str\n The vector store used for the run.\n llm : str\n The LLM name used for the run.\n embedding_model : str\n The embedding model used for the run.\n similarity_top_k : int\n The similarity top k value used for the run.\n chunking_config : str\n The chunking strategy used for the run.\n git_user : Optional[str]\n The user who owns the github repository included in the document ingestion for the run (if applicable).\n git_repo : Optional[str]\n The github repository included in the document ingestion for the run (if applicable).\n git_branch : Optional[str]\n The github repository branch indexed during the document ingestion for the run (if applicable).\n directory_git_filter : Optional[OutputTrackerGitFilter], optional\n The directory filter used for indexing the github repository (if applicable).\n file_ext_git_filter : Optional[OutputTrackerGitFilter], optional\n The file extension filter used for indexing the github repository (if applicable).\n other_docs : Optional[list[str]]\n The file path to any additional documentation included in the documents.\n\n Returns\n -------\n OutputTrackerParamSet\n \"\"\"\n return_data: OutputTrackerParamSet = {\n \"loader\": loader,\n \"vector_store\": vector_store,\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n \"git_user\": git_user,\n \"git_repo\": git_repo,\n \"git_branch\": git_branch,\n \"directory_git_filter\": directory_git_filter,\n \"file_ext_git_filter\": file_ext_git_filter,\n \"other_docs\": other_docs,\n }\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_runs_entry","title":"create_output_tracker_runs_entry(index, timestamp, txt_file, json_file, source_node_file, elapsed_time, version=__version__) ","text":"Constructor for the OutputTrackerRunsEntry TypedDict. Parameters: Name Type Description Default index int The index for the run (the index represents the run number for that specific domain parameter set). required timestamp str The timestamp for the run. required txt_file str File path to the raw output dump text file. required json_file str File path to the JSON output file. required source_node_file str File path to the source node text file. required elapsed_time float The elapsed time (in seconds) for how long the domain generation took. required version str The version of the bcorag tool used. 
__version__ Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_runs_entry(\n index: int,\n timestamp: str,\n txt_file: str,\n json_file: str,\n source_node_file: str,\n elapsed_time: float,\n version: str = __version__,\n) -> OutputTrackerRunsEntry:\n \"\"\"Constructor for the `OutputTrackerRunsEntry` TypedDict.\n\n Parameters\n ----------\n index : int\n The index for the run (the index represents the run number for that specific domain parameter set).\n timestamp : str\n The timestamp for the run.\n txt_file : str\n File path to the raw output dump text file.\n json_file : str\n File path to the JSON output file.\n source_node_file : str\n File path to the source node text file.\n elapsed_time : float\n The elapsed time (in seconds) for how long the domain generation took.\n version : str, optional\n The version of the `bcorag` tool used.\n \"\"\"\n return_data: OutputTrackerRunsEntry = {\n \"index\": index,\n \"timestamp\": timestamp,\n \"txt_file\": txt_file,\n \"json_file\": json_file,\n \"source_node_file\": source_node_file,\n \"elapsed_time\": elapsed_time,\n \"version\": version,\n }\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_entry","title":"create_output_tracker_entry(curr_index, params, runs) ","text":"Constructor for the OutputTrackerEntry TypedDict. Parameters: Name Type Description Default curr_index int The most recent run index. required params OutputTrackerParamSet The parameter set for the run. required runs list[OutputTrackerRunsEntry] The list of runs for this parameter set. required Returns: Type Description OutputTrackerEntry Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_entry(\n curr_index: int, params: OutputTrackerParamSet, runs: list[OutputTrackerRunsEntry]\n) -> OutputTrackerEntry:\n \"\"\"Constructor for the `OutputTrackerEntry` TypedDict.\n\n Parameters\n ----------\n curr_index : int\n The most recent run index.\n params : OutputTrackerParamSet\n The parameter set for the run.\n runs : list[OutputTrackerRunsEntry]\n The list of runs for this parameter set.\n\n Returns\n -------\n OutputTrackerEntry\n \"\"\"\n return_data: OutputTrackerEntry = {\n \"curr_index\": curr_index,\n \"params\": params,\n \"runs\": runs,\n }\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_domain_entry","title":"create_output_tracker_domain_entry(hash_str, entries) ","text":"Constructor for the OutputTrackerDomainEntry TypedDict. Parameters: Name Type Description Default hash_str str The hash of the parameter set used for run collision identification. required entries OutputTrackerEntry The run objects. 
required Returns: Type Description OutputTrackerDomainEntry Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_domain_entry(\n hash_str: str, entries: OutputTrackerEntry\n) -> OutputTrackerDomainEntry:\n \"\"\"Constructor for the `OutputTrackerDomainEntry` TypedDict.\n\n Parameters\n ----------\n hash_str : str\n The hash of the parameter set used for run collision identification.\n entries : OutputTrackerEntry\n The run objects.\n\n Returns\n -------\n OutputTrackerDomainEntry\n \"\"\"\n return_data: OutputTrackerDomainEntry = {\"hash_str\": hash_str, \"entries\": entries}\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.default_output_tracker_file","title":"default_output_tracker_file() ","text":"Creates an empty, default output tracker file instance. Returns: Type Description OutputTrackerFile Source code in bcorag/custom_types/output_map_types.py def default_output_tracker_file() -> OutputTrackerFile:\n \"\"\"Creates an empty, default output tracker file instance.\n\n Returns\n -------\n OutputTrackerFile\n \"\"\"\n return_data: OutputTrackerFile = {\n \"usability\": [],\n \"io\": [],\n \"description\": [],\n \"execution\": [],\n \"parametric\": [],\n \"error\": [],\n }\n return return_data\n "},{"location":"output-structure/","title":"Output Structure","text":" - Output Directory
- Generated Content
- Output Maps
"},{"location":"output-structure/#output-directory","title":"Output Directory","text":"All output files and sub-directories will be placed within the output/ directory at the root of this repository. When starting up a run for a PDF file, a new subdirectory will be created with the name of the PDF file. For example, if the paper being indexed is named High resolution measurement.pdf , the output directory created will be at the path output/high_resolution_measurement/ (whitespaces replaced with underscores). Within that sub-directory will be two more sub-directories, generated_domains/ and reference_sources/ , and two files, output_map.json and output_map.json . "},{"location":"output-structure/#generated-content","title":"Generated Content","text":"Output filenames contain three components: Domain - the corresponding BioCompute domain. Index - the run number for the domain under that parameter set (used to delineate between hash collisions). Parameter Set Hash - used to uniquely identify parameter sets for a run. The filename formats are as follows: {domain}-{index}-{parameter set hash}.json\n{domain}-{index}-{parameter set hash}.txt\n When generating a domain, the LLM generated domain response will be attempted to be serialized into a valid JSON object. If successful, a JSON file will be created within the generated_domains/ sub-directory. Whether or not the JSON serialization is successful, the raw response message will be dumped into a text file in the generated_domains/ sub-directory. A key component of any RAG pipeline is the retrieval process. In order to accurately capture the state of the tool when generating a domain, we capture the referenced sources that were retrieved based on the standardized domain queries. These are stored in the referernce_sources/ sub-directory and follow the same filename format as the output text files. "},{"location":"output-structure/#output-maps","title":"Output Maps","text":"Along with the generated content output, an output_map.json file is generated (or updated) to keep track of the parameter sets for each run. As a convenience for human-readability, the JSON output map is also dumped as a TSV file (however, the TSV file is not used for tracking at all by the code). 
"},{"location":"output-structure/#map-structure","title":"Map Structure","text":"{\n \"{domain}\": [\n {\n \"hash_str\": \"{parameter set hash}\",\n \"entries\": {\n \"curr_index\": \"{current run index}\",\n \"params\": {\n \"loader\": \"{data loader used}\",\n \"vector_store\": \"{vector store used}\",\n \"llm\": \"{llm used}\",\n \"embedding_model\": \"{embedding model used}\",\n \"similarity_top_k\": \"{similarity top k selected}\",\n \"chunking_config\": \"{chunking strategy used for node parsing}\",\n \"git_user\": \"{github user (or org) that owns the github repo used (if applicable)}\",\n \"git_repo\": \"{github repo indexed (if applicable)}\",\n \"git_branch\": \"{github branch to index (if applicable)}\",\n \"directory_git_filter\": \"{the directory filters included, if applicable}\",\n \"fiel_ext_filter\": \"{the file extension filters included, if applicable}\"\n },\n \"runs\": [\n {\n \"index\": \"{index for this run}\",\n \"timestamp\": \"{timestamp of the run}\",\n \"txt_file\": \"{filepath to the raw txt dump}\",\n \"json_file\": \"{filepath to the serialized JSON response (if applicable)}\",\n \"source_node_file\": \"{filepath to the retrieved nodes file}\",\n \"elapsed_time\": \"{elapsed time in seconds to generate the domain}\",\n \"version\": \"{version of the tool that was used}\"\n }\n ]\n }\n }\n ]\n}\n "},{"location":"parameter-custom-types/","title":"Types","text":""},{"location":"parameter-custom-types/#parameter_search.custom_types._AvailFilters","title":"_AvailFilters ","text":" Bases: TypedDict Internal class for the available parameter set. Source code in parameter_search/custom_types.py class _AvailFilters(TypedDict):\n \"\"\"Internal class for the available parameter set.\"\"\"\n\n loader: list[str]\n chunking_config: list[str]\n embedding_model: list[str]\n vector_store: list[str]\n similarity_top_k: list[int]\n llm: list[str]\n mode: list[str]\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.GitDataFileConfig","title":"GitDataFileConfig ","text":" Bases: TypedDict Git data instance for a file. Attributes: Name Type Description filename str The file (paper) to associate this github repository info with. git_info GitData The github repository information for document ingestion. Source code in parameter_search/custom_types.py class GitDataFileConfig(TypedDict):\n \"\"\"Git data instance for a file.\n\n Attributes\n ----------\n filename : str\n The file (paper) to associate this github repository info with.\n git_info : GitData\n The github repository information for document ingestion.\n \"\"\"\n\n filename: str\n git_info: GitData\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.SearchSpace","title":"SearchSpace ","text":" Bases: TypedDict Search space used for hyperparameter search. Attributes: Name Type Description filenames list[str] The file (paper) name's to process. loader list[str] The list of available data loaders to test. chunking_config list[str] The chunking strategies to test. embedding_model list[str] The embedding models to test. vector_store list[str] The vector stores to test. similarity_top_k list[int] The similarity top k values to test. llm list[str] The LLMs to test. git_data Optional[list[GitDataFileConfig]] The git data information. 
Source code in parameter_search/custom_types.py class SearchSpace(TypedDict):\n \"\"\"Search space used for hyperparameter search.\n\n Attributes\n ----------\n filenames : list[str]\n The file (paper) name's to process.\n loader : list[str]\n The list of available data loaders to test.\n chunking_config : list[str]\n The chunking strategies to test.\n embedding_model : list[str]\n The embedding models to test.\n vector_store : list[str]\n The vector stores to test.\n similarity_top_k : list[int]\n The similarity top k values to test.\n llm : list[str]\n The LLMs to test.\n git_data : Optional[list[GitDataFileConfig]]\n The git data information.\n \"\"\"\n\n filenames: list[str]\n loader: list[str]\n chunking_config: list[str]\n embedding_model: list[str]\n vector_store: list[str]\n similarity_top_k: list[int]\n llm: list[str]\n git_data: Optional[list[GitDataFileConfig]]\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.create_git_data_file_config","title":"create_git_data_file_config(filename, git_info) ","text":"Constructor for the GitDataFileConfig TypedDict. Parameters: Name Type Description Default filename str The file (paper) to associate this github repository info with. required git_info GitData The github repository information for document ingestion. required Returns: Type Description GitDataFileConfig Source code in parameter_search/custom_types.py def create_git_data_file_config(filename: str, git_info: GitData) -> GitDataFileConfig:\n \"\"\"Constructor for the GitDataFileConfig TypedDict.\n\n Parameters\n ----------\n filename : str\n The file (paper) to associate this github repository info with.\n git_info : GitData\n The github repository information for document ingestion.\n\n Returns\n -------\n GitDataFileConfig\n \"\"\"\n return_data: GitDataFileConfig = {\"filename\": filename, \"git_info\": git_info}\n return return_data\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.init_search_space","title":"init_search_space(filenames=None, loader=None, chunking_config=None, embedding_model=None, vector_store=None, similarity_top_k=None, llm=None, git_data=None) ","text":"Creates a search space instance. Parameters: Name Type Description Default filenames list[str] | str | None The filenames to test over for the search space (if None , defaults to all the filenames in the bcorag/test_papers/ directory). Note, many files can increase run time significantly as a full parameter search will be executed on each paper sequentially. None loader list[str] | str | None The data loaders for the search space (if None , defaults to the full list as defined in the conf.json list). None chunking_config list[str] | str | or None The chunking strategies for the search space (if None , defaults to the full list as defined in the conf.json list). None embedding_model list[str] | str | or None The embedding model for the search space (if None , defaults to the full list as defined in the conf.json list). None vector_store list[str] | str | or None The vector store for the search space (if None , defaults to the full list as defined in the conf.json list). None similarity_top_k list[int] | int | or None The similarity top k for the search space (if None , defaults to the full list as defined in the conf.json list). None llm list[str] | str | or None The llm for the search space (if None , defaults to the full list as defined in the conf.json list). 
None git_data list[GitDataFileConfig] | GitDataFileConfig | None The git data for each file (if None , assumes no git data for any files). None Returns: Type Description SearchSpace The search space grid. Source code in parameter_search/custom_types.py def init_search_space(\n filenames: Optional[list[str] | str] = None,\n loader: Optional[list[str] | str] = None,\n chunking_config: Optional[list[str] | str] = None,\n embedding_model: Optional[list[str] | str] = None,\n vector_store: Optional[list[str] | str] = None,\n similarity_top_k: Optional[list[int] | int] = None,\n llm: Optional[list[str] | str] = None,\n git_data: Optional[list[GitDataFileConfig]] = None,\n) -> SearchSpace:\n \"\"\"Creates a search space instance.\n\n Parameters\n ----------\n filenames : list[str] | str | None, optional\n The filenames to test over for the search space (if `None`,\n defaults to all the filenames in the `bcorag/test_papers/`\n directory). Note, many files can increase run time\n significantly as a full parameter search will be executed\n on each paper sequentially.\n loader : list[str] | str | None, optional\n The data loaders for the search space (if `None`, defaults to\n the full list as defined in the `conf.json` list).\n chunking_config : list[str] | str | or None, optional\n The chunking strategies for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n embedding_model : list[str] | str | or None, optional\n The embedding model for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n vector_store : list[str] | str | or None, optional\n The vector store for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n similarity_top_k : list[int] | int | or None, optional\n The similarity top k for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n llm : list[str] | str | or None, optional\n The llm for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n git_data : list[GitDataFileConfig] | GitDataFileConfig | None, optional\n The git data for each file (if `None`, assumes no git data for\n any files).\n\n Returns\n -------\n SearchSpace\n The search space grid.\n \"\"\"\n\n def _validate_options(\n option: OptionKey, option_list: list[str] | list[int]\n ) -> bool:\n if not set(option_list) <= set(_avail_options[option]):\n return False\n return True\n\n match filenames:\n case list():\n filenames_space: list[str] = filenames\n case str():\n filenames_space = [filenames]\n case None:\n filenames_space = get_file_list(\"./bcorag/test_papers\", \"*.pdf\")\n case _:\n graceful_exit(1, \"Invalid type for filenames\")\n for file in filenames_space:\n if not os.path.isfile(file):\n graceful_exit(1, f\"Invalid file `{file}`\")\n\n match loader:\n case list():\n loader_space: list[str] = loader\n if not _validate_options(\"loader\", loader_space):\n graceful_exit(1, \"Invalid or undefined loader in search space\")\n case str():\n loader_space = [loader]\n if not _validate_options(\"loader\", loader_space):\n graceful_exit(1, \"Invalid or undefined loader in search space\")\n case None:\n loader_space = _avail_options[\"loader\"]\n case _:\n graceful_exit(1, \"Invalid type specified for loader\")\n\n match chunking_config:\n case list():\n chunking_space: list[str] = chunking_config\n if not _validate_options(\"chunking_config\", chunking_space):\n graceful_exit(\n 1, \"Invalid or undefined 
chunking strategy in search space\"\n )\n case str():\n chunking_space = [chunking_config]\n if not _validate_options(\"chunking_config\", chunking_space):\n graceful_exit(\n 1, \"Invalid or undefined chunking strategy in search space\"\n )\n case None:\n chunking_space = _avail_options[\"chunking_config\"]\n case _:\n graceful_exit(1, \"Invalid type specified for chunking_config\")\n\n match embedding_model:\n case list():\n embedding_model_space: list[str] = embedding_model\n if not _validate_options(\"embedding_model\", embedding_model_space):\n graceful_exit(1, \"Invalid or undefined embedding model in search space\")\n case str():\n embedding_model_space = [embedding_model]\n if not _validate_options(\"embedding_model\", embedding_model_space):\n graceful_exit(1, \"Invalid or undefined embedding model in search space\")\n case None:\n embedding_model_space = _avail_options[\"embedding_model\"]\n case _:\n graceful_exit(1, \"Invalid type specified for embedding_model\")\n\n match vector_store:\n case list():\n vector_store_space: list[str] = vector_store\n if not _validate_options(\"vector_store\", vector_store_space):\n graceful_exit(1, \"Invalid or undefined vector store in search space\")\n case str():\n vector_store_space = [vector_store]\n if not _validate_options(\"vector_store\", vector_store_space):\n graceful_exit(1, \"Invalid or undefined vector store in search space\")\n case None:\n vector_store_space = _avail_options[\"vector_store\"]\n case _:\n graceful_exit(1, \"Invalid type specified for vector_store\")\n\n match similarity_top_k:\n case list():\n similarity_top_k_space: list[int] = similarity_top_k\n case int():\n similarity_top_k_space = [similarity_top_k]\n case None:\n similarity_top_k_space = _avail_options[\"similarity_top_k\"]\n case _:\n graceful_exit(1, \"Invalid type for similarity top k\")\n\n match llm:\n case list():\n llm_space: list[str] = llm\n if not _validate_options(\"llm\", llm_space):\n graceful_exit(1, \"Invalid or undefined llm in search space\")\n case str():\n llm_space = [llm]\n if not _validate_options(\"llm\", llm_space):\n graceful_exit(1, \"Invalid or undefined llm in search space\")\n case None:\n llm_space = _avail_options[\"llm\"]\n case _:\n graceful_exit(1, \"Invalid type for llm\")\n\n match git_data:\n case list():\n git_data_space: list[GitDataFileConfig] | None = git_data\n case None:\n git_data_space = None\n\n return_data: SearchSpace = {\n \"filenames\": filenames_space,\n \"loader\": loader_space,\n \"chunking_config\": chunking_space,\n \"embedding_model\": embedding_model_space,\n \"vector_store\": vector_store_space,\n \"similarity_top_k\": similarity_top_k_space,\n \"llm\": llm_space,\n \"git_data\": git_data_space,\n }\n\n return return_data\n "},{"location":"parameter-search-abc/","title":"Parent Class","text":"Parameter search base class. "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch","title":"BcoParameterSearch ","text":" Bases: ABC Parent class that lays the foundation for the specific parameter search classes. This class shouldn't be instantiated directly. Attributes: Name Type Description _files list[str] The files search space. _loaders list[str] The data loaders search space. _chunking_configs list[str] The chunking strategies search space. _embedding_models list[str] The embedding models search space. _vector_stores list[str] The vector stores search space. _similarity_top_k list[int] The similarity top k search space. _llms list[str] The LLM search space. 
_git_data Optional[list[GitDataFileConfig]] The git data to associate with test runs. _verbose bool Parameter search verbosity mode. _logger Logger The logger to use. backoff_time int | float The backoff time between runs. Uses exponential backoff time. delay_reset int The amount of runs in between resetting the backoff time. Source code in parameter_search/parameter_search.py class BcoParameterSearch(ABC):\n \"\"\"Parent class that lays the foundation for the specific parameter\n search classes. This class shouldn't be instantiated directly.\n\n Attributes\n ----------\n _files : list[str]\n The files search space.\n _loaders : list[str]\n The data loaders search space.\n _chunking_configs : list[str]\n The chunking strategies search space.\n _embedding_models : list[str]\n The embedding models search space.\n _vector_stores : list[str]\n The vector stores search space.\n _similarity_top_k : list[int]\n The similarity top k search space.\n _llms : list[str]\n The LLM search space.\n _git_data : Optional[list[GitDataFileConfig]]\n The git data to associate with test runs.\n _verbose : bool\n Parameter search verbosity mode.\n _logger : logging.Logger\n The logger to use.\n backoff_time : int | float\n The backoff time between runs. Uses exponential backoff time.\n delay_reset : int\n The amount of runs in between resetting the backoff time. \n \"\"\"\n\n def __init__(\n self,\n search_space: SearchSpace,\n verbose: bool = True,\n ):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n verbose : bool, optional\n The verbosity level. False for no output, True for running output.\n \"\"\"\n\n self._files: list[str] = search_space[\"filenames\"]\n self._loaders: list[str] = search_space[\"loader\"]\n self._chunking_configs: list[str] = search_space[\"chunking_config\"]\n self._embedding_models: list[str] = search_space[\"embedding_model\"]\n self._vector_stores: list[str] = search_space[\"vector_store\"]\n self._similarity_top_k: list[int] = search_space[\"similarity_top_k\"]\n self._llms: list[str] = search_space[\"llm\"]\n self._git_data: Optional[list[GitDataFileConfig]] = search_space[\"git_data\"]\n self._verbose: bool = verbose\n self._logger = self._setup_logger()\n self.backoff_time: int | float = STANDARD_BACKOFF\n self.delay_reset = 3\n\n def train(self):\n \"\"\"Starts the generation workflow.\"\"\"\n\n param_sets = self._create_param_sets()\n for idx, param_set in enumerate(param_sets):\n\n self._log_output(\n f\"------------ Param Set {idx + 1}/{len(param_sets)} ------------\"\n )\n self._log_output(param_set)\n t0 = time.time()\n\n t1 = time.time()\n bco_rag = self._create_bcorag(param_set)\n self._log_output(f\"RAG created, elapsed time: {time.time() - t1}\")\n\n t2 = time.time()\n self._generate_domains(bco_rag)\n self._log_output(\n f\"Domains generated, total elapsed time: {time.time() - t2}\"\n )\n\n self._log_output(f\"Sleeping for {self.backoff_time}...\")\n time.sleep(self.backoff_time)\n if idx % self.delay_reset == 0:\n self.backoff_time = STANDARD_BACKOFF\n else:\n self.backoff_time *= 2 + random.uniform(0, 1)\n\n self._log_output(f\"Param set elapsed time: {time.time() - t0}\")\n\n @abstractmethod\n def _setup_logger(self, path: str, name: str) -> Logger:\n \"\"\"Sets up the logger.\"\"\"\n pass\n\n @abstractmethod\n def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a list of parameter sets.\"\"\"\n pass\n\n def _generate_domains(self, bcorag: BcoRag):\n \"\"\"Performs the bcorag query on 
each domain.\n\n Parameters\n ----------\n bcorag : BcoRag\n The setup BcoRag instance.\n \"\"\"\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n\n t0 = time.time()\n with supress_stdout():\n bcorag.perform_query(domain)\n self._log_output(f\"\\t{domain.upper()} domain generated, elapsed time: {time.time() - t0}\")\n\n def _create_bcorag(\n self, user_selections: UserSelections, evaluation_mode: bool = False\n ) -> BcoRag:\n \"\"\"Creates the BcoRag instance.\n\n Parameters\n ----------\n user_selections : UserSelections\n The parameter set.\n evaluation_mode : bool\n The evaluation mode for the BcoRag instance.\n\n Returns\n -------\n BcoRag\n The instantiated BcoRag instance.\n \"\"\"\n bcorag = BcoRag(user_selections, evaluation_metrics=evaluation_mode)\n return bcorag\n\n def _log_output(self, message: str | UserSelections):\n \"\"\"Handles output. If the logger was passed in handles logging, if\n verbose is `True` handles printing (only info level logging).\n\n Parameters\n ----------\n message : str | UserSelections\n The message or param set to log and/or print.\n \"\"\"\n if self._verbose:\n if isinstance(message, str):\n print(message)\n elif isinstance(message, dict):\n pprint.pprint(message)\n if self._logger is not None:\n self._logger.info(message)\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch.__init__","title":"__init__(search_space, verbose=True) ","text":"Constructor. Parameters: Name Type Description Default search_space SearchSpace The parameter search space. required verbose bool The verbosity level. False for no output, True for running output. True Source code in parameter_search/parameter_search.py def __init__(\n self,\n search_space: SearchSpace,\n verbose: bool = True,\n):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n verbose : bool, optional\n The verbosity level. False for no output, True for running output.\n \"\"\"\n\n self._files: list[str] = search_space[\"filenames\"]\n self._loaders: list[str] = search_space[\"loader\"]\n self._chunking_configs: list[str] = search_space[\"chunking_config\"]\n self._embedding_models: list[str] = search_space[\"embedding_model\"]\n self._vector_stores: list[str] = search_space[\"vector_store\"]\n self._similarity_top_k: list[int] = search_space[\"similarity_top_k\"]\n self._llms: list[str] = search_space[\"llm\"]\n self._git_data: Optional[list[GitDataFileConfig]] = search_space[\"git_data\"]\n self._verbose: bool = verbose\n self._logger = self._setup_logger()\n self.backoff_time: int | float = STANDARD_BACKOFF\n self.delay_reset = 3\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch.train","title":"train() ","text":"Starts the generation workflow. 
Source code in parameter_search/parameter_search.py def train(self):\n \"\"\"Starts the generation workflow.\"\"\"\n\n param_sets = self._create_param_sets()\n for idx, param_set in enumerate(param_sets):\n\n self._log_output(\n f\"------------ Param Set {idx + 1}/{len(param_sets)} ------------\"\n )\n self._log_output(param_set)\n t0 = time.time()\n\n t1 = time.time()\n bco_rag = self._create_bcorag(param_set)\n self._log_output(f\"RAG created, elapsed time: {time.time() - t1}\")\n\n t2 = time.time()\n self._generate_domains(bco_rag)\n self._log_output(\n f\"Domains generated, total elapsed time: {time.time() - t2}\"\n )\n\n self._log_output(f\"Sleeping for {self.backoff_time}...\")\n time.sleep(self.backoff_time)\n if idx % self.delay_reset == 0:\n self.backoff_time = STANDARD_BACKOFF\n else:\n self.backoff_time *= 2 + random.uniform(0, 1)\n\n self._log_output(f\"Param set elapsed time: {time.time() - t0}\")\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._setup_logger","title":"_setup_logger(path, name) abstractmethod ","text":"Sets up the logger. Source code in parameter_search/parameter_search.py @abstractmethod\ndef _setup_logger(self, path: str, name: str) -> Logger:\n \"\"\"Sets up the logger.\"\"\"\n pass\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._create_param_sets","title":"_create_param_sets() abstractmethod ","text":"Creates a list of parameter sets. Source code in parameter_search/parameter_search.py @abstractmethod\ndef _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a list of parameter sets.\"\"\"\n pass\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._generate_domains","title":"_generate_domains(bcorag) ","text":"Performs the bcorag query on each domain. Parameters: Name Type Description Default bcorag BcoRag The setup BcoRag instance. required Source code in parameter_search/parameter_search.py def _generate_domains(self, bcorag: BcoRag):\n \"\"\"Performs the bcorag query on each domain.\n\n Parameters\n ----------\n bcorag : BcoRag\n The setup BcoRag instance.\n \"\"\"\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n\n t0 = time.time()\n with supress_stdout():\n bcorag.perform_query(domain)\n self._log_output(f\"\\t{domain.upper()} domain generated, elapsed time: {time.time() - t0}\")\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._create_bcorag","title":"_create_bcorag(user_selections, evaluation_mode=False) ","text":"Creates the BcoRag instance. Parameters: Name Type Description Default user_selections UserSelections The parameter set. required evaluation_mode bool The evaluation mode for the BcoRag instance. False Returns: Type Description BcoRag The instantiated BcoRag instance. Source code in parameter_search/parameter_search.py def _create_bcorag(\n self, user_selections: UserSelections, evaluation_mode: bool = False\n) -> BcoRag:\n \"\"\"Creates the BcoRag instance.\n\n Parameters\n ----------\n user_selections : UserSelections\n The parameter set.\n evaluation_mode : bool\n The evaluation mode for the BcoRag instance.\n\n Returns\n -------\n BcoRag\n The instantiated BcoRag instance.\n \"\"\"\n bcorag = BcoRag(user_selections, evaluation_metrics=evaluation_mode)\n return bcorag\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._log_output","title":"_log_output(message) ","text":"Handles output. 
If a logger was passed in, the message is logged; if verbose is True , it is also printed (info level logging only). Parameters: Name Type Description Default message str | UserSelections The message or param set to log and/or print. required Source code in parameter_search/parameter_search.py def _log_output(self, message: str | UserSelections):\n \"\"\"Handles output. If a logger was passed in, the message is logged; if\n verbose is `True`, it is also printed (info level logging only).\n\n Parameters\n ----------\n message : str | UserSelections\n The message or param set to log and/or print.\n \"\"\"\n if self._verbose:\n if isinstance(message, str):\n print(message)\n elif isinstance(message, dict):\n pprint.pprint(message)\n if self._logger is not None:\n self._logger.info(message)\n "},{"location":"parameter-search/","title":"Parameter Search","text":" - Search Space
- Grid Search
- Random Search
To test multiple parameter sets and/or papers, the BcoRag tool has an accompanying wrapper tool that applies a concept similar to hyperparameter tuning, offering grid and random parameter set search capabilities. "},{"location":"parameter-search/#search-space","title":"Search Space","text":"The parameter search tool uses a custom data type called a SearchSpace , which is defined as follows: class SearchSpace(TypedDict):\n \"\"\"Search space used for parameter searches.\"\"\"\n\n filenames: list[str]\n loader: list[str]\n chunking_config: list[str]\n embedding_model: list[str]\n vector_store: list[str]\n similarity_top_k: list[int]\n llm: list[str]\n git_data: Optional[list[GitDataFileConfig]]\n The SearchSpace type has a corresponding initialization function to help with creating a search space. The init_search_space function is defined as follows: def init_search_space(\n filenames: Optional[list[str] | str] = None,\n loader: Optional[list[str] | str] = None,\n chunking_config: Optional[list[str] | str] = None,\n embedding_model: Optional[list[str] | str] = None,\n vector_store: Optional[list[str] | str] = None,\n similarity_top_k: Optional[list[int] | int] = None,\n llm: Optional[list[str] | str] = None,\n git_data: Optional[list[GitDataFileConfig]] = None,\n) -> SearchSpace:\n \"\"\"Creates a search space instance.\n\n Parameters\n ----------\n filenames : list[str], str, or None (default: None)\n The filenames to test over for the search space (if None,\n defaults to all the filenames in the `bcorag/test_papers/`\n directory). Note, many files can increase run time\n significantly as a full parameter search will be executed\n on each paper sequentially.\n loader : list[str], str, or None (default: None)\n The data loaders for the search space (if None, defaults to\n the full list as defined in the conf.json list).\n chunking_config : list[str], str, or None (default: None)\n The chunking strategies for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n embedding_model : list[str], str, or None (default: None)\n The embedding model for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n vector_store : list[str], str, or None (default: None)\n The vector store for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n similarity_top_k : list[int], int, or None (default: None)\n The similarity top k for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n llm : list[str], str, or None (default: None)\n The llm for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n git_data : list[GitDataFileConfig], GitDataFileConfig or None (default: None)\n The git data for each file (if None, assumes no git data for\n any files).\n\n Returns\n -------\n SearchSpace\n The search space grid.\n \"\"\"\n # initialization function\n "},{"location":"parameter-search/#grid-search","title":"Grid Search","text":"A grid search can be run from the main.py entrypoint using the grid-search positional argument like so: (env) python main.py grid-search\n This will run a grid search with the default parameter search space defined in the _create_search_space function. 
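A search can also be driven programmatically instead of through main.py. The sketch below builds a narrowed search space with init_search_space and hands it to the random search class documented in the Random Search section; the import paths are inferred from the source file locations shown on these pages, and the similarity_top_k values are arbitrary placeholders, so treat this as a minimal illustration under those assumptions rather than a documented workflow.

from parameter_search.custom_types import init_search_space
from parameter_search.random_search import BcoRandomSearch

# Any argument left as None falls back to the full option list from conf.json
# (or, for filenames, to every PDF in the bcorag/test_papers/ directory).
search_space = init_search_space(similarity_top_k=[3, 5])

# Randomly sample 5 parameter sets from the full combination grid and generate
# every domain for each one; train() instantiates a BcoRag per parameter set,
# so this performs real retrieval and LLM calls.
searcher = BcoRandomSearch(search_space, subset_size=5)
searcher.train()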
"},{"location":"parameter-search/#random-search","title":"Random Search","text":"A random search can be run from the main.py entrypoint using the random-search positional argument like so: (env) python main.py random-search\n This will run a random search with the default parameter search space defined in the _create_search_space function using a parameter subset value of 5 . "},{"location":"prompts/","title":"Prompts","text":"The standardized query prompts. QUERY_PROMPT : The standard wrapper used for each prompt. _TOP_LEVEL_SCHEMA : The entire top level 2791 object schema. SUPPLEMENT_PROMPT : Supplementary prompt for the domains that require the top level schema. USABILITY_DOMAIN : The usability domain specific prompt and schema. IO_DOMAIN : The IO domain specific prompt and schema. DESCRIPTION_DOMAIN : The description domain specific prompt and schema. EXECUTION_DOMAIN : The execution domain specific prompt and schema. PARAMETRIC_DOMAIN : The parametric domain specific prompt and schema. ERROR_DOMAIN : The error domain specific prompt and schema. DOMAIN_MAP : The domain map for the BcoRag object. "},{"location":"random-search/","title":"Random Search","text":"Random search class. "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch","title":"BcoRandomSearch ","text":" Bases: BcoParameterSearch BCO random search class. Subclass of BcoParameterSearch . Source code in parameter_search/random_search.py class BcoRandomSearch(BcoParameterSearch):\n \"\"\"BCO random search class. Subclass of `BcoParameterSearch`.\n \"\"\"\n\n def __init__(self, search_space: SearchSpace, subset_size: int = 5):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n subset_size : int (default: 5)\n The number of parameter sets to search.\n \"\"\"\n super().__init__(search_space)\n self.subset_size = subset_size\n\n def _setup_logger(self, path: str = \"./logs\", name: str = \"random-search\") -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n\n def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a random subset of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n A random subset of the search space combinations.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n 
repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n )\n param_sets.append(user_selections)\n\n if self.subset_size > len(param_sets):\n self.subset_size = len(param_sets)\n\n param_subset = random.sample(param_sets, self.subset_size)\n\n return param_subset\n "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch.__init__","title":"__init__(search_space, subset_size=5) ","text":"Constructor. Parameters: Name Type Description Default search_space SearchSpace The parameter search space. required subset_size int (default: 5) The number of parameter sets to search. 5 Source code in parameter_search/random_search.py def __init__(self, search_space: SearchSpace, subset_size: int = 5):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n subset_size : int (default: 5)\n The number of parameter sets to search.\n \"\"\"\n super().__init__(search_space)\n self.subset_size = subset_size\n "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch._setup_logger","title":"_setup_logger(path='./logs', name='random-search') ","text":"Sets up the logger. Parameters: Name Type Description Default path str File path for the logger. './logs' name str Name for the logger output. 'random-search' Returns: Type Description Logger The grid search logger. Source code in parameter_search/random_search.py def _setup_logger(self, path: str = \"./logs\", name: str = \"random-search\") -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch._create_param_sets","title":"_create_param_sets() ","text":"Creates a random subset of the parameter space. Returns: Type Description list[UserSelections] A random subset of the search space combinations. 
Source code in parameter_search/random_search.py def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a random subset of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n A random subset of the search space combinations.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n )\n param_sets.append(user_selections)\n\n if self.subset_size > len(param_sets):\n self.subset_size = len(param_sets)\n\n param_subset = random.sample(param_sets, self.subset_size)\n\n return param_subset\n "},{"location":"reference-frame/","title":"Reference Frame","text":""},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame","title":"ReferenceFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the reference evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py class ReferenceFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the reference evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_ref_label = ctk.CTkLabel(\n master=self,\n text=\"Reference Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_ref_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.ref_eval_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant are the reference nodes?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.ref_eval_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.ref_eval_var\n )\n self.ref_eval_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n )\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Top reference is most relevant?\",\n variable=self.top_ref_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.top_ref_checkbox.grid(\n row=3,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.ref_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.ref_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.ref_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button.configure(variable=self.ref_eval_var)\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox.configure(variable=self.top_ref_var)\n\n self.ref_notes.delete(0.0, \"end\")\n self.ref_notes.insert(\n 0.0, self.reference_eval.get(\"notes\", 
EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> RefereceEval:\n \"\"\"Returns the reference evaluations.\n\n Returns\n -------\n ReferenceEval\n The reference evaluation results.\n \"\"\"\n ref_eval_score = self.ref_eval_var.get()\n top_ref_val = self.top_ref_var.get()\n notes = self.ref_notes.get(0.0, \"end\")\n ref_eval = create_reference_eval(\n reference_relevancy=ref_eval_score,\n top_reference_retrieval=top_ref_val,\n notes=notes,\n )\n return ref_eval\n "},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_ref_label = ctk.CTkLabel(\n master=self,\n text=\"Reference Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_ref_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.ref_eval_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant are the reference nodes?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.ref_eval_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.ref_eval_var\n )\n self.ref_eval_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n )\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Top reference is most relevant?\",\n variable=self.top_ref_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.top_ref_checkbox.grid(\n row=3,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.ref_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.ref_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.ref_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. 
required Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button.configure(variable=self.ref_eval_var)\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox.configure(variable=self.top_ref_var)\n\n self.ref_notes.delete(0.0, \"end\")\n self.ref_notes.insert(\n 0.0, self.reference_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame.get_results","title":"get_results() ","text":"Returns the reference evaluations. Returns: Type Description ReferenceEval The reference evaluation results. Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py def get_results(self) -> RefereceEval:\n \"\"\"Returns the reference evaluations.\n\n Returns\n -------\n ReferenceEval\n The reference evaluation results.\n \"\"\"\n ref_eval_score = self.ref_eval_var.get()\n top_ref_val = self.top_ref_var.get()\n notes = self.ref_notes.get(0.0, \"end\")\n ref_eval = create_reference_eval(\n reference_relevancy=ref_eval_score,\n top_reference_retrieval=top_ref_val,\n notes=notes,\n )\n return ref_eval\n "},{"location":"score-frame/","title":"Score Frame","text":""},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame","title":"ScoreFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the score evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/score_frame.py class ScoreFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the score evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.score_eval = run_state[\"eval_data\"][\"score_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_score_label = ctk.CTkLabel(\n master=self, text=\"Score Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_score_label.grid(\n row=0, columnspan=3, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.score_label = ctk.CTkLabel(\n master=self, text=\"Score:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_text = ctk.CTkLabel(master=self, font=(self.state[\"font\"], 16))\n self.score_text.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_label = ctk.CTkLabel(\n master=self, text=\"Score version:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_version_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_text = ctk.CTkLabel(\n master=self, font=(self.state[\"font\"], 16)\n )\n self.score_version_text.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_eval_label = ctk.CTkLabel(\n master=self,\n text=\"Should the score be higher or lower?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.score_eval_label.grid(\n row=2,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_eval_var = ctk.StringVar(value=self.score_eval[\"eval\"])\n self.score_eval_button = ctk.CTkSegmentedButton(\n master=self,\n values=[\"Lower\", \"About right\", \"Higher\"],\n variable=self.score_eval_var,\n )\n self.score_eval_button.grid(\n row=3,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_notes_label.grid(\n row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.score_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.score_notes.grid(\n row=7,\n column=0,\n columnspan=3,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n self.update_state(app_state=self.state, run_state=self.run)\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.score_eval = self.run[\"eval_data\"][\"score_eval\"]\n\n 
self.score_text.configure(text=f\"{self.run['score']}\")\n self.score_version_text.configure(text=f\"{self.run['score_version']}\")\n\n self.score_eval_var = ctk.StringVar(\n value=self.score_eval.get(\"eval\", EVAL_DEFAULTS[\"eval\"])\n )\n self.score_eval_button.configure(variable=self.score_eval_var)\n\n self.score_notes.delete(0.0, \"end\")\n self.score_notes.insert(\n 0.0, self.score_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> ScoreEval:\n \"\"\"Returns the score evaluations.\n\n Returns\n -------\n ScoreEval\n The score evaluation results.\n \"\"\"\n score_eval_button_val = cast_score_eval(self.score_eval_var.get())\n score_eval = create_score_eval(\n eval=score_eval_button_val, notes=self.score_notes.get(0.0, \"end\")\n )\n return score_eval\n "},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/evaluation_frames/score_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.score_eval = run_state[\"eval_data\"][\"score_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_score_label = ctk.CTkLabel(\n master=self, text=\"Score Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_score_label.grid(\n row=0, columnspan=3, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.score_label = ctk.CTkLabel(\n master=self, text=\"Score:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_text = ctk.CTkLabel(master=self, font=(self.state[\"font\"], 16))\n self.score_text.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_label = ctk.CTkLabel(\n master=self, text=\"Score version:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_version_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_text = ctk.CTkLabel(\n master=self, font=(self.state[\"font\"], 16)\n )\n self.score_version_text.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_eval_label = ctk.CTkLabel(\n master=self,\n text=\"Should the score be higher or lower?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.score_eval_label.grid(\n row=2,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_eval_var = ctk.StringVar(value=self.score_eval[\"eval\"])\n self.score_eval_button = ctk.CTkSegmentedButton(\n master=self,\n values=[\"Lower\", \"About right\", \"Higher\"],\n variable=self.score_eval_var,\n )\n self.score_eval_button.grid(\n row=3,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_notes_label.grid(\n 
row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.score_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.score_notes.grid(\n row=7,\n column=0,\n columnspan=3,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n self.update_state(app_state=self.state, run_state=self.run)\n "},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/score_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.score_eval = self.run[\"eval_data\"][\"score_eval\"]\n\n self.score_text.configure(text=f\"{self.run['score']}\")\n self.score_version_text.configure(text=f\"{self.run['score_version']}\")\n\n self.score_eval_var = ctk.StringVar(\n value=self.score_eval.get(\"eval\", EVAL_DEFAULTS[\"eval\"])\n )\n self.score_eval_button.configure(variable=self.score_eval_var)\n\n self.score_notes.delete(0.0, \"end\")\n self.score_notes.insert(\n 0.0, self.score_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame.get_results","title":"get_results() ","text":"Returns the score evaluations. Returns: Type Description ScoreEval The score evaluation results. Source code in evaluator/frontend/components/evaluation_frames/score_frame.py def get_results(self) -> ScoreEval:\n \"\"\"Returns the score evaluations.\n\n Returns\n -------\n ScoreEval\n The score evaluation results.\n \"\"\"\n score_eval_button_val = cast_score_eval(self.score_eval_var.get())\n score_eval = create_score_eval(\n eval=score_eval_button_val, notes=self.score_notes.get(0.0, \"end\")\n )\n return score_eval\n "},{"location":"sidebar/","title":"Sidebar","text":""},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar","title":"SideBar ","text":" Bases: CTkFrame Class for the navigation sidebar. 
Source code in evaluator/frontend/components/sidebar.py class SideBar(ctk.CTkFrame):\n \"\"\"Class for the navigation sidebar.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs,\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n self.save = on_save\n self.exit = on_exit\n\n self.sidebar_frame = ctk.CTkFrame(master=master, width=140, corner_radius=0)\n self.sidebar_frame.grid(row=0, column=1, sticky=\"nsew\")\n self.sidebar_frame.grid_rowconfigure(7, weight=1)\n\n padding = self.state[\"padding\"]\n half_padding = padding // 2\n\n self.navigate_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Navigate\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.navigate_label.grid(\n row=0, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.prev_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Previous\",\n command=self._previous,\n state=(\"normal\" if self.run[\"run_index\"] > 0 else \"disabled\"),\n )\n self.prev_button.grid(row=1, column=0, padx=padding, pady=half_padding)\n\n self.next_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Next\",\n command=self._next,\n state=(\n \"normal\"\n if self.run[\"run_index\"] < self.run[\"total_runs\"] - 1\n else \"disabled\"\n ),\n )\n self.next_button.grid(row=2, column=0, padx=padding, pady=half_padding)\n\n self.run_counter_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.run_counter_label.grid(row=3, column=0, padx=padding, pady=half_padding)\n\n self.already_evaluated_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Already Evaluated\" if self.run[\"already_evaluated\"] else \"\",\n font=(self.state[\"font\"], 16, \"bold\"),\n text_color=\"red\",\n )\n self.already_evaluated_label.grid(\n row=4, column=0, padx=padding, pady=half_padding\n )\n\n self.save_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Save\", command=self._save\n )\n self.save_button.grid(row=5, column=0, padx=padding, pady=half_padding)\n\n self.exit_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Exit\", command=self._exit\n )\n self.exit_button.grid(row=6, column=0, padx=padding, pady=half_padding)\n\n self.appearance_label = ctk.CTkLabel(\n self.sidebar_frame,\n text=\"Appearance\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.appearance_label.grid(\n row=8, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.appearance_option_menu = ctk.CTkOptionMenu(\n self.sidebar_frame,\n values=[\"System\", \"Light\", \"Dark\"],\n command=self._change_appearance_mode,\n )\n self.appearance_option_menu.grid(\n row=9, column=0, padx=padding, pady=half_padding\n )\n\n self.scaling_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"UI Scaling\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.scaling_label.grid(row=10, column=0, padx=padding, pady=half_padding)\n\n self.scaling_option_menu = ctk.CTkOptionMenu(\n master=self.sidebar_frame,\n values=[\"70%\", \"80%\", \"90%\", \"100%\", \"110%\", \"120%\", \"130%\"],\n command=self._change_scaling_value,\n )\n self.scaling_option_menu.grid(\n row=11, column=0, padx=padding, pady=(half_padding, 
padding)\n )\n\n def update_state(self, run_state: RunState) -> None:\n \"\"\"Updates the run state for consistency.\"\"\"\n self.run = run_state\n self.run_counter_label.configure(\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\"\n )\n self.already_evaluated_label.configure(\n text=\"Already evaluated\" if self.run[\"already_evaluated\"] else \"\"\n )\n\n def _previous(self) -> None:\n \"\"\"Callback for the previous button press.\"\"\"\n new_run_index = self.run[\"run_index\"] - 1\n if new_run_index == 0:\n self.prev_button.configure(state=\"disabled\")\n else:\n self.prev_button.configure(state=\"normal\")\n self.next_button.configure(state=\"normal\")\n self.navigate(-1, new_run_index, self.state)\n\n def _next(self) -> None:\n \"\"\"Callback for the next button press.\"\"\"\n new_run_index = self.run[\"run_index\"] + 1\n if new_run_index >= self.run[\"total_runs\"] - 1:\n self.next_button.configure(state=\"disabled\")\n else:\n self.next_button.configure(state=\"normal\")\n self.prev_button.configure(state=\"normal\")\n self.navigate(1, new_run_index, self.state)\n\n def _change_appearance_mode(self, new_appearance_mode: str) -> None:\n \"\"\"Changes the UI color mode.\"\"\"\n ctk.set_appearance_mode(new_appearance_mode)\n\n def _change_scaling_value(self, new_scaling: str) -> None:\n \"\"\"Changes the UI scaling.\"\"\"\n new_scaling_val = int(new_scaling.replace(\"%\", \"\")) / 100\n ctk.set_widget_scaling(new_scaling_val)\n\n def _save(self) -> None:\n \"\"\"Calls the save state function.\"\"\"\n self.save(self.state)\n\n def _exit(self) -> NoReturn:\n \"\"\"Calls the exit function.\"\"\"\n self.save(self.state)\n self.exit()\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar.__init__","title":"__init__(master, app_state, run_state, navigate, on_save, on_exit, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/sidebar.py def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs,\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n self.save = on_save\n self.exit = on_exit\n\n self.sidebar_frame = ctk.CTkFrame(master=master, width=140, corner_radius=0)\n self.sidebar_frame.grid(row=0, column=1, sticky=\"nsew\")\n self.sidebar_frame.grid_rowconfigure(7, weight=1)\n\n padding = self.state[\"padding\"]\n half_padding = padding // 2\n\n self.navigate_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Navigate\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.navigate_label.grid(\n row=0, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.prev_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Previous\",\n command=self._previous,\n state=(\"normal\" if self.run[\"run_index\"] > 0 else \"disabled\"),\n )\n self.prev_button.grid(row=1, column=0, padx=padding, pady=half_padding)\n\n self.next_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Next\",\n command=self._next,\n state=(\n \"normal\"\n if self.run[\"run_index\"] < self.run[\"total_runs\"] - 1\n else \"disabled\"\n ),\n )\n self.next_button.grid(row=2, column=0, padx=padding, pady=half_padding)\n\n self.run_counter_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.run_counter_label.grid(row=3, column=0, padx=padding, pady=half_padding)\n\n self.already_evaluated_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Already Evaluated\" if self.run[\"already_evaluated\"] else \"\",\n font=(self.state[\"font\"], 16, \"bold\"),\n text_color=\"red\",\n )\n self.already_evaluated_label.grid(\n row=4, column=0, padx=padding, pady=half_padding\n )\n\n self.save_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Save\", command=self._save\n )\n self.save_button.grid(row=5, column=0, padx=padding, pady=half_padding)\n\n self.exit_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Exit\", command=self._exit\n )\n self.exit_button.grid(row=6, column=0, padx=padding, pady=half_padding)\n\n self.appearance_label = ctk.CTkLabel(\n self.sidebar_frame,\n text=\"Appearance\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.appearance_label.grid(\n row=8, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.appearance_option_menu = ctk.CTkOptionMenu(\n self.sidebar_frame,\n values=[\"System\", \"Light\", \"Dark\"],\n command=self._change_appearance_mode,\n )\n self.appearance_option_menu.grid(\n row=9, column=0, padx=padding, pady=half_padding\n )\n\n self.scaling_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"UI Scaling\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.scaling_label.grid(row=10, column=0, padx=padding, pady=half_padding)\n\n self.scaling_option_menu = ctk.CTkOptionMenu(\n master=self.sidebar_frame,\n values=[\"70%\", \"80%\", \"90%\", \"100%\", \"110%\", \"120%\", \"130%\"],\n command=self._change_scaling_value,\n )\n self.scaling_option_menu.grid(\n row=11, column=0, padx=padding, pady=(half_padding, padding)\n )\n 
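To clarify the callback contract the constructor above expects, here is a minimal, hypothetical sketch of the three callables a parent view would pass in; only the signatures are taken from the documented constructor, while the function names, bodies, and the commented-out wiring are assumptions about the surrounding application.

from typing import Literal, NoReturn

def navigate(direction: Literal[-1, 1], new_run_index: int, app_state) -> None:
    # Load the run at new_run_index and refresh the evaluation frames (hypothetical behavior).
    ...

def on_save(app_state) -> None:
    # Persist the current evaluation results, e.g. by delegating to evaluator.backend.state.save_state.
    ...

def on_exit() -> NoReturn:
    # Perform a final save and tear the UI down (hypothetical behavior).
    raise SystemExit(0)

# sidebar = SideBar(master=parent_frame, app_state=app_state, run_state=run_state,
#                   navigate=navigate, on_save=on_save, on_exit=on_exit)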
"},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar.update_state","title":"update_state(run_state) ","text":"Updates the run state for consistency. Source code in evaluator/frontend/components/sidebar.py def update_state(self, run_state: RunState) -> None:\n \"\"\"Updates the run state for consistency.\"\"\"\n self.run = run_state\n self.run_counter_label.configure(\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\"\n )\n self.already_evaluated_label.configure(\n text=\"Already evaluated\" if self.run[\"already_evaluated\"] else \"\"\n )\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._previous","title":"_previous() ","text":"Callback for the previous button press. Source code in evaluator/frontend/components/sidebar.py def _previous(self) -> None:\n \"\"\"Callback for the previous button press.\"\"\"\n new_run_index = self.run[\"run_index\"] - 1\n if new_run_index == 0:\n self.prev_button.configure(state=\"disabled\")\n else:\n self.prev_button.configure(state=\"normal\")\n self.next_button.configure(state=\"normal\")\n self.navigate(-1, new_run_index, self.state)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._next","title":"_next() ","text":"Callback for the next button press. Source code in evaluator/frontend/components/sidebar.py def _next(self) -> None:\n \"\"\"Callback for the next button press.\"\"\"\n new_run_index = self.run[\"run_index\"] + 1\n if new_run_index >= self.run[\"total_runs\"] - 1:\n self.next_button.configure(state=\"disabled\")\n else:\n self.next_button.configure(state=\"normal\")\n self.prev_button.configure(state=\"normal\")\n self.navigate(1, new_run_index, self.state)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._change_appearance_mode","title":"_change_appearance_mode(new_appearance_mode) ","text":"Changes the UI color mode. Source code in evaluator/frontend/components/sidebar.py def _change_appearance_mode(self, new_appearance_mode: str) -> None:\n \"\"\"Changes the UI color mode.\"\"\"\n ctk.set_appearance_mode(new_appearance_mode)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._change_scaling_value","title":"_change_scaling_value(new_scaling) ","text":"Changes the UI scaling. Source code in evaluator/frontend/components/sidebar.py def _change_scaling_value(self, new_scaling: str) -> None:\n \"\"\"Changes the UI scaling.\"\"\"\n new_scaling_val = int(new_scaling.replace(\"%\", \"\")) / 100\n ctk.set_widget_scaling(new_scaling_val)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._save","title":"_save() ","text":"Calls the save state function. Source code in evaluator/frontend/components/sidebar.py def _save(self) -> None:\n \"\"\"Calls the save state function.\"\"\"\n self.save(self.state)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._exit","title":"_exit() ","text":"Calls the exit function. Source code in evaluator/frontend/components/sidebar.py def _exit(self) -> NoReturn:\n \"\"\"Calls the exit function.\"\"\"\n self.save(self.state)\n self.exit()\n "},{"location":"state/","title":"State","text":"Handles all app and run state changes. "},{"location":"state/#evaluator.backend.state.create_new_user","title":"create_new_user(app_state, first_name, last_name) ","text":"Creates a new user. Parameters: Name Type Description Default app_state AppState The current app state. required first_name str The user's first name. required last_name str The user's last name. 
required Returns: Type Description AppState The updated app state. Source code in evaluator/backend/state.py def create_new_user(app_state: AppState, first_name: str, last_name: str) -> AppState:\n \"\"\"Creates a new user.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n first_name : str\n The user's first name.\n last_name : str\n The user's last name.\n\n Returns\n -------\n AppState\n The updated app state.\n \"\"\"\n app_state[\"logger\"].info(f\"Creating new user for {last_name}, {first_name}\")\n app_state[\"users_data\"][app_state[\"user_hash\"]] = {\n \"first_name\": first_name,\n \"last_name\": last_name,\n }\n app_state[\"user_results_data\"][app_state[\"user_hash\"]] = None\n return app_state\n "},{"location":"state/#evaluator.backend.state.set_resume_session","title":"set_resume_session(app_state, resume_session) ","text":"Sets the resume session boolean. Parameters: Name Type Description Default app_state AppState The current app state. required resume_session bool The resume_session value to set. required Returns: Type Description AppState The updated app state. Source code in evaluator/backend/state.py def set_resume_session(app_state: AppState, resume_session: bool) -> AppState:\n \"\"\"Sets the resume session boolean.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n resume_session : bool\n The resume_session value to set.\n\n Returns\n -------\n AppState\n The updated app state.\n \"\"\"\n app_state[\"resume_session\"] = resume_session\n return app_state\n "},{"location":"state/#evaluator.backend.state.save_state","title":"save_state(app_state) ","text":"Saves the state. Parameters: Name Type Description Default app_state AppState The app state to save. required Source code in evaluator/backend/state.py def save_state(app_state: AppState) -> None:\n \"\"\"Saves the state.\n\n Parameters\n ----------\n app_state : AppState\n The app state to save.\n \"\"\"\n app_state[\"logger\"].info(\"Writing data...\")\n misc_fns.write_json(\n output_path=os.path.join(\n app_state[\"results_dir_path\"], app_state[\"bco_results_file_name\"]\n ),\n data=app_state[\"bco_results_data\"],\n )\n misc_fns.write_json(\n output_path=os.path.join(\n app_state[\"results_dir_path\"], app_state[\"user_results_file_name\"]\n ),\n data=app_state[\"user_results_data\"],\n )\n misc_fns.write_json(\n output_path=os.path.join(\n app_state[\"results_dir_path\"], app_state[\"users_file_name\"]\n ),\n data=app_state[\"users_data\"],\n )\n "},{"location":"state/#evaluator.backend.state.submit_eval_state","title":"submit_eval_state(app_state, run_state) ","text":"Updates the app state with the submitted evaluation data. If the eval state is the default eval state this function will silently not perform the update. Parameters: Name Type Description Default app_state AppState The app state to update. required run_state RunState The run state to update from. required Returns: Type Description AppState The updated app state. Source code in evaluator/backend/state.py def submit_eval_state(app_state: AppState, run_state: RunState) -> AppState:\n \"\"\"Updates the app state with the submitted evaluation data. 
If the\n eval state is the default eval state this function will silently not\n perform the update.\n\n Parameters\n ----------\n app_state : AppState\n The app state to update.\n run_state : RunState\n The run state to update from.\n\n Returns\n -------\n AppState\n The updated app state.\n \"\"\"\n if not check_default_eval(run_state[\"eval_data\"]):\n\n user_hash = app_state[\"user_hash\"]\n file_name = os.path.basename(run_state[\"generated_file_path\"])\n file_eval = run_state[\"eval_data\"]\n\n ## update the users evaluation data file\n\n if user_hash not in app_state[\"user_results_data\"]:\n misc_fns.graceful_exit(\n 1,\n f\"Error: User hash `{user_hash}` not found in user results data on submit eval.\",\n )\n\n user_data = app_state[\"user_results_data\"][user_hash]\n if user_data is None:\n user_data = {}\n\n user_data = cast(dict[str, Optional[EvalData]], user_data)\n user_data[file_name] = file_eval\n\n app_state[\"user_results_data\"][user_hash] = user_data\n\n ## update the evaluations data file\n # TODO \n\n app_state[\"logger\"].info(\"Eval state updated...\")\n\n else:\n\n app_state[\"logger\"].info(\"Default eval set detected, not updating.\")\n\n return app_state\n "},{"location":"state/#evaluator.backend.state.load_run_state","title":"load_run_state(run_index, total_runs, app_state) ","text":"Create run state. TODO : This function is messy, should be cleaned up at some point. Parameters: Name Type Description Default run_index int The run index to load from. required total_runs int The total number of potential evaluation runs. required app_state AppState The current app state. required Returns: Type Description RunState The run state for the run at the specified index. Source code in evaluator/backend/state.py def load_run_state(run_index: int, total_runs: int, app_state: AppState) -> RunState:\n \"\"\"Create run state.\n\n TODO : This function is messy, should be cleaned up at some point.\n\n Parameters\n ----------\n run_index : int\n The run index to load from.\n total_runs : int\n The total number of potential evaluation runs.\n app_state : AppState\n The current app state.\n\n Returns\n -------\n RunState \n The run state for the run at the specified index.\n \"\"\"\n current_run = 0\n\n for directory in app_state[\"generated_directory_paths\"]:\n\n current_paper = os.path.basename(directory)\n\n output_map = misc_fns.load_json(os.path.join(directory, \"output_map.json\"))\n if output_map is None:\n misc_fns.graceful_exit(\n 1, f\"Error: Output map not found in directory `{directory}`\"\n )\n\n for domain in output_map:\n for domain_param_set in output_map[domain]:\n for domain_run in domain_param_set[\"entries\"][\"runs\"]:\n\n if current_run == run_index:\n\n generated_domain_path = str(domain_run[\"json_file\"])\n generated_domain: dict | str | None = None\n if os.path.isfile(generated_domain_path):\n generated_domain = misc_fns.load_json(generated_domain_path)\n if generated_domain is None:\n misc_fns.graceful_exit(\n 1,\n f\"Unable to load generated JSON data at `{generated_domain_path}`.\",\n )\n else:\n generated_domain_path = domain_run[\"txt_file\"]\n raw_txt = open(generated_domain_path, \"r\").read()\n generated_domain = f\"Failed JSON serialization. 
Raw text output:\\n\\n{raw_txt}\"\n\n domain = os.path.basename(generated_domain_path.split(\"-\")[0])\n\n human_curated_path = os.path.join(\n app_state[\"generated_output_dir_root\"],\n \"human_curated\",\n f\"{os.path.basename(directory)}.json\",\n )\n if not os.path.isfile(human_curated_path):\n misc_fns.graceful_exit(\n 1,\n f\"Human curated BCO file not found at filepath `{human_curated_path}`.\",\n )\n human_curated_json = misc_fns.load_json(human_curated_path)\n if human_curated_json is None:\n misc_fns.graceful_exit(\n 1,\n f\"Unable to load human curated JSON at path `{human_curated_path}`.\",\n )\n human_curated_domain_formatted_json = {\n f\"{domain}_domain\": human_curated_json[f\"{domain}_domain\"]\n }\n human_curated_domain = json.dumps(\n human_curated_domain_formatted_json, indent=4\n )\n\n param_set = json.dumps(\n domain_param_set[\"entries\"][\"params\"], indent=4\n )\n\n reference_nodes = open(\n domain_run[\"source_node_file\"], \"r\"\n ).read()\n\n already_evaluated = False\n eval_data = default_eval()\n if (\n app_state[\"user_results_data\"][app_state[\"user_hash\"]]\n is not None\n ):\n user_eval_data = app_state[\"user_results_data\"][\n app_state[\"user_hash\"]\n ]\n if (user_eval_data is not None) and (\n os.path.basename(generated_domain_path)\n in user_eval_data\n ):\n user_file_eval = user_eval_data[\n os.path.basename(generated_domain_path)\n ]\n if user_file_eval is not None:\n already_evaluated = True\n eval_data = user_file_eval\n\n run_state = create_run_state(\n paper=current_paper,\n domain=domain,\n generated_domain=generated_domain,\n generated_file_path=generated_domain_path,\n human_curated_domain=human_curated_domain,\n param_set=param_set,\n reference_nodes=reference_nodes,\n run_index=run_index,\n total_runs=total_runs,\n already_evaluated=already_evaluated,\n logger=app_state[\"logger\"],\n eval_data=eval_data,\n )\n\n log_state(run_state, \"run\")\n return run_state\n\n current_run += 1\n\n misc_fns.graceful_exit(1, f\"Failed to load run state for run index `{run_index}`.\")\n "},{"location":"tab-view/","title":"Tab View","text":""},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView","title":"TabView ","text":" Bases: CTkTabview , EvaluationBaseFrame Class for the view page tab view. 
Source code in evaluator/frontend/components/tab_view.py class TabView(ctk.CTkTabview, EvaluationBaseFrame):\n \"\"\"Class for the view page tab view.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n on_submit: Callable,\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.on_submit = on_submit\n\n self.add(\"Compare JSON\")\n self.add(\"Source Nodes\")\n self.add(\"Parameter Set\")\n self.add(\"Evaluate\")\n\n self._create_compare_json_tab()\n self._create_source_node_tab()\n self._create_parameter_set_tab()\n self._create_evaluate_tab()\n\n self.update_state(app_state=self.state, run_state=self.run)\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Loads the run data and updates the state.\n\n Parameters\n ----------\n run_state : RunState\n The run to laod.\n \"\"\"\n self.run = run_state\n self.state = app_state\n\n self.left_json_text.configure(state=\"normal\")\n self.left_json_text.delete(0.0, \"end\")\n self.left_json_text.insert(0.0, self.run[\"human_curated_domain\"])\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_json_text.configure(state=\"normal\")\n self.right_json_text.delete(0.0, \"end\")\n self.right_json_text.insert(\"0.0\", self.run[\"generated_domain\"])\n self.right_json_text.configure(state=\"disabled\")\n\n self.source_node_text.configure(state=\"normal\")\n self.source_node_text.delete(0.0, \"end\")\n self.source_node_text.insert(0.0, self.run[\"reference_nodes\"])\n self.source_node_text.configure(state=\"disabled\")\n\n self.parameter_set_text.configure(state=\"normal\")\n self.parameter_set_text.delete(0.0, \"end\")\n self.parameter_set_text.insert(\"0.0\", self.run[\"param_set\"])\n self.parameter_set_text.configure(state=\"disabled\")\n\n self.score_frame.update_state(app_state=self.state, run_state=self.run)\n self.err_frame.update_state(app_state=self.state, run_state=self.run)\n self.ref_frame.update_state(app_state=self.state, run_state=self.run)\n self.general_frame.update_state(app_state=self.state, run_state=self.run)\n self.misc_frame.update_state(app_state=self.state, run_state=self.run)\n\n def get_results(self) -> EvalData:\n \"\"\"Returns the score evaluations.\"\"\"\n score_eval = self.score_frame.get_results()\n error_eval = self.err_frame.get_results()\n reference_eval = self.ref_frame.get_results()\n general_eval = self.general_frame.get_results()\n misc_eval = self.misc_frame.get_results()\n eval_data = create_full_eval(\n score_eval=score_eval,\n error_eval=error_eval,\n reference_eval=reference_eval,\n general_eval=general_eval,\n misc_eval=misc_eval,\n )\n return eval_data\n\n def _create_evaluate_tab(self) -> None:\n \"\"\"Creates the evaluate tab view.\"\"\"\n self.evaluate_frame = self.tab(\"Evaluate\")\n self.evaluate_frame.grid_columnconfigure((0, 1, 2), weight=1)\n self.evaluate_frame.grid_rowconfigure((0, 1), weight=1)\n\n self.score_frame = ScoreFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.score_frame.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.err_frame = ErrorFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.err_frame.grid(\n row=0,\n column=1,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.ref_frame = ReferenceFrame(\n 
master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.ref_frame.grid(\n row=0,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.general_frame = GeneralFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.general_frame.grid(\n row=1,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.misc_frame = MiscFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.misc_frame.grid(\n row=1,\n column=1,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.submit_button = ctk.CTkButton(\n master=self.evaluate_frame, text=\"Submit\", command=self.on_submit\n )\n self.submit_button.grid(\n row=6,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"se\",\n )\n\n def _create_compare_json_tab(self) -> None:\n \"\"\"Creates the compare JSON tab view.\"\"\"\n self.compare_frame = self.tab(\"Compare JSON\")\n self.compare_frame.grid_columnconfigure(0, weight=1)\n self.compare_frame.grid_columnconfigure(1, weight=1)\n self.compare_frame.grid_rowconfigure(0, weight=0)\n self.compare_frame.grid_rowconfigure(1, weight=1)\n\n self.left_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Human Curated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.left_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=0, sticky=\"w\"\n )\n\n self.left_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.left_json_text.grid(\n row=1,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Generated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.right_label.grid(\n row=0,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=0,\n sticky=\"w\",\n )\n\n self.right_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.right_json_text.grid(\n row=1,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.right_json_text.configure(state=\"disabled\")\n\n def _create_source_node_tab(self) -> None:\n \"\"\"Creates the source node tab.\"\"\"\n self.source_node_frame = self.tab(\"Source Nodes\")\n self.source_node_frame.grid_columnconfigure(0, weight=1)\n self.source_node_frame.grid_rowconfigure(0, weight=1)\n\n self.source_node_text = ctk.CTkTextbox(\n master=self.source_node_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.source_node_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.source_node_text.configure(state=\"disabled\")\n\n def _create_parameter_set_tab(self) -> None:\n \"\"\"Creates the parameter set tab.\"\"\"\n self.parameter_set_frame = self.tab(\"Parameter Set\")\n self.parameter_set_frame.grid_columnconfigure(0, weight=1)\n self.parameter_set_frame.grid_rowconfigure(0, weight=1)\n\n self.parameter_set_text = ctk.CTkTextbox(\n master=self.parameter_set_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n 
self.parameter_set_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.parameter_set_text.configure(state=\"disabled\")\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView.__init__","title":"__init__(master, app_state, run_state, on_submit, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/tab_view.py def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n on_submit: Callable,\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.on_submit = on_submit\n\n self.add(\"Compare JSON\")\n self.add(\"Source Nodes\")\n self.add(\"Parameter Set\")\n self.add(\"Evaluate\")\n\n self._create_compare_json_tab()\n self._create_source_node_tab()\n self._create_parameter_set_tab()\n self._create_evaluate_tab()\n\n self.update_state(app_state=self.state, run_state=self.run)\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView.update_state","title":"update_state(app_state, run_state) ","text":"Loads the run data and updates the state. Parameters: Name Type Description Default run_state RunState The run to laod. required Source code in evaluator/frontend/components/tab_view.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Loads the run data and updates the state.\n\n Parameters\n ----------\n run_state : RunState\n The run to laod.\n \"\"\"\n self.run = run_state\n self.state = app_state\n\n self.left_json_text.configure(state=\"normal\")\n self.left_json_text.delete(0.0, \"end\")\n self.left_json_text.insert(0.0, self.run[\"human_curated_domain\"])\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_json_text.configure(state=\"normal\")\n self.right_json_text.delete(0.0, \"end\")\n self.right_json_text.insert(\"0.0\", self.run[\"generated_domain\"])\n self.right_json_text.configure(state=\"disabled\")\n\n self.source_node_text.configure(state=\"normal\")\n self.source_node_text.delete(0.0, \"end\")\n self.source_node_text.insert(0.0, self.run[\"reference_nodes\"])\n self.source_node_text.configure(state=\"disabled\")\n\n self.parameter_set_text.configure(state=\"normal\")\n self.parameter_set_text.delete(0.0, \"end\")\n self.parameter_set_text.insert(\"0.0\", self.run[\"param_set\"])\n self.parameter_set_text.configure(state=\"disabled\")\n\n self.score_frame.update_state(app_state=self.state, run_state=self.run)\n self.err_frame.update_state(app_state=self.state, run_state=self.run)\n self.ref_frame.update_state(app_state=self.state, run_state=self.run)\n self.general_frame.update_state(app_state=self.state, run_state=self.run)\n self.misc_frame.update_state(app_state=self.state, run_state=self.run)\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView.get_results","title":"get_results() ","text":"Returns the score evaluations. 
Source code in evaluator/frontend/components/tab_view.py def get_results(self) -> EvalData:\n \"\"\"Returns the score evaluations.\"\"\"\n score_eval = self.score_frame.get_results()\n error_eval = self.err_frame.get_results()\n reference_eval = self.ref_frame.get_results()\n general_eval = self.general_frame.get_results()\n misc_eval = self.misc_frame.get_results()\n eval_data = create_full_eval(\n score_eval=score_eval,\n error_eval=error_eval,\n reference_eval=reference_eval,\n general_eval=general_eval,\n misc_eval=misc_eval,\n )\n return eval_data\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_evaluate_tab","title":"_create_evaluate_tab() ","text":"Creates the evaluate tab view. Source code in evaluator/frontend/components/tab_view.py def _create_evaluate_tab(self) -> None:\n \"\"\"Creates the evaluate tab view.\"\"\"\n self.evaluate_frame = self.tab(\"Evaluate\")\n self.evaluate_frame.grid_columnconfigure((0, 1, 2), weight=1)\n self.evaluate_frame.grid_rowconfigure((0, 1), weight=1)\n\n self.score_frame = ScoreFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.score_frame.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.err_frame = ErrorFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.err_frame.grid(\n row=0,\n column=1,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.ref_frame = ReferenceFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.ref_frame.grid(\n row=0,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.general_frame = GeneralFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.general_frame.grid(\n row=1,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.misc_frame = MiscFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.misc_frame.grid(\n row=1,\n column=1,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.submit_button = ctk.CTkButton(\n master=self.evaluate_frame, text=\"Submit\", command=self.on_submit\n )\n self.submit_button.grid(\n row=6,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"se\",\n )\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_compare_json_tab","title":"_create_compare_json_tab() ","text":"Creates the compare JSON tab view. 
Source code in evaluator/frontend/components/tab_view.py def _create_compare_json_tab(self) -> None:\n \"\"\"Creates the compare JSON tab view.\"\"\"\n self.compare_frame = self.tab(\"Compare JSON\")\n self.compare_frame.grid_columnconfigure(0, weight=1)\n self.compare_frame.grid_columnconfigure(1, weight=1)\n self.compare_frame.grid_rowconfigure(0, weight=0)\n self.compare_frame.grid_rowconfigure(1, weight=1)\n\n self.left_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Human Curated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.left_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=0, sticky=\"w\"\n )\n\n self.left_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.left_json_text.grid(\n row=1,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Generated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.right_label.grid(\n row=0,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=0,\n sticky=\"w\",\n )\n\n self.right_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.right_json_text.grid(\n row=1,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.right_json_text.configure(state=\"disabled\")\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_source_node_tab","title":"_create_source_node_tab() ","text":"Creates the source node tab. Source code in evaluator/frontend/components/tab_view.py def _create_source_node_tab(self) -> None:\n \"\"\"Creates the source node tab.\"\"\"\n self.source_node_frame = self.tab(\"Source Nodes\")\n self.source_node_frame.grid_columnconfigure(0, weight=1)\n self.source_node_frame.grid_rowconfigure(0, weight=1)\n\n self.source_node_text = ctk.CTkTextbox(\n master=self.source_node_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.source_node_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.source_node_text.configure(state=\"disabled\")\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_parameter_set_tab","title":"_create_parameter_set_tab() ","text":"Creates the parameter set tab. Source code in evaluator/frontend/components/tab_view.py def _create_parameter_set_tab(self) -> None:\n \"\"\"Creates the parameter set tab.\"\"\"\n self.parameter_set_frame = self.tab(\"Parameter Set\")\n self.parameter_set_frame.grid_columnconfigure(0, weight=1)\n self.parameter_set_frame.grid_rowconfigure(0, weight=1)\n\n self.parameter_set_text = ctk.CTkTextbox(\n master=self.parameter_set_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.parameter_set_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.parameter_set_text.configure(state=\"disabled\")\n "},{"location":"unit-testing/","title":"Automated Testing","text":"The test_bco_rag.py script contains a suite of tests designed to evaluate the functionality of the BcoRag tool using the pytest framework and the open source LLM evaluation framework DeepEval. 
"},{"location":"unit-testing/#test-cases","title":"Test Cases","text":"There is one test case for each domain: test_usability test_io test_description test_execution test_parametric test_error "},{"location":"unit-testing/#test-metrics","title":"Test Metrics","text":"The test suite evaluates two different metrics: Answer Relevancy: The answer relevancy metric is used to evaluate how relevant the finalized generated output (in our case, the generated domain) is to the original input prompt. It attempts to evaluate relevancy (does the generated content directly relate to the question at hand), appropriateness (is the content appropriate given the context of the input) and focus (does the content stay on topic). The answer relevancy metric measures the quality of your RAG pipeline's generator by evaluating how relevant the actual_output of your LLM application is compared to the provided input. Faithfulness: The faithfulness metric assesses how accurate and truthful the finalized generated output (in our case, the generated domain) is concerning the source material (the retrieved content). It attempts to ensure that the content is relevant, factual, and does not contradict the information gathered from the retrieval step. The faithfulness metric measures the quality of your RAG pipeline's generator by evaluating whether the actual_output factually aligns with the contents of your retrieval_context . "},{"location":"unit-testing/#running-the-tests","title":"Running The Tests","text":"It is not recommended to run all the tests at once. The test suite uses gpt-4o in the backend to evaluate the above metrics. To run one test at a time: deepeval test run test_bco_rag.py::test_{domain} To run all the tests at once: deepeval test run test_bco_rag.py "},{"location":"view-page/","title":"View Page","text":""},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage","title":"ViewPage ","text":" Bases: CTkFrame Class for the view/evaluate page. 
Source code in evaluator/frontend/components/view_page.py class ViewPage(ctk.CTkFrame):\n \"\"\"Class for the view/evaluate page.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTk,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n\n self.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=0)\n self.grid_rowconfigure(0, weight=1)\n\n self.sidebar = SideBar(\n master=self,\n app_state=self.state,\n run_state=self.run,\n navigate=self.navigate,\n on_save=on_save,\n on_exit=on_exit,\n )\n\n self.tab_view = TabView(master=self, app_state=self.state, run_state=self.run, on_submit=self.on_submit)\n self.tab_view.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Updates the state.\"\"\"\n self.run = run_state\n self.state = app_state\n self.sidebar.update_state(self.run)\n self.tab_view.update_state(app_state=app_state, run_state=self.run)\n\n def on_submit(self) -> None:\n \"\"\"Submits the user evaluation.\"\"\"\n self.run[\"eval_data\"] = self.tab_view.get_results()\n updated_app_state = submit_eval_state(self.state, self.run)\n self.update_state(app_state=updated_app_state, run_state=self.run)\n "},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage.__init__","title":"__init__(master, app_state, run_state, navigate, on_save, on_exit, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/view_page.py def __init__(\n self,\n master: ctk.CTk,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n\n self.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=0)\n self.grid_rowconfigure(0, weight=1)\n\n self.sidebar = SideBar(\n master=self,\n app_state=self.state,\n run_state=self.run,\n navigate=self.navigate,\n on_save=on_save,\n on_exit=on_exit,\n )\n\n self.tab_view = TabView(master=self, app_state=self.state, run_state=self.run, on_submit=self.on_submit)\n self.tab_view.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n "},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage.update_state","title":"update_state(app_state, run_state) ","text":"Updates the state. Source code in evaluator/frontend/components/view_page.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Updates the state.\"\"\"\n self.run = run_state\n self.state = app_state\n self.sidebar.update_state(self.run)\n self.tab_view.update_state(app_state=app_state, run_state=self.run)\n "},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage.on_submit","title":"on_submit() ","text":"Submits the user evaluation. 
Source code in evaluator/frontend/components/view_page.py def on_submit(self) -> None:\n \"\"\"Submits the user evaluation.\"\"\"\n self.run[\"eval_data\"] = self.tab_view.get_results()\n updated_app_state = submit_eval_state(self.state, self.run)\n self.update_state(app_state=updated_app_state, run_state=self.run)\n "}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Biocompute Object Retrieval-Augmented Generation Assistant","text":""},{"location":"#background","title":"Background","text":"The BioCompute Object (BCO) project is a community-driven open standards framework for standardizing and sharing computations and analyses. With the exponential increase in both the quantity and complexity of biological data and the workflows used to analyze and transform the data, the need for standardization in documentation is imperative for experimental preservation, transparency, accuracy, and reproducability. As with any documentation standard, the main hurdles to continued adoption are the overhead required to maintain the quality and accuracy of a BCO in parallel as the research evolves over time and retroactively documenting pre-existing research. With the recent improvements in large language models (LLMs), the feasibility and utility of an automated BCO creation assistant is an intriguing use case. "},{"location":"#goal","title":"Goal","text":"The goal of this project is to reduce the overhead required in retroactively documenting pre-existing research. By using the Biocompute RAG assistant, you can seamlessly convert existing publications on previous research to be BCO compliant. "},{"location":"#approach-justification","title":"Approach Justification","text":"This BioCompute Object (BCO) assistant will assist users in automatically creating specific BCO domains from user uploaded papers. This proof of concept uses a Retrieval Augmented Generation (RAG) approach rather than a standalone (or fine-tuned) LLM. Our use case is somewhat antithetical to what LLMs were originally designed for. LLMs were designed for creative, free text responses that represent plausible natural language. BCOs, on the other hand, were designed for deterministic, accurate, non-ambiguous, and reproduceable documentation. Given this, there are two main initial questions that have to be considered: - Current LLMs are often categorized as \"stochastic parrots\" that have no underlying understanding of text structure, only generating what are considered plausible natural language respones. How well could LLMs consistently produce structured, schema compliant JSON responses (regardless of the actual output content)?
- Unlike traditional LLM use cases, the goal of BCOs, as described above, is not to generate creative and original output. How can we constrain the LLM to limit creativity, extrapolation, and potentially subjective output?
Given these considerations and our use case, a traditional standalone LLM suffers from multiple drawbacks in our problem context. "},{"location":"#issues-with-long-context-windows","title":"Issues with Long Context Windows","text":"Recent studies (Lost in the Middle) have shown that LLMs can struggle with long contexts: ... performance can degrade significantly when changing the position of relevant information, indicating that current language models do not robustly make use of information in long input contexts. In particular, we observe that performance is often highest when relevant information occurs at the beginning or end of the input context, and significantly degrades when models must access relevant information in the middle of long contexts, even for explicitly long-context models. This issue is particularly important for our expected user workflow. If a user uploads a particularly long paper, ingesting the entire paper as part of our context window will likely result in significantly variant output quality on a per domain basis. For example, the usability domain information is usually contained in the paper abstract, which is usually at the beginning of the paper and as a result, will be earlier in the context window. In this case, the generated usability domain is more likely to contain quality information whereas the description domain captures specific workflow information that is usually contained in the middle of a paper. In this case, if the required information is on page 5 of a 10 page paper, we can expect lower quality information in the generated description domain. The RAG will help ensure that our context window stays manageable, by avoiding complete ingestion of the paper in one-shot. Instead, the paper will be indexed and intelligently queried prior to each prompt to ensure our context window is manageable, precise, and relevant. "},{"location":"#training-data-and-false-extrapolation","title":"Training Data and False Extrapolation","text":"LLMs are also highly sensitive to the quality of the training data. A study from Microsoft Research titled Textbooks Are All You Need demonstrated the impact of high-quality training data in output quality, specifically with regard to proficiency in code-generation tasks. By crafting \"textbook quality\" data we were able to train a model that surpasses almost all open-source models on coding benchmarks such as HumanEval and MBPP despite being 10x smaller in model size and 100x smaller in dataset size. When explicit facts aren't availble, standalone LLMs can extrapolate fabricated outputs resulting in confident, but false output. Since we are leveraging existing pre-trained LLMs and do not have the resources to control the training data specificity and quality, we can leverage a RAG framework to supplement our requests with up-to-date, accurate, and relevant information. Rather than relying on the LLM to extrapolate itself, we can supply it with the exact related information it needs to parse, format, and summarize. "},{"location":"aggregator/","title":"In-Progress Documentation","text":"Handles the in progress documentation generator. "},{"location":"aggregator/#aggregator.aggregator.Aggregator","title":"Aggregator ","text":"Classs to handle the in progress documentation of a repository. Processes the work done so far in a code repository and generates plain text documentation on the project that resembles a plain text Biocompute Object. Attributes: Name Type Description path str Path to the directory to process. 
include str Comma delimited list of glob patterns to include in processing. exclude str Comma delimited list of glob patterns to exclude in processing. include_priority bool Determines whether to prioritize the include or exclude pattern in the case that include and exclude patterns conflict. exclude_from_tree bool Whether to exclude excluded files from the source tree path for prompt generation. client OpenAI OpenAI API client. encoding Encoding The encoding for the LLM. Source code in aggregator/aggregator.py class Aggregator:\n \"\"\"Classs to handle the in progress documentation of a repository. Processes the work done so far in a\n code repository and generates plain text documentation on the project that resembles a plain text Biocompute\n Object.\n\n Attributes\n ----------\n path: str\n Path to the directory to process.\n include: str\n Comma delimited list of glob patterns to include in processing.\n exclude: str\n Comma delimited list of glob patterns to exclude in processing.\n include_priority : bool\n Determines whether to prioritize the include or exclude pattern\n in the case that include and exclude patterns conflict.\n exclude_from_tree : bool\n Whether to exclude excluded files from the source tree path for\n prompt generation.\n client : OpenAI\n OpenAI API client.\n encoding : Encoding\n The encoding for the LLM.\n \"\"\"\n\n def __init__(\n self,\n path: str,\n include: Optional[str],\n exclude: Optional[str],\n include_priority: bool = False,\n exclude_from_tree: bool = False,\n ):\n \"\"\"Constructor.\n\n Parameters\n ----------\n path: str\n Path to the directory to process.\n include: str\n Comma delimited list of glob patterns to include in processing.\n exclude: str\n Comma delimited list of glob patterns to exclude in processing.\n include_priority : bool, optional\n Determines whether to prioritize the include or exclude pattern\n in the case that include and exclude patterns conflict.\n exclude_from_tree : bool, optional\n Whether to exclude excluded files from the source tree path for\n prompt generation.\n \"\"\"\n load_dotenv()\n self.path = path\n self.include = include\n self.exclude = exclude\n self.include_priority = include_priority\n self.exclude_from_tree = exclude_from_tree\n self.client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n self.encoding = tiktoken.encoding_for_model(MODEL)\n\n host_os = platform.system().lower()\n machine = platform.machine().lower()\n if host_os not in BINARY_MAP:\n graceful_exit(1, f\"OS `{os}` not supported.\")\n if machine not in BINARY_MAP[host_os]:\n graceful_exit(1, f\"{os} architecture for `{machine}` not supported.\")\n\n self._binary_path = os.path.join(\n os.path.dirname(__file__), \"binaries\", BINARY_MAP[host_os][machine]\n )\n\n def get_prompt(self) -> str:\n \"\"\"Calls the codeprompt binary and generates the LLM prompt.\"\"\"\n cmd = [self._binary_path, self.path]\n\n if self.include:\n cmd.extend([\"--include\", f\"{self.include}\"])\n if self.exclude:\n cmd.extend([\"--exclude\", f\"{self.exclude}\"])\n if self.include_priority:\n cmd.append(\"--include-priority\")\n if self.exclude_from_tree:\n cmd.extend([\"--exclude-from-tree\"])\n cmd.extend([\"--output\", PROMPT_PATH])\n cmd.extend([\"-t\", os.path.join(os.path.dirname(__file__), \"template.hbs\")])\n cmd.append(\"--no-clipboard\")\n cmd.append(\"--spinner\")\n cmd.append(\"--line-numbers\")\n cmd.append(\"--tokens\")\n\n try:\n result = subprocess.run(cmd, capture_output=True, text=True, check=True)\n return result.stdout\n except 
subprocess.CalledProcessError as e:\n error_msg = (\n f\"Command '{e.cmd}' returned non-zero exit status {e.returncode}.\"\n )\n error_msg += f\"\\nError output:\\n{e.stderr}\"\n except Exception as e:\n error_msg = f\"Unexpected error in generating prompt.\\n{e}\"\n graceful_exit(1, error_msg)\n\n def generate_summary(self) -> None:\n \"\"\"Entry point for generating the LLM documentation.\"\"\"\n if not os.path.isfile(PROMPT_PATH):\n graceful_exit(1, f\"No prompt found at `{PROMPT_PATH}`.\")\n with open(PROMPT_PATH, \"r\") as f:\n prompt = f.read()\n\n tokens = self._count_tokens(prompt)\n token_count = len(tokens)\n print(f\"Total prompt token count: {token_count}\")\n\n if token_count <= MAX_TOKENS:\n response = self._process_prompt(prompt)\n else:\n print(\n f\"Warning: Prompt size exceeds the max tokens limit ({MAX_TOKENS}), response will still be generated but will likely be somewhat degraded in quality. Consider limiting the include patterns.\"\n )\n chunks = self._split_prompt(tokens, token_count)\n responses = self._process_chunks(chunks)\n response = self._combine_responses(responses)\n\n self._write_output(response)\n\n def _process_prompt(self, prompt: str) -> str:\n \"\"\"Process a single prompt using the OpenAI API.\n\n Parameters\n ----------\n prompt : str\n The prompt to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary.\n \"\"\"\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": SYSTEM_PROMPT,\n },\n {\"role\": \"user\", \"content\": prompt},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n\n except Exception as e:\n error_msg = f\"Unexpected error in generating summary.\\n{e}\"\n graceful_exit(1, error_msg)\n\n return response_txt\n\n def _split_prompt(self, tokens: list[int], token_count: int) -> list[str]:\n \"\"\"Split a large prompt into smaller chunks that fit within the token limit.\n\n Parameters\n ----------\n\n Returns\n -------\n list[str]\n A list of prompt chunks, each within the token limit.\n \"\"\"\n print(\"Splitting prompt...\")\n chunks = []\n\n start = 0\n while start < token_count:\n end = min(start + MAX_TOKENS, token_count)\n if end < token_count:\n split_range = max(10, int(MAX_TOKENS * 0.1))\n for i in range(end, end - split_range, -1):\n if tokens[i] == self.encoding.encode(\"\\n\")[0]:\n end = i + 1\n break\n\n chunk_tokens = tokens[start:end]\n chunks.append(self.encoding.decode(chunk_tokens))\n start = end\n\n print(f\"Split into {len(chunks)} chunks\")\n return chunks\n\n def _process_chunks(self, chunks: list[str]) -> list[str]:\n \"\"\"Process multiple prompt chunks and combine their responses.\n\n Parameters\n ----------\n chunks : list[str]\n A list of prompt chunks to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary for any chunk.\n \"\"\"\n responses: list[str] = []\n for i, chunk in enumerate(chunks):\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": f\"{SYSTEM_PROMPT} This is part {i + 1} of {len(chunks)}.\",\n },\n {\"role\": \"user\", \"content\": chunk},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n responses.append(response_txt)\n except Exception as e:\n graceful_exit(\n 1, f\"Unexpected error in generating summary 
for chunk {i + 1}.\\n{e}\"\n )\n return responses\n\n def _combine_responses(self, responses: list[str]) -> str:\n combine_prompt = f\"\"\"\n You are tasked with combining multiple responses into cohesive BioCompute Object-like (BCO) documentation. \n The BCO-like plain text documentation should include the following domains:\n - Usability Domain\n - IO Domain\n - Description Domain\n - Execution Domain\n - Parametric Domain\n - Error Domain\n\n Here are the responses to combine:\n\n {' '.join(responses)}\n\n Please structure the information into a single, coherent BCO documentation, ensuring that:\n 1. All relevant information from the responses is included.\n 2. The information is organized under the appropriate BCO domains.\n 3. Any redundant information is removed.\n 4. The final document flows logically and reads cohesively.\n 5. If specific information for a domain isn't available, mention that in the respective section.\n\n Format the output as markdown, with each domain as a second-level header (##).\n \"\"\"\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n {\"role\": \"user\", \"content\": combine_prompt},\n ],\n )\n return (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n except Exception as e:\n graceful_exit(1, f\"{e}\\nUnexpected error in combining responses.\")\n\n def _count_tokens(self, text: str) -> list[int]:\n \"\"\"Count the number of tokens in the given text.\n\n Parameters\n ----------\n text : str\n The text to count tokens for.\n\n Returns\n -------\n list[int]\n The number of tokens in each line of the text.\n \"\"\"\n return self.encoding.encode(text)\n\n def _write_output(self, content: str) -> None:\n with open(OUTPUT_PATH, \"w\") as out_file:\n out_file.write(content)\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator.__init__","title":"__init__(path, include, exclude, include_priority=False, exclude_from_tree=False) ","text":"Constructor. Parameters: Name Type Description Default path str Path to the directory to process. required include Optional[str] Comma delimited list of glob patterns to include in processing. required exclude Optional[str] Comma delimited list of glob patterns to exclude in processing. required include_priority bool Determines whether to prioritize the include or exclude pattern in the case that include and exclude patterns conflict. False exclude_from_tree bool Whether to exclude excluded files from the source tree path for prompt generation. 
False Source code in aggregator/aggregator.py def __init__(\n self,\n path: str,\n include: Optional[str],\n exclude: Optional[str],\n include_priority: bool = False,\n exclude_from_tree: bool = False,\n):\n \"\"\"Constructor.\n\n Parameters\n ----------\n path: str\n Path to the directory to process.\n include: str\n Comma delimited list of glob patterns to include in processing.\n exclude: str\n Comma delimited list of glob patterns to exclude in processing.\n include_priority : bool, optional\n Determines whether to prioritize the include or exclude pattern\n in the case that include and exclude patterns conflict.\n exclude_from_tree : bool, optional\n Whether to exclude excluded files from the source tree path for\n prompt generation.\n \"\"\"\n load_dotenv()\n self.path = path\n self.include = include\n self.exclude = exclude\n self.include_priority = include_priority\n self.exclude_from_tree = exclude_from_tree\n self.client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n self.encoding = tiktoken.encoding_for_model(MODEL)\n\n host_os = platform.system().lower()\n machine = platform.machine().lower()\n if host_os not in BINARY_MAP:\n graceful_exit(1, f\"OS `{os}` not supported.\")\n if machine not in BINARY_MAP[host_os]:\n graceful_exit(1, f\"{os} architecture for `{machine}` not supported.\")\n\n self._binary_path = os.path.join(\n os.path.dirname(__file__), \"binaries\", BINARY_MAP[host_os][machine]\n )\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator.get_prompt","title":"get_prompt() ","text":"Calls the codeprompt binary and generates the LLM prompt. Source code in aggregator/aggregator.py def get_prompt(self) -> str:\n \"\"\"Calls the codeprompt binary and generates the LLM prompt.\"\"\"\n cmd = [self._binary_path, self.path]\n\n if self.include:\n cmd.extend([\"--include\", f\"{self.include}\"])\n if self.exclude:\n cmd.extend([\"--exclude\", f\"{self.exclude}\"])\n if self.include_priority:\n cmd.append(\"--include-priority\")\n if self.exclude_from_tree:\n cmd.extend([\"--exclude-from-tree\"])\n cmd.extend([\"--output\", PROMPT_PATH])\n cmd.extend([\"-t\", os.path.join(os.path.dirname(__file__), \"template.hbs\")])\n cmd.append(\"--no-clipboard\")\n cmd.append(\"--spinner\")\n cmd.append(\"--line-numbers\")\n cmd.append(\"--tokens\")\n\n try:\n result = subprocess.run(cmd, capture_output=True, text=True, check=True)\n return result.stdout\n except subprocess.CalledProcessError as e:\n error_msg = (\n f\"Command '{e.cmd}' returned non-zero exit status {e.returncode}.\"\n )\n error_msg += f\"\\nError output:\\n{e.stderr}\"\n except Exception as e:\n error_msg = f\"Unexpected error in generating prompt.\\n{e}\"\n graceful_exit(1, error_msg)\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator.generate_summary","title":"generate_summary() ","text":"Entry point for generating the LLM documentation. Source code in aggregator/aggregator.py def generate_summary(self) -> None:\n \"\"\"Entry point for generating the LLM documentation.\"\"\"\n if not os.path.isfile(PROMPT_PATH):\n graceful_exit(1, f\"No prompt found at `{PROMPT_PATH}`.\")\n with open(PROMPT_PATH, \"r\") as f:\n prompt = f.read()\n\n tokens = self._count_tokens(prompt)\n token_count = len(tokens)\n print(f\"Total prompt token count: {token_count}\")\n\n if token_count <= MAX_TOKENS:\n response = self._process_prompt(prompt)\n else:\n print(\n f\"Warning: Prompt size exceeds the max tokens limit ({MAX_TOKENS}), response will still be generated but will likely be somewhat degraded in quality. 
Consider limiting the include patterns.\"\n )\n chunks = self._split_prompt(tokens, token_count)\n responses = self._process_chunks(chunks)\n response = self._combine_responses(responses)\n\n self._write_output(response)\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._process_prompt","title":"_process_prompt(prompt) ","text":"Process a single prompt using the OpenAI API. Parameters: Name Type Description Default prompt str The prompt to be processed. required Raises: Type Description Exception If there's an unexpected error in generating the summary. Source code in aggregator/aggregator.py def _process_prompt(self, prompt: str) -> str:\n \"\"\"Process a single prompt using the OpenAI API.\n\n Parameters\n ----------\n prompt : str\n The prompt to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary.\n \"\"\"\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": SYSTEM_PROMPT,\n },\n {\"role\": \"user\", \"content\": prompt},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n\n except Exception as e:\n error_msg = f\"Unexpected error in generating summary.\\n{e}\"\n graceful_exit(1, error_msg)\n\n return response_txt\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._split_prompt","title":"_split_prompt(tokens, token_count) ","text":"Split a large prompt into smaller chunks that fit within the token limit. Parameters: Name Type Description Default Returns required list A list of prompt chunks, each within the token limit. required Source code in aggregator/aggregator.py def _split_prompt(self, tokens: list[int], token_count: int) -> list[str]:\n \"\"\"Split a large prompt into smaller chunks that fit within the token limit.\n\n Parameters\n ----------\n\n Returns\n -------\n list[str]\n A list of prompt chunks, each within the token limit.\n \"\"\"\n print(\"Splitting prompt...\")\n chunks = []\n\n start = 0\n while start < token_count:\n end = min(start + MAX_TOKENS, token_count)\n if end < token_count:\n split_range = max(10, int(MAX_TOKENS * 0.1))\n for i in range(end, end - split_range, -1):\n if tokens[i] == self.encoding.encode(\"\\n\")[0]:\n end = i + 1\n break\n\n chunk_tokens = tokens[start:end]\n chunks.append(self.encoding.decode(chunk_tokens))\n start = end\n\n print(f\"Split into {len(chunks)} chunks\")\n return chunks\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._process_chunks","title":"_process_chunks(chunks) ","text":"Process multiple prompt chunks and combine their responses. Parameters: Name Type Description Default chunks list[str] A list of prompt chunks to be processed. required Raises: Type Description Exception If there's an unexpected error in generating the summary for any chunk. 
Source code in aggregator/aggregator.py def _process_chunks(self, chunks: list[str]) -> list[str]:\n \"\"\"Process multiple prompt chunks and combine their responses.\n\n Parameters\n ----------\n chunks : list[str]\n A list of prompt chunks to be processed.\n\n Raises\n ------\n Exception\n If there's an unexpected error in generating the summary for any chunk.\n \"\"\"\n responses: list[str] = []\n for i, chunk in enumerate(chunks):\n try:\n response = self.client.chat.completions.create(\n model=MODEL,\n messages=[\n {\n \"role\": \"system\",\n \"content\": f\"{SYSTEM_PROMPT} This is part {i + 1} of {len(chunks)}.\",\n },\n {\"role\": \"user\", \"content\": chunk},\n ],\n )\n response_txt = (\n response.choices[0].message.content\n if response.choices[0].message.content\n else \"\"\n )\n responses.append(response_txt)\n except Exception as e:\n graceful_exit(\n 1, f\"Unexpected error in generating summary for chunk {i + 1}.\\n{e}\"\n )\n return responses\n "},{"location":"aggregator/#aggregator.aggregator.Aggregator._count_tokens","title":"_count_tokens(text) ","text":"Count the number of tokens in the given text. Parameters: Name Type Description Default text str The text to count tokens for. required Returns: Type Description list[int] The number of tokens in each line of the text. Source code in aggregator/aggregator.py def _count_tokens(self, text: str) -> list[int]:\n \"\"\"Count the number of tokens in the given text.\n\n Parameters\n ----------\n text : str\n The text to count tokens for.\n\n Returns\n -------\n list[int]\n The number of tokens in each line of the text.\n \"\"\"\n return self.encoding.encode(text)\n "},{"location":"app-start/","title":"App Start","text":"Handles the app initilization procedure. "},{"location":"app-start/#evaluator.backend.app_start.initialization","title":"initialization() ","text":"Handles the app initialization process. 
Source code in evaluator/backend/app_start.py def initialization() -> AppAttributes:\n \"\"\"Handles the app initialization process.\"\"\"\n _config_data = _load_config_data()\n if _config_data is None:\n misc_fns.graceful_exit(1, \"Error loading frontend configuration data.\")\n\n logger = misc_fns.setup_root_logger(\n log_path=_config_data[\"logger_path\"], name=_config_data[\"logger_name\"]\n )\n logger.info(\n \"################################## RUN START ##################################\"\n )\n\n _raw_directory_paths = glob(\n os.path.join(\n _config_data[\"generated_output_dir_path\"], _config_data[\"glob_pattern\"]\n )\n )\n directory_paths = [\n x\n for x in _raw_directory_paths\n if not any(y in x for y in _config_data[\"ignore_files\"])\n ]\n\n # load in existing evaluation data\n bco_results_data = misc_fns.load_json(\n os.path.join(\n _config_data[\"results_dir_path\"], _config_data[\"bco_results_file_name\"]\n )\n )\n user_results_data = misc_fns.load_json(\n os.path.join(\n _config_data[\"results_dir_path\"], _config_data[\"user_results_file_name\"]\n )\n )\n users_data = misc_fns.load_json(\n os.path.join(_config_data[\"results_dir_path\"], _config_data[\"users_file_name\"])\n )\n if bco_results_data is None or user_results_data is None or users_data is None:\n misc_fns.graceful_exit(1, \"Error loading results files.\")\n\n bco_results_data = _create_paper_keys(directory_paths, bco_results_data)\n\n app_attrs = create_app_attributes(\n logger=logger,\n results_dir_path=_config_data[\"results_dir_path\"],\n bco_results_file_name=_config_data[\"bco_results_file_name\"],\n bco_results_data=bco_results_data,\n user_results_file_name=_config_data[\"user_results_file_name\"],\n user_results_data=user_results_data,\n users_file_name=_config_data[\"users_file_name\"],\n users_data=users_data,\n generated_output_dir_root=_config_data[\"generated_output_dir_path\"],\n generated_directory_paths=directory_paths,\n padding=_config_data[\"padding\"],\n font=_config_data[\"font\"],\n )\n\n return app_attrs\n "},{"location":"app-start/#evaluator.backend.app_start.create_init_run_state","title":"create_init_run_state(app_state) ","text":"Creates the init run state. Parameters: Name Type Description Default app_state AppState The current app state. required Returns: Type Description RunState The intial run state. Source code in evaluator/backend/app_start.py def create_init_run_state(app_state: AppState) -> RunState:\n \"\"\"Creates the init run state.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n\n Returns\n -------\n RunState\n The intial run state.\n \"\"\"\n total_runs = _get_total_runs(app_state)\n run_state = load_run_state(run_index=0, total_runs=total_runs, app_state=app_state)\n return run_state\n "},{"location":"app-start/#evaluator.backend.app_start._get_total_runs","title":"_get_total_runs(app_state) ","text":"Get the total number of runs in the output directory. Parameters: Name Type Description Default app_state AppState The current app state. required Returns: Type Description int The number of total potential generated domains to evaluate. 
Source code in evaluator/backend/app_start.py def _get_total_runs(app_state: AppState) -> int:\n \"\"\"Get the total number of runs in the output directory.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n\n Returns\n -------\n int\n The number of total potential generated domains\n to evaluate.\n \"\"\"\n total_runs = 0\n for directory in app_state[\"generated_directory_paths\"]:\n output_map = misc_fns.load_json(os.path.join(directory, \"output_map.json\"))\n if output_map is None:\n misc_fns.graceful_exit(\n 1,\n f\"Error: Output map not found in directory `{directory}` while calculating total runs.\",\n )\n for domain in output_map:\n for domain_param_set in output_map[domain]:\n total_runs += len(domain_param_set[\"entries\"][\"runs\"])\n return total_runs\n "},{"location":"app-start/#evaluator.backend.app_start._create_paper_keys","title":"_create_paper_keys(directory_paths, bco_results_data) ","text":"Creates an entry for each paper in the evaluations file. Parameters: Name Type Description Default directory_paths list[str] Path to the generated BCO directories. required bco_results_data dict The loaded BCO evaluations results file. required Returns: Type Description dict The updated BCO evaluations data. Source code in evaluator/backend/app_start.py def _create_paper_keys(directory_paths: list[str], bco_results_data: dict) -> dict:\n \"\"\"Creates an entry for each paper in the evaluations file.\n\n Parameters\n ----------\n directory_paths : list [str]\n Path to the generated BCO directories.\n bco_results_data : dict\n The loaded BCO evaluations results file.\n\n Returns\n -------\n dict\n The updated BCO evaluations data.\n \"\"\"\n directory_basenames = [os.path.basename(x) for x in directory_paths]\n for paper in directory_basenames:\n if paper not in bco_results_data:\n bco_results_data[paper] = {}\n return bco_results_data\n "},{"location":"app-start/#evaluator.backend.app_start._load_config_data","title":"_load_config_data(filepath='./evaluator/backend/conf.json') ","text":"Loads the App configuration data. Parameters: Name Type Description Default filepath str Filepath to the App config data. './evaluator/backend/conf.json' Returns: Type Description ConfigData | None The configuration data on success, None on error. Source code in evaluator/backend/app_start.py def _load_config_data(\n filepath: str = \"./evaluator/backend/conf.json\",\n) -> Optional[ConfigData]:\n \"\"\"Loads the App configuration data.\n\n Parameters\n ----------\n filepath : str, optional\n Filepath to the App config data.\n\n Returns\n -------\n ConfigData | None\n The configuration data on success, None on error.\n \"\"\"\n naive_load_data = misc_fns.load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n config_object = cast(ConfigData, naive_load_data)\n return config_object\n return None\n "},{"location":"app/","title":"App","text":""},{"location":"app/#evaluator.frontend.app.App","title":"App ","text":" Bases: CTk Frontend for evaluating generated BCO domains from BcoRag. 
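A minimal launch sketch, assuming the class is importable from `evaluator.frontend.app` as the documented source location suggests:

```python
# Assumes the class is importable from evaluator.frontend.app, per the
# documented source location.
from evaluator.frontend.app import App

if __name__ == "__main__":
    app = App()   # runs app_start.initialization() and shows the login screen
    app.start()   # enters the customtkinter main loop
```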
Source code in evaluator/frontend/app.py class App(ctk.CTk):\n \"\"\"Frontend for evaluating generated BCO domains from\n BcoRag.\n \"\"\"\n\n def __init__(self):\n \"\"\"Constructor.\"\"\"\n super().__init__()\n init_data = app_start.initialization()\n\n self.attributes = init_data\n\n self.title(\"BCO RAG Evaluator\")\n self.geometry(f\"{1920}x{1080}\")\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(0, weight=1)\n\n self.login_screen = LoginScreen(\n master=self,\n attributes=self.attributes,\n on_login=login,\n on_login_success=self._login_success,\n on_exit=misc.exit_app,\n )\n\n def start(self):\n \"\"\"Start the app main loop.\"\"\"\n self.mainloop()\n\n def navigate(\n self, direction: Literal[-1, 1], run_index: int, app_state: AppState\n ) -> None:\n \"\"\"Callback to execute when the user presses\n the next or previous buttons.\n\n Parameters\n ----------\n direction : -1 or 1\n Indicates the direction the user is navigating,\n -1 for previous, 1 for next.\n run_index : int\n The new run index being navigated to.\n app_state : AppState\n The current app state.\n \"\"\"\n self.app_state = app_state\n updated_run_state = state.load_run_state(\n run_index=run_index,\n total_runs=self.run[\"total_runs\"],\n app_state=self.app_state,\n )\n self.view_page.update_state(\n app_state=self.app_state, run_state=updated_run_state\n )\n\n def _login_success(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on login success.\"\"\"\n self.app_state = app_state\n self.login_screen.grid_forget()\n self.intermediate_screen = IntermediateScreen(\n master=self, on_start=self._on_start, app_state=self.app_state\n )\n\n def _on_start(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on evaluation start.\"\"\"\n self.intermediate_screen.grid_forget()\n self.app_state = app_state\n # create init run state\n init_run_state = app_start.create_init_run_state(app_state)\n self.run = init_run_state\n self.view_page = ViewPage(\n master=self,\n app_state=self.app_state,\n run_state=init_run_state,\n navigate=self.navigate,\n on_save=state.save_state,\n on_exit=misc.exit_app,\n )\n "},{"location":"app/#evaluator.frontend.app.App.__init__","title":"__init__() ","text":"Constructor. Source code in evaluator/frontend/app.py def __init__(self):\n \"\"\"Constructor.\"\"\"\n super().__init__()\n init_data = app_start.initialization()\n\n self.attributes = init_data\n\n self.title(\"BCO RAG Evaluator\")\n self.geometry(f\"{1920}x{1080}\")\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(0, weight=1)\n\n self.login_screen = LoginScreen(\n master=self,\n attributes=self.attributes,\n on_login=login,\n on_login_success=self._login_success,\n on_exit=misc.exit_app,\n )\n "},{"location":"app/#evaluator.frontend.app.App.start","title":"start() ","text":"Start the app main loop. Source code in evaluator/frontend/app.py def start(self):\n \"\"\"Start the app main loop.\"\"\"\n self.mainloop()\n "},{"location":"app/#evaluator.frontend.app.App.navigate","title":"navigate(direction, run_index, app_state) ","text":"Callback to execute when the user presses the next or previous buttons. Parameters: Name Type Description Default direction -1 or 1 Indicates the direction the user is navigating, -1 for previous, 1 for next. required run_index int The new run index being navigated to. required app_state AppState The current app state. 
required Source code in evaluator/frontend/app.py def navigate(\n self, direction: Literal[-1, 1], run_index: int, app_state: AppState\n) -> None:\n \"\"\"Callback to execute when the user presses\n the next or previous buttons.\n\n Parameters\n ----------\n direction : -1 or 1\n Indicates the direction the user is navigating,\n -1 for previous, 1 for next.\n run_index : int\n The new run index being navigated to.\n app_state : AppState\n The current app state.\n \"\"\"\n self.app_state = app_state\n updated_run_state = state.load_run_state(\n run_index=run_index,\n total_runs=self.run[\"total_runs\"],\n app_state=self.app_state,\n )\n self.view_page.update_state(\n app_state=self.app_state, run_state=updated_run_state\n )\n "},{"location":"app/#evaluator.frontend.app.App._login_success","title":"_login_success(app_state) ","text":"Callback to execute on login success. Source code in evaluator/frontend/app.py def _login_success(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on login success.\"\"\"\n self.app_state = app_state\n self.login_screen.grid_forget()\n self.intermediate_screen = IntermediateScreen(\n master=self, on_start=self._on_start, app_state=self.app_state\n )\n "},{"location":"app/#evaluator.frontend.app.App._on_start","title":"_on_start(app_state) ","text":"Callback to execute on evaluation start. Source code in evaluator/frontend/app.py def _on_start(self, app_state: AppState) -> None:\n \"\"\"Callback to execute on evaluation start.\"\"\"\n self.intermediate_screen.grid_forget()\n self.app_state = app_state\n # create init run state\n init_run_state = app_start.create_init_run_state(app_state)\n self.run = init_run_state\n self.view_page = ViewPage(\n master=self,\n app_state=self.app_state,\n run_state=init_run_state,\n navigate=self.navigate,\n on_save=state.save_state,\n on_exit=misc.exit_app,\n )\n "},{"location":"base-evaluation-frame/","title":"Base Class","text":"Base evaluation frame, enforces the update state and get results methods. "},{"location":"base-evaluation-frame/#evaluator.frontend.components.evaluation_frames.evaluation_parent.EvaluationBaseFrame","title":"EvaluationBaseFrame ","text":" Bases: ABC , Generic[T] Source code in evaluator/frontend/components/evaluation_frames/evaluation_parent.py class EvaluationBaseFrame(ABC, Generic[T]):\n\n @abstractmethod\n def __init__(self, master, app_state: AppState, run_state: RunState, **kwargs):\n pass\n\n @abstractmethod\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Upate the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n pass\n\n @abstractmethod\n def get_results(self) -> T:\n \"\"\"Gets the results for the current state of the evaluation frame.\n\n Returns\n -------\n T\n The specific evaluation TypedDict for the frame.\n \"\"\"\n pass\n "},{"location":"base-evaluation-frame/#evaluator.frontend.components.evaluation_frames.evaluation_parent.EvaluationBaseFrame.update_state","title":"update_state(app_state, run_state) abstractmethod ","text":"Upate the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. 
required Source code in evaluator/frontend/components/evaluation_frames/evaluation_parent.py @abstractmethod\ndef update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Upate the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n pass\n "},{"location":"base-evaluation-frame/#evaluator.frontend.components.evaluation_frames.evaluation_parent.EvaluationBaseFrame.get_results","title":"get_results() abstractmethod ","text":"Gets the results for the current state of the evaluation frame. Returns: Type Description T The specific evaluation TypedDict for the frame. Source code in evaluator/frontend/components/evaluation_frames/evaluation_parent.py @abstractmethod\ndef get_results(self) -> T:\n \"\"\"Gets the results for the current state of the evaluation frame.\n\n Returns\n -------\n T\n The specific evaluation TypedDict for the frame.\n \"\"\"\n pass\n "},{"location":"bcorag-types/","title":"Core Types","text":"The core logic custom types. Type Aliases DomainKey = Literal[\"usability\", \"io\", \"description\", \"execution\", \"parametric\", \"error\"] OptionKey = Literal[ \"loader\", \"chunking_config\", \"embedding_model\", \"vector_store\", \"similarity_top_k\", \"llm\", \"mode\"] "},{"location":"bcorag-types/#bcorag.custom_types.core_types.GitFilter","title":"GitFilter ","text":" Bases: Enum Enum delineating between the directory and file extension filters. Attributes: Name Type Description DIRECTORY int A git directory filter, represented by the value 1. FILE_EXTENSION int A file extension filter, represented by the value 2. Source code in bcorag/custom_types/core_types.py class GitFilter(Enum):\n \"\"\"Enum delineating between the directory and file extension filters.\n\n Attributes\n ----------\n DIRECTORY : int\n A git directory filter, represented by the value 1.\n FILE_EXTENSION : int\n A file extension filter, represented by the value 2.\n \"\"\"\n\n DIRECTORY = 1\n FILE_EXTENSION = 2\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.GitFilters","title":"GitFilters ","text":" Bases: TypedDict Typed dict for github loader filters. Attributes: Name Type Description filter_type FilterType The type of github filter (whether it is an include or exclude filter). filter GitFilter The filter enum specification. value list[str] The values to filter on. Source code in bcorag/custom_types/core_types.py class GitFilters(TypedDict):\n \"\"\"Typed dict for github loader filters.\n\n Attributes\n ----------\n filter_type : GithubRepositoryReader.FilterType\n The type of github filter (whether it is an include or exclude filter).\n filter : GitFilter\n The filter enum specification.\n value : list[str]\n The values to filter on.\n \"\"\"\n\n filter_type: GithubRepositoryReader.FilterType\n filter: GitFilter\n value: list[str]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.GitData","title":"GitData ","text":" Bases: TypedDict Typed dict for the optional git repo information. Attributes: Name Type Description user str The github repo owner. repo str The repo name. branch str The repo branch to index. filters list[GitFilters] The list of filters to apply. 
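A sketch of assembling a `GitData` entry with one directory filter and one file extension filter, assuming the `create_git_filters` and `create_git_data` constructors documented below are importable from `bcorag.custom_types.core_types` and that `GithubRepositoryReader` comes from the llama-index GitHub reader package; the owner, repo, branch, and filter values are hypothetical:

```python
# Import paths are assumed from the documented source locations; the owner,
# repo, branch, and filter values are hypothetical.
from llama_index.readers.github import GithubRepositoryReader

from bcorag.custom_types.core_types import (
    GitFilter,
    create_git_data,
    create_git_filters,
)

directory_filter = create_git_filters(
    filter_type=GithubRepositoryReader.FilterType.INCLUDE,
    filter=GitFilter.DIRECTORY,
    value=["docs", "src"],
)
extension_filter = create_git_filters(
    filter_type=GithubRepositoryReader.FilterType.EXCLUDE,
    filter=GitFilter.FILE_EXTENSION,
    value=[".png", ".ipynb"],
)

git_data = create_git_data(
    user="example-user",
    repo="example-repo",
    branch="main",
    filters=[directory_filter, extension_filter],
)
```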
Source code in bcorag/custom_types/core_types.py class GitData(TypedDict):\n \"\"\"Typed dict for the optional git repo information.\n\n Attributes\n ----------\n user : str\n The github repo owner.\n repo : str\n The repo name.\n branch : str\n The repo branch to index.\n filters : list[GitFilters]\n The list of filters to apply.\n \"\"\"\n\n user: str\n repo: str\n branch: str\n # TODO : can we refactor this for a tuple?\n filters: list[GitFilters]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.UserSelections","title":"UserSelections ","text":" Bases: TypedDict Types dict for the user selections. Attributes: Name Type Description llm str The LLM to use. embedding_model str The embedding model to use. filename str The file name of the paper being processed. filepath str The file path to the paper being processed. vector_store str The vector store to use. loader str The data loader to ingest the paper with. mode str The run mode. similarity_top_k int The base integer used to calculate the similarity_top_k and top_n values. chunking_config str The chunking configuration to use during node parsing. git_data Optional[GitData] The optional github repository information to include in the documents. other_docs Optional[list[str]] The file path to any additional documentation to include in the documents. Source code in bcorag/custom_types/core_types.py class UserSelections(TypedDict):\n \"\"\"Types dict for the user selections.\n\n Attributes\n ----------\n llm : str\n The LLM to use.\n embedding_model : str\n The embedding model to use.\n filename : str\n The file name of the paper being processed.\n filepath : str\n The file path to the paper being processed.\n vector_store : str\n The vector store to use.\n loader : str\n The data loader to ingest the paper with.\n mode : str\n The run mode.\n similarity_top_k : int\n The base integer used to calculate the similarity_top_k and top_n values.\n chunking_config : str\n The chunking configuration to use during node parsing.\n git_data : Optional[GitData]\n The optional github repository information to include in the documents.\n other_docs : Optional[list[str]]\n The file path to any additional documentation to include in the documents.\n \"\"\"\n\n llm: str\n embedding_model: str\n filename: str\n filepath: str\n vector_store: str\n loader: str\n mode: str\n similarity_top_k: int\n chunking_config: str\n git_data: Optional[GitData]\n other_docs: Optional[list[str]]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.SourceNode","title":"SourceNode ","text":" Bases: TypedDict Holds the source node information for one node. Attributes: Name Type Description node_id str content str metdata str score str Source code in bcorag/custom_types/core_types.py class SourceNode(TypedDict):\n \"\"\"Holds the source node information for one node.\n\n Attributes\n ----------\n node_id : str\n content : str\n metdata : str\n score : str\n \"\"\"\n\n node_id: str\n content: str\n metadata: str\n score: str\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.DomainContent","title":"DomainContent ","text":" Bases: TypedDict Holds the most recent generated domain for in memory storage. 
Attributes: Name Type Description usability Optional[str] io Optional[str] description Optional[str] execution Optional[str] parametric Optional[str] error Optional[list[str]] Source code in bcorag/custom_types/core_types.py class DomainContent(TypedDict):\n \"\"\"Holds the most recent generated domain for in memory storage.\n\n Attributes\n ----------\n usability: Optional[str]\n io: Optional[str]\n description: Optional[str]\n execution: Optional[str]\n parametric: Optional[str]\n error: Optional[list[str]]\n \"\"\"\n\n usability: Optional[str]\n io: Optional[str]\n description: Optional[str]\n execution: Optional[str]\n parametric: Optional[str]\n error: Optional[str]\n last_source_nodes: Optional[list[SourceNode]]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.IndividualDomainMapEntry","title":"IndividualDomainMapEntry ","text":" Bases: TypedDict Information for one domain to prompt and process the user domain choice. Attributes: Name Type Description prompt str The prompt to use for querying the RAG pipeline for a specific domain generation. top_level bool Whether the specified domain includes object's defined in the top level JSON schema. user_prompt str The prompt string to display to the user. code str The short hand code for choosing the domain. dependencies list[DomainKey] The domain dependencies. Source code in bcorag/custom_types/core_types.py class IndividualDomainMapEntry(TypedDict):\n \"\"\"Information for one domain to prompt and process the user domain choice.\n\n Attributes\n ----------\n prompt : str\n The prompt to use for querying the RAG pipeline for a specific domain generation.\n top_level : bool\n Whether the specified domain includes object's defined in the top level JSON schema.\n user_prompt : str\n The prompt string to display to the user.\n code : str\n The short hand code for choosing the domain.\n dependencies : list[DomainKey]\n The domain dependencies.\n \"\"\"\n\n prompt: str\n top_level: bool\n user_prompt: str\n code: str\n dependencies: list[DomainKey]\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.DomainMap","title":"DomainMap ","text":" Bases: TypedDict Domain map for processing user input. Maps the user input for the domain prompt to the prompt to use for querying the RAG pipeline. Attributes: Name Type Description usability IndividualDomainMapEntry io IndividualDomainMapEntry description IndividualDomainMapEntry execution IndividualDomainMapEntry parametric IndividualDomainMapEntry error IndividualDomainMapEntry Source code in bcorag/custom_types/core_types.py class DomainMap(TypedDict):\n \"\"\"Domain map for processing user input. Maps the user input for\n the domain prompt to the prompt to use for querying the RAG pipeline.\n\n Attributes\n ----------\n usability : IndividualDomainMapEntry\n io: IndividualDomainMapEntry\n description: IndividualDomainMapEntry\n execution: IndividualDomainMapEntry\n parametric: IndividualDomainMapEntry\n error: IndividualDomainMapEntry\n \"\"\"\n\n usability: IndividualDomainMapEntry\n io: IndividualDomainMapEntry\n description: IndividualDomainMapEntry\n execution: IndividualDomainMapEntry\n parametric: IndividualDomainMapEntry\n error: IndividualDomainMapEntry\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.OptionSchema","title":"OptionSchema ","text":" Bases: TypedDict Schema for a config object option entry in the config JSON file. Attributes: Name Type Description list list[str] The list of options to choose from. default str The option to use as the default. 
documentation str The link to the documentation for the option. Source code in bcorag/custom_types/core_types.py class OptionSchema(TypedDict):\n \"\"\"Schema for a config object option entry in the config JSON file.\n\n Attributes\n ----------\n list : list[str]\n The list of options to choose from.\n default : str\n The option to use as the default.\n documentation : str\n The link to the documentation for the option.\n \"\"\"\n\n list: list[str]\n default: str\n documentation: str\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.ConfigObjectOptions","title":"ConfigObjectOptions ","text":" Bases: TypedDict Schema for the supported options. Attributes: Name Type Description loader OptionSchema chunking_config OptionSchema embedding_model OptionSchema vector_store OptionSchema similarity_top_k OptionSchema llm OptionSchema mode OptionSchema Source code in bcorag/custom_types/core_types.py class ConfigObjectOptions(TypedDict):\n \"\"\"Schema for the supported options.\n\n Attributes\n ----------\n loader : OptionSchema\n chunking_config : OptionSchema\n embedding_model: OptionSchema\n vector_store: OptionSchema\n similarity_top_k: OptionSchema\n llm: OptionSchema\n mode: OptionSchema\n \"\"\"\n\n loader: OptionSchema\n chunking_config: OptionSchema\n embedding_model: OptionSchema\n vector_store: OptionSchema\n similarity_top_k: OptionSchema\n llm: OptionSchema\n mode: OptionSchema\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.ConfigObject","title":"ConfigObject ","text":" Bases: TypedDict Config JSON schema. Attributes: Name Type Description paper_directory str The file path to the paper's directory. options ConfigObjectOptions The supported configuration options. Source code in bcorag/custom_types/core_types.py class ConfigObject(TypedDict):\n \"\"\"Config JSON schema.\n\n Attributes\n ----------\n paper_directory : str\n The file path to the paper's directory.\n options : ConfigObjectOptions\n The supported configuration options.\n \"\"\"\n\n paper_directory: str\n options: ConfigObjectOptions\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.create_git_filters","title":"create_git_filters(filter_type, filter, value) ","text":"Constructor for the GitFilters TypedDict. Parameters: Name Type Description Default filter_type FilterType The type of github filter (whether it is an include or exclude filter). required filter GitFilter The filter enum specification. required value list[str] The values to filter on. required Returns: Type Description GitFilters Source code in bcorag/custom_types/core_types.py def create_git_filters(\n filter_type: GithubRepositoryReader.FilterType, filter: GitFilter, value: list[str]\n) -> GitFilters:\n \"\"\"Constructor for the `GitFilters` TypedDict.\n\n Parameters\n ----------\n filter_type : GithubRepositoryReader.FilterType\n The type of github filter (whether it is an include or exclude filter).\n filter : GitFilter\n The filter enum specification.\n value : list[str]\n The values to filter on.\n\n Returns\n -------\n GitFilters\n \"\"\"\n sorted_values = sorted(value)\n return_data: GitFilters = {\n \"filter_type\": filter_type,\n \"filter\": filter,\n \"value\": sorted_values,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.create_git_data","title":"create_git_data(user, repo, branch, filters=[]) ","text":"Constructor for the GitData TypedDict. Parameters: Name Type Description Default user str The github repo owner. required repo str The repo name. 
required branch str The repo branch to index. required filters list[GitFilters] The list of filters to apply. [] Returns: Type Description GitData Source code in bcorag/custom_types/core_types.py def create_git_data(\n user: str, repo: str, branch: str, filters: list[GitFilters] = []\n) -> GitData:\n \"\"\"Constructor for the `GitData` TypedDict.\n\n Parameters\n ----------\n user : str\n The github repo owner.\n repo : str\n The repo name.\n branch : str\n The repo branch to index.\n filters : list[GitFilters]\n The list of filters to apply.\n\n Returns\n -------\n GitData\n \"\"\"\n return_data: GitData = {\n \"user\": user,\n \"repo\": repo,\n \"branch\": branch,\n \"filters\": filters,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.create_user_selections","title":"create_user_selections(llm, embedding_model, filename, filepath, vector_store, loader, mode, similarity_top_k, chunking_config, git_data, other_docs) ","text":"Constructor for the UserSelections TypedDict. Parameters: Name Type Description Default llm str The LLM to use. required embedding_model str The embedding model to use. required filename str The file name of the paper being processed. required filepath str The file path to the paper being processed. required vector_store str The vector store to use. required loader str The data loader to ingest the paper with. required mode str The run mode. required similarity_top_k int The base integer used to calculate the similarity_top_k and top_n values. required chunking_config str The chunking configuration to use during node parsing. required git_data Optional[GitData] The optional github repository information to include in the documents. required other_docs Optional[list[str]] The file path to any additional documentation to include in the documents. 
required Returns: Type Description UserSelections Source code in bcorag/custom_types/core_types.py def create_user_selections(\n llm: str,\n embedding_model: str,\n filename: str,\n filepath: str,\n vector_store: str,\n loader: str,\n mode: str,\n similarity_top_k: int,\n chunking_config: str,\n git_data: Optional[GitData],\n other_docs: Optional[list[str]],\n) -> UserSelections:\n \"\"\"Constructor for the `UserSelections` TypedDict.\n\n Parameters\n ----------\n llm : str\n The LLM to use.\n embedding_model : str\n The embedding model to use.\n filename : str\n The file name of the paper being processed.\n filepath : str\n The file path to the paper being processed.\n vector_store : str\n The vector store to use.\n loader : str\n The data loader to ingest the paper with.\n mode : str\n The run mode.\n similarity_top_k : int\n The base integer used to calculate the similarity_top_k and top_n values.\n chunking_config : str\n The chunking configuration to use during node parsing.\n git_data : Optional[GitData]\n The optional github repository information to include in the documents.\n other_docs : Optional[list[str]]\n The file path to any additional documentation to include in the documents.\n\n Returns\n -------\n UserSelections\n \"\"\"\n return_data: UserSelections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": filename,\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": mode,\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n \"git_data\": git_data,\n \"other_docs\": other_docs,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.default_domain_content","title":"default_domain_content() ","text":"Creates an empty, default DomainContent TypedDict. Returns: Type Description DomainContent Source code in bcorag/custom_types/core_types.py def default_domain_content() -> DomainContent:\n \"\"\"Creates an empty, default DomainContent TypedDict.\n\n Returns\n -------\n DomainContent\n \"\"\"\n return_data: DomainContent = {\n \"usability\": None,\n \"io\": None,\n \"description\": None,\n \"execution\": None,\n \"parametric\": None,\n \"error\": None,\n \"last_source_nodes\": None,\n }\n return return_data\n "},{"location":"bcorag-types/#bcorag.custom_types.core_types.add_source_nodes","title":"add_source_nodes(domain_content, nodes) ","text":"Adds source node data to the domain content. Parameters: Name Type Description Default domain_content DomainContent The domain content instance to add source node data to. required nodes list[NodeWithScore] List of nodes with score data. required Returns: Type Description DomainContent The updated domain content object. 
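A small sketch of how `default_domain_content` and `add_source_nodes` fit together, assuming `TextNode` and `NodeWithScore` are importable from `llama_index.core.schema` (true for recent llama-index releases) and the bcorag helpers from `bcorag.custom_types.core_types`; the node text and score are made up:

```python
# The node text, metadata, and score below are made-up example values.
from llama_index.core.schema import NodeWithScore, TextNode

from bcorag.custom_types.core_types import add_source_nodes, default_domain_content

content = default_domain_content()  # every domain field starts as None
node = TextNode(text="Methods section excerpt ...", metadata={"page_label": "4"})
scored = NodeWithScore(node=node, score=0.87)

content = add_source_nodes(domain_content=content, nodes=[scored])
print(content["last_source_nodes"][0]["score"])  # "0.87"
```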
Source code in bcorag/custom_types/core_types.py def add_source_nodes(\n domain_content: DomainContent, nodes: list[NodeWithScore]\n) -> DomainContent:\n \"\"\"Adds source node data to the domain content.\n\n Parameters\n ----------\n domain_content : DomainContent\n The domain content instance to add source node data to.\n nodes : list[NodeWithScore]\n List of nodes with score data.\n\n Returns\n -------\n DomainContent\n The updated domain content object.\n \"\"\"\n node_list: list[SourceNode] = []\n for node in nodes:\n node_list.append(\n {\n \"node_id\": node.node.node_id,\n \"content\": node.node.get_content(),\n \"metadata\": node.node.get_metadata_str(),\n \"score\": str(node.score),\n }\n )\n domain_content[\"last_source_nodes\"] = node_list\n return domain_content\n "},{"location":"bcorag/","title":"Bcorag","text":"Handles the RAG implementation using the llama-index library. "},{"location":"bcorag/#bcorag.bcorag.BcoRag","title":"BcoRag ","text":"Class to handle the RAG implementation. Attributes: Name Type Description _parameter_set_hash str The MD5 hexidecimal hash of the parameter set. _domain_map DomainMap Mapping for each domain to its standardized prompt. _file_name str The source file (paper) name. _file_path str The file path to the source file (paper). _output_path_root str Path to the specific document directory to dump the outputs. _debug bool Whether in debug mode or not. _logger Logger The document specific logger. _llm_model_name str The LLM model name. _llm_model OpenAI The Open AI LLM model instance. _embed_model_name str The embedding model name. _embed_model OpenAIEmbedding The embedding model instance. _loader str The data loader being used. _vector_store str The vector store being used. _splitter SemanticSplitterNodeParser or None The node parser (if a non-fixed chunking strategy is chosen). _similarity_top_k int The similarity top k retrieval number for node sources. _token_counter TokenCountingHandler or None The token counter handler or None if in production mode. _token_counts dict[str, int] or None The token counts or None if in production mode. _git_data GitData or None The git data or None if no github repo was included. _documents list[Documents] The list of documents (containers for the data source). _index VectorStoreIndex The vector store index instance. _query_engine RetrieverQueryEngine The query engine. _other_docs list[str] | None Any other miscellaneous documents to include in the indexing process. _domain_content DomainContent Holds the most recent generated domain. 
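Before the full class source, a hedged end-to-end sketch of driving the class using the import paths shown in these docs; the LLM, embedding model, chunking config, and file path values are illustrative stand-ins that need to match the options actually offered in the project's configuration, and `OPENAI_API_KEY` must be set in the environment:

```python
# The llm, embedding model, chunking config, and file paths below are
# illustrative stand-ins, not the project's canonical option values.
from bcorag.bcorag import BcoRag
from bcorag.custom_types.core_types import create_user_selections

selections = create_user_selections(
    llm="gpt-4-turbo",
    embedding_model="text-embedding-3-small",
    filename="example_paper.pdf",
    filepath="./papers/example_paper.pdf",
    vector_store="VectorStoreIndex",
    loader="PDFReader",
    mode="production",
    similarity_top_k=5,
    chunking_config="1024 chunk size/20 chunk overlap",
    git_data=None,
    other_docs=None,
)

rag = BcoRag(user_selections=selections, output_dir="./output")
usability_domain = rag.perform_query("usability")
print(usability_domain)
```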
Source code in bcorag/bcorag.py class BcoRag:\n \"\"\"Class to handle the RAG implementation.\n\n Attributes\n ----------\n _parameter_set_hash : str\n The MD5 hexidecimal hash of the parameter set.\n _domain_map : DomainMap\n Mapping for each domain to its standardized prompt.\n _file_name : str\n The source file (paper) name.\n _file_path : str\n The file path to the source file (paper).\n _output_path_root : str\n Path to the specific document directory to dump the outputs.\n _debug : bool\n Whether in debug mode or not.\n _logger : logging.Logger\n The document specific logger.\n _llm_model_name : str\n The LLM model name.\n _llm_model : OpenAI\n The Open AI LLM model instance.\n _embed_model_name : str\n The embedding model name.\n _embed_model : OpenAIEmbedding\n The embedding model instance.\n _loader : str\n The data loader being used.\n _vector_store : str\n The vector store being used.\n _splitter : SemanticSplitterNodeParser or None\n The node parser (if a non-fixed chunking strategy is chosen).\n _similarity_top_k : int\n The similarity top k retrieval number for node sources.\n _token_counter : TokenCountingHandler or None\n The token counter handler or None if in production mode.\n _token_counts : dict[str, int] or None\n The token counts or None if in production mode.\n _git_data : GitData or None\n The git data or None if no github repo was included.\n _documents : list[Documents]\n The list of documents (containers for the data source).\n _index : VectorStoreIndex\n The vector store index instance.\n _query_engine : RetrieverQueryEngine\n The query engine.\n _other_docs : list[str] | None\n Any other miscellaneous documents to include in the indexing process.\n _domain_content : DomainContent\n Holds the most recent generated domain.\n \"\"\"\n\n def __init__(\n self,\n user_selections: UserSelections,\n output_dir: str = \"./output\",\n ):\n \"\"\"Constructor.\n\n Parameters\n ----------\n user_selections : UserSelections\n The user configuration selections.\n output_dir : str\n The directory to dump the outputs (relative to main.py entry point\n in the repo root).\n evaluation_metrics : bool\n Whether or not to calculate Faithfulness and Relevancy metrics.\n \"\"\"\n load_dotenv()\n\n self._parameter_set_hash = self._user_selection_hash(user_selections)\n self._domain_map = DOMAIN_MAP\n self._file_name = user_selections[\"filename\"]\n self._file_path = user_selections[\"filepath\"]\n self._output_path_root = os.path.join(\n output_dir,\n os.path.splitext(self._file_name.lower().replace(\" \", \"_\").strip())[0],\n )\n self._debug = True if user_selections[\"mode\"] == \"debug\" else False\n self._logger = misc_fns.setup_document_logger(\n self._file_name.lower().strip().replace(\" \", \"_\")\n )\n self._llm_model_name = user_selections[\"llm\"]\n self._llm_model = OpenAI(model=self._llm_model_name)\n self._embed_model_name = user_selections[\"embedding_model\"]\n self._embed_model = OpenAIEmbedding(model=self._embed_model_name)\n self._loader = user_selections[\"loader\"]\n self._vector_store = user_selections[\"vector_store\"]\n self._splitter = None\n self._similarity_top_k = user_selections[\"similarity_top_k\"]\n self._chunking_config = user_selections[\"chunking_config\"]\n self._token_counter: TokenCountingHandler | None = None\n self._token_counts: dict[str, int] | None = None\n self._git_data: Optional[GitData] = (\n user_selections[\"git_data\"]\n if user_selections[\"git_data\"] is not None\n else None\n )\n self._other_docs: list[str] | None = 
user_selections[\"other_docs\"]\n self.domain_content: DomainContent = default_domain_content()\n\n openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n if not openai_api_key:\n raise EnvironmentError(\"OpenAI API key not found.\")\n\n github_token = os.getenv(\"GITHUB_TOKEN\")\n if self._git_data is not None and not github_token:\n raise EnvironmentError(\"Github token not found.\")\n\n misc_fns.check_dir(self._output_path_root)\n self._display_info(user_selections, \"User selections:\")\n\n Settings.embed_model = self._embed_model\n Settings.llm = self._llm_model\n\n match self._chunking_config:\n case \"semantic\":\n self._splitter = SemanticSplitterNodeParser.from_defaults(\n buffer_size=1,\n embed_model=self._embed_model,\n # The percentile of cosin dissimilarity that must be exceeded\n # between a group of sentences and the next to form a node. The\n # smaller this number is, the more nodes will be generated.\n breakpoint_percentile_threshold=90,\n )\n case \"256 chunk size/20 chunk overlap\":\n Settings.chunk_size = 256\n Settings.chunk_overlap = 50\n case \"512 chunk size/50 chunk overlap\":\n Settings.chunk_size = 512\n Settings.chunk_overlap = 50\n case \"2048 chunk size/50 chunk overlap\":\n Settings.chunk_size = 2048\n Settings.chunk_overlap = 50\n case _:\n Settings.chunk_size = 1024\n Settings.chunk_overlap = 20\n\n if self._debug:\n self._token_counter = TokenCountingHandler(\n tokenizer=tiktoken.encoding_for_model(self._llm_model_name).encode\n )\n Settings.callback_manager = CallbackManager([self._token_counter])\n self._token_counts = {\n \"embedding\": 0,\n \"input\": 0,\n \"output\": 0,\n \"total\": 0,\n }\n\n match self._loader:\n case \"SimpleDirectoryReader\":\n loader = SimpleDirectoryReader(input_files=[self._file_path])\n paper_documents = loader.load_data()\n case \"PDFReader\":\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # pdf_loader = download_loader(\"PDFReader\")\n pdf_loader = PDFReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n case \"PDFMarker\":\n with supress_stdout():\n pdf_loader = PDFMarkerReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n\n other_docs = []\n if self._other_docs:\n for path in self._other_docs:\n loader = SimpleDirectoryReader(input_files=[path])\n other_docs += loader.load_data()\n\n documents = paper_documents + other_docs # type: ignore\n if self._git_data is not None:\n\n github_client = GithubClient(github_token)\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # download_loader(\"GithubRepositoryReader\")\n\n directory_filter: GitFilters | None = None\n file_ext_filter: GitFilters | None = None\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = filter\n elif filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = filter\n\n git_loader = GithubRepositoryReader(\n github_client=github_client,\n owner=self._git_data[\"user\"],\n repo=self._git_data[\"repo\"],\n filter_directories=(\n (directory_filter[\"value\"], directory_filter[\"filter_type\"])\n if directory_filter is not None\n else None\n ),\n filter_file_extensions=(\n (file_ext_filter[\"value\"], file_ext_filter[\"filter_type\"])\n if file_ext_filter is not None\n else None\n ),\n )\n\n github_documents = git_loader.load_data(branch=self._git_data[\"branch\"])\n documents += github_documents\n self._logger.info(\n f\"Loading repo `{self._git_data['repo']}` from 
user `{self._git_data['user']}`\"\n )\n self._documents = documents\n\n _chunk_fixed = (\n False if user_selections[\"chunking_config\"] == \"semantic\" else True\n )\n if self._vector_store == \"VectorStoreIndex\":\n if _chunk_fixed:\n self._index = VectorStoreIndex.from_documents(self._documents)\n else:\n if self._splitter is not None:\n nodes = self._splitter.build_semantic_nodes_from_documents(\n self._documents\n )\n self._index = VectorStoreIndex(nodes=nodes)\n\n retriever = VectorIndexRetriever(\n index=self._index, similarity_top_k=self._similarity_top_k * 3\n )\n response_synthesizer = get_response_synthesizer()\n rerank_postprocessor = SentenceTransformerRerank(\n top_n=self._similarity_top_k,\n keep_retrieval_score=True,\n )\n self._query_engine = RetrieverQueryEngine(\n retriever=retriever,\n response_synthesizer=response_synthesizer,\n node_postprocessors=[rerank_postprocessor],\n )\n\n if (\n self._debug\n and self._token_counts is not None\n and self._token_counter is not None\n ):\n self._token_counts[\n \"embedding\"\n ] += self._token_counter.total_embedding_token_count\n\n def perform_query(self, domain: DomainKey) -> str:\n \"\"\"Performs a query for a specific BCO domain.\n\n Parameters\n ----------\n domain : DomainKey\n The domain being queried for.\n\n Returns\n -------\n str\n The generated domain.\n \"\"\"\n query_start_time = time.time()\n domain_prompt = self._domain_map[domain][\"prompt\"]\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is not None:\n dependency_prompt = f\"The {domain} domain is dependent on the {dependency} domain. Here is the {dependency} domain: {self.domain_content[dependency]}.\"\n domain_prompt += dependency_prompt\n query_prompt = QUERY_PROMPT.format(domain, domain_prompt)\n if self._domain_map[domain][\"top_level\"]:\n query_prompt += f\"\\n{SUPPLEMENT_PROMPT}\"\n\n response_object = self._query_engine.query(query_prompt)\n if isinstance(response_object, Response):\n response_object = Response(\n response=response_object.response,\n metadata=response_object.metadata,\n source_nodes=response_object.source_nodes,\n )\n else:\n self._logger.error(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n print(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n misc_fns.graceful_exit(1)\n query_response = str(response_object.response)\n\n self.domain_content[domain] = query_response\n self.domain_content = add_source_nodes(\n domain_content=self.domain_content, nodes=response_object.source_nodes\n )\n\n source_str = \"\"\n for idx, source_node in enumerate(response_object.source_nodes):\n source_str += f\"\\n--------------- Source Node '{idx + 1}/{len(response_object.source_nodes)}' ---------------\"\n source_str += f\"\\nNode ID: '{source_node.node.node_id}'\"\n source_str += f\"\\nRerank Score: '{source_node.score}'\"\n source_str += f\"\\nMetadata String:\\n`{source_node.node.get_metadata_str()}`\"\n source_str += (\n f\"\\nMetadata Size: `{len(source_node.node.get_metadata_str())}`\"\n )\n source_str += f\"\\nContent Size: `{len(source_node.node.get_content())}`\"\n source_str += (\n f\"\\nRetrieved Text:\\n{source_node.node.get_content().strip()}\\n\"\n )\n source_str += \"\\n\"\n\n if self._debug:\n self._display_info(query_prompt, f\"QUERY PROMPT for the {domain} domain:\")\n self._token_counts[\"input\"] += self._token_counter.prompt_llm_token_count # type: ignore\n 
self._token_counts[\"output\"] += self._token_counter.completion_llm_token_count # type: ignore\n self._token_counts[\"total\"] += self._token_counter.total_llm_token_count # type: ignore\n self._token_counts[\"embedding\"] += self._token_counter.total_embedding_token_count # type: ignore\n self._display_info(self._token_counts, \"Updated token counts:\")\n self._display_info(source_str, \"Retrieval source(s):\")\n\n query_elapsed_time = time.time() - query_start_time\n self._process_output(\n domain, query_response, source_str, round(query_elapsed_time, 2)\n )\n\n return query_response\n\n def choose_domain(\n self, automatic_query: bool = False\n ) -> Optional[tuple[DomainKey, str] | DomainKey]:\n \"\"\"Gets the user input for the domain the user wants to generate.\n\n Parameters\n ----------\n automatic_query : bool, optional\n Whether to automatically query after the user chooses a domain. If set to\n True this is a shortcut to calling `bcorag.perform_query(choose_domain())`.\n\n Returns\n -------\n (DomainKey, str) | str | None\n If automatic query is set to True will return a tuple containing the domain\n name and the query response. If automatic query is False will return the user\n chosen domain. None is returned if the user chooses to exit.\n \"\"\"\n domain_prompt = (\n \"Which domain would you like to generate? Supported domains are:\"\n )\n\n domain_user_prompt: DomainKey\n for domain_user_prompt in get_args(DomainKey):\n domain_prompt += (\n f\"\\n\\t{self._domain_map[domain_user_prompt]['user_prompt']}\"\n )\n domain_prompt += \"\\n\\tE[x]it\\n\"\n print(domain_prompt)\n\n domain_selection = None\n\n while True:\n\n domain_selection = input(\"> \").strip().lower()\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n if (\n domain_selection == domain\n or domain_selection == self._domain_map[domain][\"code\"]\n ):\n domain_selection = domain\n break\n else:\n if domain_selection == \"exit\" or domain_selection == \"x\":\n if self._debug:\n self._display_info(\n \"User selected 'exit' on the domain selection step.\"\n )\n return None\n else:\n if self._debug:\n self._display_info(\n f\"User entered unrecognized input '{domain_selection}' on domain chooser step.\"\n )\n print(\n f\"Unrecognized input {domain_selection} entered, please try again.\"\n )\n continue\n if not self._check_dependencies(domain_selection):\n print(\n f\"Dependencies for the `{domain_selection}` domain are not satisfied. Please choose another domain.\"\n )\n continue\n\n break\n\n if automatic_query:\n if self._debug:\n self._display_info(\n f\"Automatic query called on domain: '{domain_selection}'.\"\n )\n return domain_selection, self.perform_query(domain_selection)\n if self._debug:\n self._display_info(\n f\"User chose '{domain_selection}' domain with no automatic query.\"\n )\n return domain_selection\n\n def _process_output(\n self, domain: DomainKey, response: str, source_str: str, elapsed_time: float\n ):\n \"\"\"Attempts to serialize the response into a JSON object and dumps the output.\n Also dumps the raw text regardless if JSON serialization was successful. The\n file dumps are dumped to the `output` directory located in the root of this\n repo. Keeps a TSV file to track all of the domain outputs and what parameter\n set generated the results.\n\n Note: This function is getting long with some redundancy, it should be re-written\n at some point. 
It works, but is ugly.\n\n Parameters\n ----------\n domain : DomainKey\n The domain the response is for.\n response : str\n The generated response to dump.\n source_str : str\n The formatted source string for the query.\n elapsed_time : float\n The query generation elapsed time.\n \"\"\"\n\n def dump_json_response(fp: str, response_string: str) -> bool:\n if response_string.startswith(\"```json\\n\"):\n response_string = response_string.replace(\"```json\\n\", \"\").replace(\n \"```\", \"\"\n )\n self._display_info(\n response_string, f\"QUERY RESPONSE for the `{domain}` domain:\"\n )\n try:\n response_json = json.loads(response_string)\n if misc_fns.write_json(fp, response_json):\n self._logger.info(\n f\"Succesfully serialized JSON response for the `{domain}` domain.\"\n )\n return True\n except Exception as e:\n self._logger.error(\n f\"Failed to serialize the JSON response for the `{domain}` domain.\\n{e}\"\n )\n return False\n\n generated_dir = os.path.join(self._output_path_root, \"generated_domains\")\n misc_fns.check_dir(generated_dir)\n\n txt_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.txt\"\n )\n json_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.json\"\n )\n source_file_unindexed = os.path.join(\n self._output_path_root,\n \"reference_sources\",\n f\"{domain}-(index)-{self._parameter_set_hash}.txt\",\n )\n\n output_map_json = misc_fns.load_output_tracker(\n os.path.join(self._output_path_root, \"output_map.json\")\n )\n\n # Create a new output file if one doesn't exist\n if output_map_json is None:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter: OutputTrackerGitFilter | None = None\n file_ext_filter: OutputTrackerGitFilter | None = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=self._git_data[\"user\"] if self._git_data is not None else None,\n git_repo=self._git_data[\"repo\"] if self._git_data is not None else None,\n git_branch=(\n self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n 
)\n\n output_data = default_output_tracker_file()\n output_data[domain].append(domain_entry)\n\n # update output map\n else:\n\n domain_map_entries = output_map_json[domain]\n\n for domain_map_entry in domain_map_entries:\n\n # found the collision entry\n if domain_map_entry[\"hash_str\"] == self._parameter_set_hash:\n\n new_index = domain_map_entry[\"entries\"][\"curr_index\"] + 1\n domain_map_entry[\"entries\"][\"curr_index\"] = new_index\n\n txt_file = txt_file_unindexed.replace(\"(index)\", str(new_index))\n json_file = json_file_unindexed.replace(\"(index)\", str(new_index))\n source_file = source_file_unindexed.replace(\n \"(index)\", str(new_index)\n )\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n new_index,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n domain_map_entry[\"entries\"][\"runs\"].append(run_entry)\n\n break\n\n # first time parameter set run (loop didn't break)\n else:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter = None\n file_ext_filter = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=(\n self._git_data[\"user\"] if self._git_data is not None else None\n ),\n git_repo=(\n self._git_data[\"repo\"] if self._git_data is not None else None\n ),\n git_branch=(\n self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n )\n\n domain_map_entries.append(domain_entry)\n\n output_data = output_map_json\n\n misc_fns.dump_string(txt_file, response)\n misc_fns.dump_string(source_file, source_str)\n # writes the output mapping files\n misc_fns.write_json(\n os.path.join(self._output_path_root, \"output_map.json\"), output_data\n )\n misc_fns.dump_output_file_map_tsv(\n os.path.join(self._output_path_root, \"output_map.tsv\"), output_data\n )\n\n def _display_info(\n self,\n info: Optional[dict | list | str | UserSelections],\n header: Optional[str] = None,\n ):\n \"\"\"If in debug mode, handles the debug info output to the log file.\n\n Parameters\n ----------\n info : 
dict | list | str | UserSelections | None\n The object to log.\n header : str or None\n The optional header to log before the info.\n \"\"\"\n log_str = header if header is not None else \"\"\n if isinstance(info, dict):\n for key, value in info.items():\n log_str += f\"\\n\\t{key}: '{value}'\"\n elif isinstance(info, str):\n log_str += f\"{info}\" if header is None else f\"\\n{info}\"\n self._logger.info(log_str)\n\n def _user_selection_hash(self, params: UserSelections) -> str:\n \"\"\"Generates an MD5 hash of the parameter set.\n\n Parameters\n ----------\n params : UserSelections\n The user configuration selections.\n\n Returns\n -------\n str\n The hexidecimal MD5 hash.\n \"\"\"\n hash_list = []\n hash_list.append(params[\"llm\"])\n hash_list.append(params[\"embedding_model\"])\n hash_list.append(params[\"vector_store\"])\n hash_list.append(params[\"loader\"])\n hash_list.append(str(params[\"similarity_top_k\"]))\n hash_list.append(params[\"chunking_config\"])\n\n if params[\"git_data\"] is not None:\n\n hash_list.append(params[\"git_data\"][\"user\"])\n hash_list.append(params[\"git_data\"][\"repo\"])\n hash_list.append(params[\"git_data\"][\"branch\"])\n\n for filter in params[\"git_data\"][\"filters\"]:\n\n filter_type = (\n \"include\"\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else \"exclude\"\n )\n filter_str = f\"{filter_type}-{filter['value']}\"\n hash_list.append(filter_str)\n\n sorted(hash_list)\n hash_str = \"_\".join(hash_list)\n hash_hex = md5(hash_str.encode(\"utf-8\")).hexdigest()\n return hash_hex\n\n def _check_dependencies(self, domain: DomainKey) -> bool:\n \"\"\"Checks a domain's dependencies.\n\n Parameters\n ----------\n domain : DomainKey\n The domain to check.\n\n Returns\n -------\n bool\n True if dependencies are satisfied, False otherwise.\n \"\"\"\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is None:\n print(\n f\"Error: {dependency.title()} domain must be generated before the {domain.title()} domain.\"\n )\n return False\n return True\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag.__init__","title":"__init__(user_selections, output_dir='./output') ","text":"Constructor. Parameters: Name Type Description Default user_selections UserSelections The user configuration selections. required output_dir str The directory to dump the outputs (relative to main.py entry point in the repo root). './output' evaluation_metrics bool Whether or not to calculate Faithfulness and Relevancy metrics. 
required Source code in bcorag/bcorag.py def __init__(\n self,\n user_selections: UserSelections,\n output_dir: str = \"./output\",\n):\n \"\"\"Constructor.\n\n Parameters\n ----------\n user_selections : UserSelections\n The user configuration selections.\n output_dir : str\n The directory to dump the outputs (relative to main.py entry point\n in the repo root).\n evaluation_metrics : bool\n Whether or not to calculate Faithfulness and Relevancy metrics.\n \"\"\"\n load_dotenv()\n\n self._parameter_set_hash = self._user_selection_hash(user_selections)\n self._domain_map = DOMAIN_MAP\n self._file_name = user_selections[\"filename\"]\n self._file_path = user_selections[\"filepath\"]\n self._output_path_root = os.path.join(\n output_dir,\n os.path.splitext(self._file_name.lower().replace(\" \", \"_\").strip())[0],\n )\n self._debug = True if user_selections[\"mode\"] == \"debug\" else False\n self._logger = misc_fns.setup_document_logger(\n self._file_name.lower().strip().replace(\" \", \"_\")\n )\n self._llm_model_name = user_selections[\"llm\"]\n self._llm_model = OpenAI(model=self._llm_model_name)\n self._embed_model_name = user_selections[\"embedding_model\"]\n self._embed_model = OpenAIEmbedding(model=self._embed_model_name)\n self._loader = user_selections[\"loader\"]\n self._vector_store = user_selections[\"vector_store\"]\n self._splitter = None\n self._similarity_top_k = user_selections[\"similarity_top_k\"]\n self._chunking_config = user_selections[\"chunking_config\"]\n self._token_counter: TokenCountingHandler | None = None\n self._token_counts: dict[str, int] | None = None\n self._git_data: Optional[GitData] = (\n user_selections[\"git_data\"]\n if user_selections[\"git_data\"] is not None\n else None\n )\n self._other_docs: list[str] | None = user_selections[\"other_docs\"]\n self.domain_content: DomainContent = default_domain_content()\n\n openai_api_key = os.getenv(\"OPENAI_API_KEY\")\n if not openai_api_key:\n raise EnvironmentError(\"OpenAI API key not found.\")\n\n github_token = os.getenv(\"GITHUB_TOKEN\")\n if self._git_data is not None and not github_token:\n raise EnvironmentError(\"Github token not found.\")\n\n misc_fns.check_dir(self._output_path_root)\n self._display_info(user_selections, \"User selections:\")\n\n Settings.embed_model = self._embed_model\n Settings.llm = self._llm_model\n\n match self._chunking_config:\n case \"semantic\":\n self._splitter = SemanticSplitterNodeParser.from_defaults(\n buffer_size=1,\n embed_model=self._embed_model,\n # The percentile of cosin dissimilarity that must be exceeded\n # between a group of sentences and the next to form a node. 
The\n # smaller this number is, the more nodes will be generated.\n breakpoint_percentile_threshold=90,\n )\n case \"256 chunk size/20 chunk overlap\":\n Settings.chunk_size = 256\n Settings.chunk_overlap = 50\n case \"512 chunk size/50 chunk overlap\":\n Settings.chunk_size = 512\n Settings.chunk_overlap = 50\n case \"2048 chunk size/50 chunk overlap\":\n Settings.chunk_size = 2048\n Settings.chunk_overlap = 50\n case _:\n Settings.chunk_size = 1024\n Settings.chunk_overlap = 20\n\n if self._debug:\n self._token_counter = TokenCountingHandler(\n tokenizer=tiktoken.encoding_for_model(self._llm_model_name).encode\n )\n Settings.callback_manager = CallbackManager([self._token_counter])\n self._token_counts = {\n \"embedding\": 0,\n \"input\": 0,\n \"output\": 0,\n \"total\": 0,\n }\n\n match self._loader:\n case \"SimpleDirectoryReader\":\n loader = SimpleDirectoryReader(input_files=[self._file_path])\n paper_documents = loader.load_data()\n case \"PDFReader\":\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # pdf_loader = download_loader(\"PDFReader\")\n pdf_loader = PDFReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n case \"PDFMarker\":\n with supress_stdout():\n pdf_loader = PDFMarkerReader()\n paper_documents = pdf_loader.load_data(file=Path(self._file_path))\n\n other_docs = []\n if self._other_docs:\n for path in self._other_docs:\n loader = SimpleDirectoryReader(input_files=[path])\n other_docs += loader.load_data()\n\n documents = paper_documents + other_docs # type: ignore\n if self._git_data is not None:\n\n github_client = GithubClient(github_token)\n # Note: download_loader is deprecated in llama_index now\n # with supress_stdout():\n # download_loader(\"GithubRepositoryReader\")\n\n directory_filter: GitFilters | None = None\n file_ext_filter: GitFilters | None = None\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = filter\n elif filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = filter\n\n git_loader = GithubRepositoryReader(\n github_client=github_client,\n owner=self._git_data[\"user\"],\n repo=self._git_data[\"repo\"],\n filter_directories=(\n (directory_filter[\"value\"], directory_filter[\"filter_type\"])\n if directory_filter is not None\n else None\n ),\n filter_file_extensions=(\n (file_ext_filter[\"value\"], file_ext_filter[\"filter_type\"])\n if file_ext_filter is not None\n else None\n ),\n )\n\n github_documents = git_loader.load_data(branch=self._git_data[\"branch\"])\n documents += github_documents\n self._logger.info(\n f\"Loading repo `{self._git_data['repo']}` from user `{self._git_data['user']}`\"\n )\n self._documents = documents\n\n _chunk_fixed = (\n False if user_selections[\"chunking_config\"] == \"semantic\" else True\n )\n if self._vector_store == \"VectorStoreIndex\":\n if _chunk_fixed:\n self._index = VectorStoreIndex.from_documents(self._documents)\n else:\n if self._splitter is not None:\n nodes = self._splitter.build_semantic_nodes_from_documents(\n self._documents\n )\n self._index = VectorStoreIndex(nodes=nodes)\n\n retriever = VectorIndexRetriever(\n index=self._index, similarity_top_k=self._similarity_top_k * 3\n )\n response_synthesizer = get_response_synthesizer()\n rerank_postprocessor = SentenceTransformerRerank(\n top_n=self._similarity_top_k,\n keep_retrieval_score=True,\n )\n self._query_engine = RetrieverQueryEngine(\n retriever=retriever,\n response_synthesizer=response_synthesizer,\n 
node_postprocessors=[rerank_postprocessor],\n )\n\n if (\n self._debug\n and self._token_counts is not None\n and self._token_counter is not None\n ):\n self._token_counts[\n \"embedding\"\n ] += self._token_counter.total_embedding_token_count\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag.perform_query","title":"perform_query(domain) ","text":"Performs a query for a specific BCO domain. Parameters: Name Type Description Default domain DomainKey The domain being queried for. required Returns: Type Description str The generated domain. Source code in bcorag/bcorag.py def perform_query(self, domain: DomainKey) -> str:\n \"\"\"Performs a query for a specific BCO domain.\n\n Parameters\n ----------\n domain : DomainKey\n The domain being queried for.\n\n Returns\n -------\n str\n The generated domain.\n \"\"\"\n query_start_time = time.time()\n domain_prompt = self._domain_map[domain][\"prompt\"]\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is not None:\n dependency_prompt = f\"The {domain} domain is dependent on the {dependency} domain. Here is the {dependency} domain: {self.domain_content[dependency]}.\"\n domain_prompt += dependency_prompt\n query_prompt = QUERY_PROMPT.format(domain, domain_prompt)\n if self._domain_map[domain][\"top_level\"]:\n query_prompt += f\"\\n{SUPPLEMENT_PROMPT}\"\n\n response_object = self._query_engine.query(query_prompt)\n if isinstance(response_object, Response):\n response_object = Response(\n response=response_object.response,\n metadata=response_object.metadata,\n source_nodes=response_object.source_nodes,\n )\n else:\n self._logger.error(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n print(\n f\"Error parsing response object, expected type Response, got type `{type(response_object)}`.\"\n )\n misc_fns.graceful_exit(1)\n query_response = str(response_object.response)\n\n self.domain_content[domain] = query_response\n self.domain_content = add_source_nodes(\n domain_content=self.domain_content, nodes=response_object.source_nodes\n )\n\n source_str = \"\"\n for idx, source_node in enumerate(response_object.source_nodes):\n source_str += f\"\\n--------------- Source Node '{idx + 1}/{len(response_object.source_nodes)}' ---------------\"\n source_str += f\"\\nNode ID: '{source_node.node.node_id}'\"\n source_str += f\"\\nRerank Score: '{source_node.score}'\"\n source_str += f\"\\nMetadata String:\\n`{source_node.node.get_metadata_str()}`\"\n source_str += (\n f\"\\nMetadata Size: `{len(source_node.node.get_metadata_str())}`\"\n )\n source_str += f\"\\nContent Size: `{len(source_node.node.get_content())}`\"\n source_str += (\n f\"\\nRetrieved Text:\\n{source_node.node.get_content().strip()}\\n\"\n )\n source_str += \"\\n\"\n\n if self._debug:\n self._display_info(query_prompt, f\"QUERY PROMPT for the {domain} domain:\")\n self._token_counts[\"input\"] += self._token_counter.prompt_llm_token_count # type: ignore\n self._token_counts[\"output\"] += self._token_counter.completion_llm_token_count # type: ignore\n self._token_counts[\"total\"] += self._token_counter.total_llm_token_count # type: ignore\n self._token_counts[\"embedding\"] += self._token_counter.total_embedding_token_count # type: ignore\n self._display_info(self._token_counts, \"Updated token counts:\")\n self._display_info(source_str, \"Retrieval source(s):\")\n\n query_elapsed_time = time.time() - query_start_time\n self._process_output(\n domain, query_response, source_str, 
round(query_elapsed_time, 2)\n )\n\n return query_response\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag.choose_domain","title":"choose_domain(automatic_query=False) ","text":"Gets the user input for the domain the user wants to generate. Parameters: Name Type Description Default automatic_query bool Whether to automatically query after the user chooses a domain. If set to True this is a shortcut to calling bcorag.perform_query(choose_domain()) . False Returns: Type Description (DomainKey, str) | str | None If automatic query is set to True will return a tuple containing the domain name and the query response. If automatic query is False will return the user chosen domain. None is returned if the user chooses to exit. Source code in bcorag/bcorag.py def choose_domain(\n self, automatic_query: bool = False\n) -> Optional[tuple[DomainKey, str] | DomainKey]:\n \"\"\"Gets the user input for the domain the user wants to generate.\n\n Parameters\n ----------\n automatic_query : bool, optional\n Whether to automatically query after the user chooses a domain. If set to\n True this is a shortcut to calling `bcorag.perform_query(choose_domain())`.\n\n Returns\n -------\n (DomainKey, str) | str | None\n If automatic query is set to True will return a tuple containing the domain\n name and the query response. If automatic query is False will return the user\n chosen domain. None is returned if the user chooses to exit.\n \"\"\"\n domain_prompt = (\n \"Which domain would you like to generate? Supported domains are:\"\n )\n\n domain_user_prompt: DomainKey\n for domain_user_prompt in get_args(DomainKey):\n domain_prompt += (\n f\"\\n\\t{self._domain_map[domain_user_prompt]['user_prompt']}\"\n )\n domain_prompt += \"\\n\\tE[x]it\\n\"\n print(domain_prompt)\n\n domain_selection = None\n\n while True:\n\n domain_selection = input(\"> \").strip().lower()\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n if (\n domain_selection == domain\n or domain_selection == self._domain_map[domain][\"code\"]\n ):\n domain_selection = domain\n break\n else:\n if domain_selection == \"exit\" or domain_selection == \"x\":\n if self._debug:\n self._display_info(\n \"User selected 'exit' on the domain selection step.\"\n )\n return None\n else:\n if self._debug:\n self._display_info(\n f\"User entered unrecognized input '{domain_selection}' on domain chooser step.\"\n )\n print(\n f\"Unrecognized input {domain_selection} entered, please try again.\"\n )\n continue\n if not self._check_dependencies(domain_selection):\n print(\n f\"Dependencies for the `{domain_selection}` domain are not satisfied. Please choose another domain.\"\n )\n continue\n\n break\n\n if automatic_query:\n if self._debug:\n self._display_info(\n f\"Automatic query called on domain: '{domain_selection}'.\"\n )\n return domain_selection, self.perform_query(domain_selection)\n if self._debug:\n self._display_info(\n f\"User chose '{domain_selection}' domain with no automatic query.\"\n )\n return domain_selection\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._process_output","title":"_process_output(domain, response, source_str, elapsed_time) ","text":"Attempts to serialize the response into a JSON object and dumps the output. Also dumps the raw text regardless if JSON serialization was successful. The file dumps are dumped to the output directory located in the root of this repo. Keeps a TSV file to track all of the domain outputs and what parameter set generated the results. 
Note: This function is getting long with some redundancy, it should be re-written at some point. It works, but is ugly. Parameters: Name Type Description Default domain DomainKey The domain the response is for. required response str The generated response to dump. required source_str str The formatted source string for the query. required elapsed_time float The query generation elapsed time. required Source code in bcorag/bcorag.py def _process_output(\n self, domain: DomainKey, response: str, source_str: str, elapsed_time: float\n):\n \"\"\"Attempts to serialize the response into a JSON object and dumps the output.\n Also dumps the raw text regardless if JSON serialization was successful. The\n file dumps are dumped to the `output` directory located in the root of this\n repo. Keeps a TSV file to track all of the domain outputs and what parameter\n set generated the results.\n\n Note: This function is getting long with some redundancy, it should be re-written\n at some point. It works, but is ugly.\n\n Parameters\n ----------\n domain : DomainKey\n The domain the response is for.\n response : str\n The generated response to dump.\n source_str : str\n The formatted source string for the query.\n elapsed_time : float\n The query generation elapsed time.\n \"\"\"\n\n def dump_json_response(fp: str, response_string: str) -> bool:\n if response_string.startswith(\"```json\\n\"):\n response_string = response_string.replace(\"```json\\n\", \"\").replace(\n \"```\", \"\"\n )\n self._display_info(\n response_string, f\"QUERY RESPONSE for the `{domain}` domain:\"\n )\n try:\n response_json = json.loads(response_string)\n if misc_fns.write_json(fp, response_json):\n self._logger.info(\n f\"Succesfully serialized JSON response for the `{domain}` domain.\"\n )\n return True\n except Exception as e:\n self._logger.error(\n f\"Failed to serialize the JSON response for the `{domain}` domain.\\n{e}\"\n )\n return False\n\n generated_dir = os.path.join(self._output_path_root, \"generated_domains\")\n misc_fns.check_dir(generated_dir)\n\n txt_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.txt\"\n )\n json_file_unindexed = os.path.join(\n generated_dir, f\"{domain}-(index)-{self._parameter_set_hash}.json\"\n )\n source_file_unindexed = os.path.join(\n self._output_path_root,\n \"reference_sources\",\n f\"{domain}-(index)-{self._parameter_set_hash}.txt\",\n )\n\n output_map_json = misc_fns.load_output_tracker(\n os.path.join(self._output_path_root, \"output_map.json\")\n )\n\n # Create a new output file if one doesn't exist\n if output_map_json is None:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter: OutputTrackerGitFilter | None = None\n file_ext_filter: OutputTrackerGitFilter | None = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n 
directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=self._git_data[\"user\"] if self._git_data is not None else None,\n git_repo=self._git_data[\"repo\"] if self._git_data is not None else None,\n git_branch=(\n self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n )\n\n output_data = default_output_tracker_file()\n output_data[domain].append(domain_entry)\n\n # update output map\n else:\n\n domain_map_entries = output_map_json[domain]\n\n for domain_map_entry in domain_map_entries:\n\n # found the collision entry\n if domain_map_entry[\"hash_str\"] == self._parameter_set_hash:\n\n new_index = domain_map_entry[\"entries\"][\"curr_index\"] + 1\n domain_map_entry[\"entries\"][\"curr_index\"] = new_index\n\n txt_file = txt_file_unindexed.replace(\"(index)\", str(new_index))\n json_file = json_file_unindexed.replace(\"(index)\", str(new_index))\n source_file = source_file_unindexed.replace(\n \"(index)\", str(new_index)\n )\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n new_index,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n domain_map_entry[\"entries\"][\"runs\"].append(run_entry)\n\n break\n\n # first time parameter set run (loop didn't break)\n else:\n\n txt_file = txt_file_unindexed.replace(\"(index)\", \"1\")\n json_file = json_file_unindexed.replace(\"(index)\", \"1\")\n source_file = source_file_unindexed.replace(\"(index)\", \"1\")\n if not dump_json_response(json_file, response):\n json_file = \"NA\"\n\n run_entry = create_output_tracker_runs_entry(\n 1,\n misc_fns.create_timestamp(),\n txt_file,\n json_file,\n source_file,\n elapsed_time,\n )\n\n directory_filter = None\n file_ext_filter = None\n if self._git_data is not None:\n for filter in self._git_data[\"filters\"]:\n if filter[\"filter\"] == GitFilter.FILE_EXTENSION:\n file_ext_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n elif filter[\"filter\"] == GitFilter.DIRECTORY:\n directory_filter = create_output_tracker_git_filter(\n (\"include\", filter[\"value\"])\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else (\"exclude\", filter[\"value\"])\n )\n\n param_set = create_output_tracker_param_set(\n loader=self._loader,\n vector_store=self._vector_store,\n llm=self._llm_model_name,\n embedding_model=self._embed_model_name,\n similarity_top_k=self._similarity_top_k,\n chunking_config=self._chunking_config,\n git_user=(\n self._git_data[\"user\"] if self._git_data is not None else None\n ),\n git_repo=(\n self._git_data[\"repo\"] if self._git_data is not None else None\n ),\n git_branch=(\n 
self._git_data[\"branch\"] if self._git_data is not None else None\n ),\n directory_git_filter=directory_filter,\n file_ext_git_filter=file_ext_filter,\n other_docs=self._other_docs,\n )\n\n instance_entry = create_output_tracker_entry(1, param_set, [run_entry])\n\n domain_entry = create_output_tracker_domain_entry(\n self._parameter_set_hash, instance_entry\n )\n\n domain_map_entries.append(domain_entry)\n\n output_data = output_map_json\n\n misc_fns.dump_string(txt_file, response)\n misc_fns.dump_string(source_file, source_str)\n # writes the output mapping files\n misc_fns.write_json(\n os.path.join(self._output_path_root, \"output_map.json\"), output_data\n )\n misc_fns.dump_output_file_map_tsv(\n os.path.join(self._output_path_root, \"output_map.tsv\"), output_data\n )\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._display_info","title":"_display_info(info, header=None) ","text":"If in debug mode, handles the debug info output to the log file. Parameters: Name Type Description Default info dict | list | str | UserSelections | None The object to log. required header str or None The optional header to log before the info. None Source code in bcorag/bcorag.py def _display_info(\n self,\n info: Optional[dict | list | str | UserSelections],\n header: Optional[str] = None,\n):\n \"\"\"If in debug mode, handles the debug info output to the log file.\n\n Parameters\n ----------\n info : dict | list | str | UserSelections | None\n The object to log.\n header : str or None\n The optional header to log before the info.\n \"\"\"\n log_str = header if header is not None else \"\"\n if isinstance(info, dict):\n for key, value in info.items():\n log_str += f\"\\n\\t{key}: '{value}'\"\n elif isinstance(info, str):\n log_str += f\"{info}\" if header is None else f\"\\n{info}\"\n self._logger.info(log_str)\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._user_selection_hash","title":"_user_selection_hash(params) ","text":"Generates an MD5 hash of the parameter set. Parameters: Name Type Description Default params UserSelections The user configuration selections. required Returns: Type Description str The hexidecimal MD5 hash. Source code in bcorag/bcorag.py def _user_selection_hash(self, params: UserSelections) -> str:\n \"\"\"Generates an MD5 hash of the parameter set.\n\n Parameters\n ----------\n params : UserSelections\n The user configuration selections.\n\n Returns\n -------\n str\n The hexidecimal MD5 hash.\n \"\"\"\n hash_list = []\n hash_list.append(params[\"llm\"])\n hash_list.append(params[\"embedding_model\"])\n hash_list.append(params[\"vector_store\"])\n hash_list.append(params[\"loader\"])\n hash_list.append(str(params[\"similarity_top_k\"]))\n hash_list.append(params[\"chunking_config\"])\n\n if params[\"git_data\"] is not None:\n\n hash_list.append(params[\"git_data\"][\"user\"])\n hash_list.append(params[\"git_data\"][\"repo\"])\n hash_list.append(params[\"git_data\"][\"branch\"])\n\n for filter in params[\"git_data\"][\"filters\"]:\n\n filter_type = (\n \"include\"\n if filter[\"filter_type\"]\n == GithubRepositoryReader.FilterType.INCLUDE\n else \"exclude\"\n )\n filter_str = f\"{filter_type}-{filter['value']}\"\n hash_list.append(filter_str)\n\n sorted(hash_list)\n hash_str = \"_\".join(hash_list)\n hash_hex = md5(hash_str.encode(\"utf-8\")).hexdigest()\n return hash_hex\n "},{"location":"bcorag/#bcorag.bcorag.BcoRag._check_dependencies","title":"_check_dependencies(domain) ","text":"Checks a domain's dependencies. 
Parameters: Name Type Description Default domain DomainKey The domain to check. required Returns: Type Description bool True if dependencies are satisfied, False otherwise. Source code in bcorag/bcorag.py def _check_dependencies(self, domain: DomainKey) -> bool:\n \"\"\"Checks a domain's dependencies.\n\n Parameters\n ----------\n domain : DomainKey\n The domain to check.\n\n Returns\n -------\n bool\n True if dependencies are satisfied, False otherwise.\n \"\"\"\n for dependency in self._domain_map[domain][\"dependencies\"]:\n if self.domain_content[dependency] is None:\n print(\n f\"Error: {dependency.title()} domain must be generated before the {domain.title()} domain.\"\n )\n return False\n return True\n "},{"location":"bcorag/#bcorag.bcorag.supress_stdout","title":"supress_stdout() ","text":"Context manager that redirects stdout and stderr to devnull. Source code in bcorag/bcorag.py @contextmanager\ndef supress_stdout():\n \"\"\"Context manager that redirects stdout and stderr to devnull.\"\"\"\n with open(os.devnull, \"w\") as f, redirect_stdout(f):\n yield\n "},{"location":"error-frame/","title":"Error Frame","text":""},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame","title":"ErrorFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the error evaluation frame. Source code in evaluator/frontend/components/evaluation_frames/error_frame.py class ErrorFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the error evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_err_label = ctk.CTkLabel(\n master=self, text=\"Error Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_err_label.grid(\n row=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n )\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Inferred Knowledge Error\",\n variable=self.inf_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.inf_checkbox.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"External Knowledge Error\",\n variable=self.ext_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.ext_checkbox.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"JSON Formatting Error\",\n variable=self.json_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.json_checkbox.grid(\n row=2,\n column=1,\n 
padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Other Error\",\n variable=self.other_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.other_err_checkbox.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.error_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.error_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.error_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.error_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox.configure(variable=self.inf_err_var)\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox.configure(variable=self.ext_err_var)\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox.configure(variable=self.json_err_var)\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox.configure(variable=self.other_err_var)\n\n self.error_notes.delete(0.0, \"end\")\n self.error_notes.insert(\n 0.0, self.error_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> ErrorEval:\n \"\"\"Returns the error evaluations.\n\n Returns\n -------\n ErrorEval\n The error evaluation results.\n \"\"\"\n error_eval = create_error_val(\n inf_err=self.inf_err_var.get(),\n ext_err=self.ext_err_var.get(),\n json_err=self.json_err_var.get(),\n other_err=self.other_err_var.get(),\n notes=self.error_notes.get(0.0, \"end\"),\n )\n return error_eval\n "},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/evaluation_frames/error_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_err_label = ctk.CTkLabel(\n master=self, text=\"Error Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_err_label.grid(\n row=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n )\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Inferred Knowledge Error\",\n variable=self.inf_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.inf_checkbox.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"External Knowledge Error\",\n variable=self.ext_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.ext_checkbox.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"JSON Formatting Error\",\n variable=self.json_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.json_checkbox.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 2,\n sticky=\"w\",\n )\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Other Error\",\n variable=self.other_err_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.other_err_checkbox.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.error_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.error_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.error_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.error_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. 
required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/error_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.error_eval = self.run[\"eval_data\"][\"error_eval\"]\n\n self.inf_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"inferred_knowledge_error\",\n EVAL_DEFAULTS[\"inferred_knowledge_error\"],\n )\n )\n )\n self.inf_checkbox.configure(variable=self.inf_err_var)\n\n self.ext_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"external_knowledge_error\",\n EVAL_DEFAULTS[\"external_knowledge_error\"],\n )\n )\n )\n self.ext_checkbox.configure(variable=self.ext_err_var)\n\n self.json_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\n \"json_format_error\", EVAL_DEFAULTS[\"json_format_error\"]\n )\n )\n )\n self.json_checkbox.configure(variable=self.json_err_var)\n\n self.other_err_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.error_eval.get(\"other_error\", EVAL_DEFAULTS[\"other_error\"])\n )\n )\n self.other_err_checkbox.configure(variable=self.other_err_var)\n\n self.error_notes.delete(0.0, \"end\")\n self.error_notes.insert(\n 0.0, self.error_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"error-frame/#evaluator.frontend.components.evaluation_frames.error_frame.ErrorFrame.get_results","title":"get_results() ","text":"Returns the error evaluations. Returns: Type Description ErrorEval The error evaluation results. Source code in evaluator/frontend/components/evaluation_frames/error_frame.py def get_results(self) -> ErrorEval:\n \"\"\"Returns the error evaluations.\n\n Returns\n -------\n ErrorEval\n The error evaluation results.\n \"\"\"\n error_eval = create_error_val(\n inf_err=self.inf_err_var.get(),\n ext_err=self.ext_err_var.get(),\n json_err=self.json_err_var.get(),\n other_err=self.other_err_var.get(),\n notes=self.error_notes.get(0.0, \"end\"),\n )\n return error_eval\n "},{"location":"evaluation-app/","title":"Evaluation Application","text":" - Starting the Application
- View Page
- Sidebar
- Tab View
- Compare JSON
- Source Nodes
- Parameter Set
- Evaluate
- Score Evaluation
- Error Evaluation
- Reference Evaluation
- General Evaluation
- Miscellaneous Evaluation
In order to accurately and consistently evaluate generated domains, such as those created with parameter searches, the BcoRag tool has an accompanying evaluation application that provides a more user-friendly GUI. "},{"location":"evaluation-app/#starting-the-application","title":"Starting the Application","text":"The evaluation application can be run from the main.py entrypoint using the evaluate positional argument like so: (env) python main.py evaluate\n On startup, you will be presented with the login screen. The login mechanism is a naive first and last name login that just keeps track of which domains you have already evaluated and what scores you submitted for each domain. If you are a new user, you'll be prompted to start from the beginning. If you are a returning user, you will be prompted whether you want to start from the beginning or resume from your last session. "},{"location":"evaluation-app/#view-page","title":"View Page","text":"The view page consists of the tab view and a sidebar. "},{"location":"evaluation-app/#sidebar","title":"Sidebar","text":"The top of the sidebar contains the navigation buttons. The Previous button will navigate back one generated domain and the Next button will navigate to the next generated domain. If you are at the first run, the Previous button will be greyed out, and similarly, when you are at the last available run, the Next button will be greyed out. If you have already submitted an evaluation for that particular run, a red notice label will appear below the run counter showing the message Already Evaluated . The Save button will save your evaluation results to disk. The Exit button will exit the application. At the bottom of the sidebar you can switch between Light and Dark mode. Underneath the appearance dropdown there is a scaling dropdown for UI scaling. "},{"location":"evaluation-app/#tab-view","title":"Tab View","text":""},{"location":"evaluation-app/#compare-json","title":"Compare JSON","text":"The compare JSON tab allows you to inspect the generated domain against a human curated domain for the same paper. If the JSON serialization failed after generating the domain, the raw text file will be displayed with a note at the top saying Failed JSON serialization. Raw text output: . "},{"location":"evaluation-app/#source-nodes","title":"Source Nodes","text":"The source node tab will display the nodes that were retrieved during the retrieval process and sent as context with the domain query. Depending on the similarity_top_k parameter chosen for the run, the number of reference nodes will be marked with delimiting lines in the format of: ----------------- Source Node x/n -----------------\n "},{"location":"evaluation-app/#parameter-set","title":"Parameter Set","text":"The parameter set tab will display the exact parameter set that was used to generate the target domain. "},{"location":"evaluation-app/#evaluate","title":"Evaluate","text":"The evaluate tab is where reviewers will input their evaluation ratings. All evaluation sections have corresponding Notes sections that allow for free text notes regarding that evaluation category. All numeric segmented buttons have a range from 0 to 2 (with 0 being the worst, 1 being satisfactory, and 2 being the best score). The default score of -1 is just a placeholder value used to filter out potentially unfinished or erroneously prematurely submitted evaluations. The bottom right Submit button will save the evaluation to the session in memory. 
If you click on the Next or Previous buttons before submitting the evaluation, it will be lost. "},{"location":"evaluation-app/#score-evaluation","title":"Score Evaluation","text":"The score evaluation frame contains evaluation options for the BCO domain score. The Score label displays the calculated BCO score returned from the BCO score API endpoint (on API error a default value of -1.0 is shown). The Score version label displays the score version according to the BCO score API endpoint (on API error a default value of 0.0 is shown). Depending on the quality of the generated domain, the evaluator should use the segmented button to mark whether the score should actually be higher, lower, or about right. "},{"location":"evaluation-app/#error-evaluation","title":"Error Evaluation","text":"The error evaluation frame allows the user to indicate any errors in the generated domain. The types of errors are: - Inferred Knowledge Errors: Fields that require inferred knowledge can result in undefined behavior. For example, multiple domains make use of the uri object that is defined in the top-level BCO JSON schema. The uri object has a field for access_time, which expects a fully JSON-compliant
date-time . An exact timestamp is very unlikely to be explicitly listed in the source material. In early testing, the tool seems to use a default value of 2023-11-01T12:00:00Z for these fields. - External Knowledge Error: External knowledge errors result in non-specific information when the field requires knowledge from external dependencies. A common scenario is for the authors of the paper to include links to the Github repository that contains the source code and corresponding input/output files. In the Description domain, each pipeline step includes a target field for output files generated by the particular step. Since the specific locations of the scripts and output files are usually not explicitly described in the paper, the tool will fill the output file field with a generated link to the repository, not the specific location of the file within the repository. For example, a link to the repository such as https://github.com/biocompute-objects/bco-rag/tree/main, versus a link to the specific file within the repository such as https://github.com/biocompute-objects/bco-rag/blob/main/docs/evaluation_app.md.
- JSON Formatting Error: JSON formatting errors occur when the generated domain either 1) is not valid JSON or 2) does not validate against the BCO JSON schema.
- Other Error: Any other error not covered by the previous three categories.
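For reference, these four categories correspond to the fields of the ErrorEval type documented on the Types page, and the backend assembles that record through the create_error_val constructor (its source is shown later in this documentation). Below is a minimal sketch, assuming the package is importable as evaluator.backend.custom_types and using illustrative checkbox values only:

from evaluator.backend.custom_types import create_error_val

# Illustrative values only; in the app these come from the CTk checkbox
# variables, which use the "on"/"off" strings that create_error_val casts
# to booleans internally.
error_eval = create_error_val(
    inf_err="on",    # an inferred knowledge error was observed (e.g. a defaulted access_time)
    ext_err="off",
    json_err="off",
    other_err="off",
    notes="access_time appears to be a default timestamp rather than a value from the paper.",
)

The resulting ErrorEval dictionary has the same shape as the value returned by the ErrorFrame's get_results method.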
"},{"location":"evaluation-app/#reference-evaluation","title":"Reference Evaluation","text":"The reference evaluation frame allows for the user to rate the reference nodes. The reference nodes are arguably the most important part of the domain generation process as they provide the LLM with the context required to complete the domain request. Ideally, all reference nodes should be relevant to the domain purpose and should logically make sense. For example, the Usability domain \"is a plain language description of what was done in the workflow\". Most often, much of that information would be present in the paper abstract, and not in the paper citations section. "},{"location":"evaluation-app/#general-evaluation","title":"General Evaluation","text":"The general evaluation frame allows for the user to rate the generated domain directly. Ideally, generated domains should be relevant to the domain purpose, human readable, and comprehensible by someone with less knowledge of the source material. "},{"location":"evaluation-app/#miscellaneous-evaluation","title":"Miscellaneous Evaluation","text":"The miscellaneous evaluation frame allows for the user to include some metadata for the evaluation not relating directly to the generated domain. In evaluation of scoring data, it should be noted how confident and familiar with the source material the evaluator was. In doing so, isolating high quality reviews will be clearer. The user can also evaluate the quality of the human curated domain. "},{"location":"evaluator-custom-types/","title":"Types","text":"Handles the custom types for the App backend. Type Aliases ScoreEvalLiteral = Literal[\"Lower\", \"About right\", \"Higher\"] RunStateKey = Literal[\"paper\", \"domain\", \"generated_domain\", \"score\", \"score_version\", \"generated_file_path\", \"human_curated_domain\", \"param_set\", \"reference_nodes\", \"run_index\", \"total_runs\", \"already_evaluated\", \"logger\", \"eval_data\"] AppStateKey = Literal[\"logger\", \"results_dir_path\", \"bco_results_file_name\", \"bco_results_data\", \"user_results_file_name\", \"user_results_data\", \"users_file_name\", \"users_data\", \"generated_directory_paths\", \"padding\", \"font\", \"user_hash\"] "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.ConfigData","title":"ConfigData ","text":" Bases: TypedDict Defines the schema for the JSON config data. Attributes: Name Type Description logger_path str The path to the logger. logger_name str The name of the logger. generated_output_dir_path str The filepath to the generated domains directory to evaluate. glob_pattern str The glob patterns to traverse the generated output directory. results_dir_path str The path to the directory to dump the evaluation results. ignore_files list[str] Identifiers to ignore certain files (used like if ignore_files[x] in filename ). bco_results_file_name str The file name for the BCO results file. user_results_file_name str The file name for the user evaluations results file. users_file_name str The file name for the users file. padding int The default root padding used throughout the frontend components. font str The default font used throughout the frontend components. 
Source code in evaluator/backend/custom_types.py class ConfigData(TypedDict):\n \"\"\"Defines the schema for the JSON config data.\n\n Attributes\n ----------\n logger_path : str\n The path to the logger.\n logger_name : str\n The name of the logger.\n generated_output_dir_path : str\n The filepath to the generated domains directory to evaluate.\n glob_pattern : str\n The glob patterns to traverse the generated output directory.\n results_dir_path : str\n The path to the directory to dump the evaluation results.\n ignore_files : list[str]\n Identifiers to ignore certain files (used like `if ignore_files[x] in filename`).\n bco_results_file_name : str\n The file name for the BCO results file.\n user_results_file_name : str\n The file name for the user evaluations results file.\n users_file_name : str\n The file name for the users file.\n padding : int\n The default root padding used throughout the frontend components.\n font : str\n The default font used throughout the frontend components.\n \"\"\"\n\n logger_path: str\n logger_name: str\n generated_output_dir_path: str\n glob_pattern: str\n results_dir_path: str\n ignore_files: list[str]\n bco_results_file_name: str\n user_results_file_name: str\n users_file_name: str\n padding: int\n font: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.ScoreEval","title":"ScoreEval ","text":" Bases: TypedDict Score evaluation results. Attributes: Name Type Description eval ScoreEvalLiteral The score eval literal. eval_code int The casted score eval literal. notes str Any additional notes from the evaluator regarding the score evaluation. Source code in evaluator/backend/custom_types.py class ScoreEval(TypedDict):\n \"\"\"Score evaluation results.\n\n Attributes\n ----------\n eval : ScoreEvalLiteral\n The score eval literal.\n eval_code : int\n The casted score eval literal.\n notes : str\n Any additional notes from the evaluator regarding the score evaluation.\n \"\"\"\n\n eval: ScoreEvalLiteral\n eval_code: int\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.ErrorEval","title":"ErrorEval ","text":" Bases: TypedDict Error evaluation data. Attributes: Name Type Description inferred_knowledge_error bool Whether there was an inferred knowledge error. external_knowledge_error bool Whether there was an external knowledge error. json_format_error bool Whether there was a JSON formatting error. other_error bool Whether there was any other error. notes str Any additional notes from the evaluator regarding the error evaluation. Source code in evaluator/backend/custom_types.py class ErrorEval(TypedDict):\n \"\"\"Error evaluation data.\n\n Attributes\n ----------\n inferred_knowledge_error: bool\n Whether there was an inferred knowledge error.\n external_knowledge_error: bool\n Whether there was an external knowledge error.\n json_format_error: bool\n Whether there was a JSON formatting error.\n other_error: bool\n Whether there was any other error.\n notes: str\n Any additional notes from the evaluator regarding the error evaluation.\n \"\"\"\n\n inferred_knowledge_error: bool\n external_knowledge_error: bool\n json_format_error: bool\n other_error: bool\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.RefereceEval","title":"RefereceEval ","text":" Bases: TypedDict Reference evaluation data. Attributes: Name Type Description reference_relevancy int Indicates how relevant the reference nodes were to the domain. 
top_reference_retrieval bool Whether the top node retrieved was the most relevant. notes str Any additional notes from the evaluator regarding the reference evaluation. Source code in evaluator/backend/custom_types.py class RefereceEval(TypedDict):\n \"\"\"Reference evaluation data.\n\n Attributes\n ----------\n reference_relevancy : int\n Indicates how relevant the reference nodes were to the domain.\n top_reference_retrieval : bool\n Whether the top node retrieved was the most relevant.\n notes : str\n Any additional notes from the evaluator regarding the reference evaluation.\n \"\"\"\n\n reference_relevancy: int\n top_reference_retrieval: bool\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.GeneralEval","title":"GeneralEval ","text":" Bases: TypedDict General evaluation data. Attributes: Name Type Description relevancy int Indicates how relevant the generated domain was. readability int Indicates how readable the generated domain was. reproducibility int Indicates how reproduceable the domain steps are. confidence_rating int Indicates how confident the evaluator was in their evaluation. notes str Any additional notes from the evaluator regarding the general evaluation. Source code in evaluator/backend/custom_types.py class GeneralEval(TypedDict):\n \"\"\"General evaluation data.\n\n Attributes\n ----------\n relevancy : int\n Indicates how relevant the generated domain was.\n readability : int\n Indicates how readable the generated domain was.\n reproducibility : int\n Indicates how reproduceable the domain steps are.\n confidence_rating : int\n Indicates how confident the evaluator was in their evaluation.\n notes : str\n Any additional notes from the evaluator regarding the general evaluation.\n \"\"\"\n\n relevancy: int\n readability: int\n reproducibility: int\n confidence_rating: int\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.MiscEval","title":"MiscEval ","text":" Bases: TypedDict Miscellaneous evaluation data. Attributes: Name Type Description human_domain_rating int The high level human domain rating for the generated domain. evaluator_confidence_rating int Indicates how confident the evaluator is in their evaluation. evaluator_familiarity_level int Indicates how familiar the evaluator is with the paper content. notes str Any additional notes from the evaluator regarding the miscellaneous evaluation. Source code in evaluator/backend/custom_types.py class MiscEval(TypedDict):\n \"\"\"Miscellaneous evaluation data.\n\n Attributes\n ----------\n human_domain_rating : int\n The high level human domain rating for the generated domain.\n evaluator_confidence_rating : int\n Indicates how confident the evaluator is in their evaluation.\n evaluator_familiarity_level: int\n Indicates how familiar the evaluator is with the paper content.\n notes : str\n Any additional notes from the evaluator regarding the miscellaneous evaluation.\n \"\"\"\n\n human_domain_rating: int\n evaluator_confidence_rating: int\n evaluator_familiarity_level: int\n notes: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.EvalData","title":"EvalData ","text":" Bases: TypedDict Full evaluation data. 
Attributes: Name Type Description score_eval ScoreEval error_eval ErrorEval reference_eval RefereceEval general_eval GeneralEval misc_eval MiscEval Source code in evaluator/backend/custom_types.py class EvalData(TypedDict):\n \"\"\"Full evaluation data.\n\n Attributes\n ----------\n score_eval: ScoreEval\n error_eval: ErrorEval\n reference_eval: RefereceEval\n general_eval: GeneralEval\n misc_eval: MiscEval\n \"\"\"\n\n score_eval: ScoreEval\n error_eval: ErrorEval\n reference_eval: RefereceEval\n general_eval: GeneralEval\n misc_eval: MiscEval\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.RunState","title":"RunState ","text":" Bases: TypedDict Holds the data for the current run being evaluated. Attributes: Name Type Description paper str The paper for the current run state. domain str The domain the current run is for. generated_domain str The generated domain string for the current run. score float The score for the current run (from the BCO score API). score_version float The score version for the score (from the BCO score API). generated_file_path str The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file). human_curated_domain str The human curated domain string. param_set str The parameter set string for the run. reference_nodes str The retrieved reference node values. run_index int The run index. total_runs int The total number of runs to potentially evaluate. already_evaluated bool Whether the user has already evaluated this run. logger Logger The logger for the App. eval_data EvalData The evaluation data for the run. Source code in evaluator/backend/custom_types.py class RunState(TypedDict):\n \"\"\"Holds the data for the current run being evaluated.\n\n Attributes\n ----------\n paper: str\n The paper for the current run state.\n domain: str\n The domain the current run is for.\n generated_domain: str\n The generated domain string for the current run.\n score: float\n The score for the current run (from the BCO score API).\n score_version: float\n The score version for the score (from the BCO score API).\n generated_file_path: str\n The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file).\n human_curated_domain: str\n The human curated domain string.\n param_set: str\n The parameter set string for the run.\n reference_nodes: str\n The retrieved reference node values.\n run_index: int\n The run index.\n total_runs: int\n The total number of runs to potentially evaluate.\n already_evaluated: bool\n Whether the user has already evaluated this run.\n logger: Logger\n The logger for the App.\n eval_data: EvalData\n The evaluation data for the run.\n \"\"\"\n\n paper: str\n domain: str\n generated_domain: str\n score: float\n score_version: float\n generated_file_path: str\n human_curated_domain: str\n param_set: str\n reference_nodes: str\n run_index: int\n total_runs: int\n already_evaluated: bool\n logger: Logger\n eval_data: EvalData\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.AppAttributes","title":"AppAttributes ","text":" Bases: TypedDict Handles the app initialization attributes. Attributes: Name Type Description logger Logger The App logger. results_dir_path str The path to the directory to dump the evaluation results. bco_results_file_name str The file name for the BCO results file. bco_results_data dict The aggregates BCO results data. user_results_file_name str The file name for the user evaluations results file. 
user_results_data dict[str, dict[str, EvalData | None] | None] The user evaluation results. users_file_name str The file name for the users file. users_data dict The users data. generated_output_dir_root str The root filepath to the generated domains directory to evaluate. generated_directory_paths list[str] List of directory paths for all the papers. padding int The default root padding to use for all the frontend components. font str The default font to use for all the frontend components. Source code in evaluator/backend/custom_types.py class AppAttributes(TypedDict):\n \"\"\"Handles the app initialization attributes.\n\n Attributes\n ----------\n logger : Logger\n The App logger.\n results_dir_path : str\n The path to the directory to dump the evaluation results.\n bco_results_file_name : str\n The file name for the BCO results file.\n bco_results_data: dict\n The aggregates BCO results data.\n user_results_file_name: str\n The file name for the user evaluations results file.\n user_results_data: dict[str, dict[str, EvalData | None] | None]\n The user evaluation results.\n users_file_name: str\n The file name for the users file.\n users_data: dict\n The users data.\n generated_output_dir_root: str\n The root filepath to the generated domains directory to evaluate.\n generated_directory_paths: list[str]\n List of directory paths for all the papers.\n padding: int\n The default root padding to use for all the frontend components.\n font: str\n The default font to use for all the frontend components. \n \"\"\"\n\n logger: Logger\n results_dir_path: str\n bco_results_file_name: str\n bco_results_data: dict\n user_results_file_name: str\n user_results_data: dict[str, dict[str, EvalData | None] | None]\n users_file_name: str\n users_data: dict\n generated_output_dir_root: str\n generated_directory_paths: list[str]\n padding: int\n font: str\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.AppState","title":"AppState ","text":" Bases: AppAttributes Holds the application state information, essentially just the attributes plus the current user hash, new user flag and start from last session boolean. Attributes: Name Type Description user_hash str The user hash. new_user bool New user flag. resume_session bool Resume session flag. Source code in evaluator/backend/custom_types.py class AppState(AppAttributes):\n \"\"\"Holds the application state information, essentially\n just the attributes plus the current user hash, new user\n flag and start from last session boolean.\n\n Attributes\n ----------\n user_hash: str\n The user hash.\n new_user: bool\n New user flag.\n resume_session: bool\n Resume session flag.\n \"\"\"\n\n user_hash: str\n new_user: bool\n resume_session: bool\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.cast_checkbox","title":"cast_checkbox(val) ","text":"Cast checkbox string to boolean (assuming checkbox values are on , off ). Parameters: Name Type Description Default val str The value to cast. required Returns: Type Description bool The casted checkbox value. 
Source code in evaluator/backend/custom_types.py def cast_checkbox(val: str) -> bool:\n \"\"\"Cast checkbox string to boolean (assuming checkbox values are `on`, `off`).\n\n Parameters\n ----------\n val : str\n The value to cast.\n\n Returns\n -------\n bool\n The casted checkbox value.\n \"\"\"\n val = val.strip().lower()\n if val == \"on\":\n return True\n elif val == \"off\":\n return False\n raise ValueError(f\"Error casting `{val}` to bool.\")\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.reverse_cast_checkbox","title":"reverse_cast_checkbox(err) ","text":"Reverse cast checkbox bool to string (assuming checkbox values are on , off ). Parameters: Name Type Description Default err bool The value to revserse cast. required Returns: Type Description str The reverse casted value. Source code in evaluator/backend/custom_types.py def reverse_cast_checkbox(err: bool) -> str:\n \"\"\"Reverse cast checkbox bool to string (assuming checkbox values are `on`, `off`).\n\n Parameters\n ----------\n err : bool\n The value to revserse cast.\n\n Returns\n -------\n str\n The reverse casted value.\n \"\"\"\n if err:\n return \"on\"\n else:\n return \"off\"\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_score_eval","title":"create_score_eval(eval, notes) ","text":"Constructor for the ScoreEval TypedDict. The score eval literal will be automatically casted to the eval code. Parameters: Name Type Description Default eval ScoreEvalLiteral The score eval literal. required notes str Any additional notes from the evaluator regarding the score evaluation. required Returns: Type Description ScoreEval Source code in evaluator/backend/custom_types.py def create_score_eval(eval: ScoreEvalLiteral, notes: str) -> ScoreEval:\n \"\"\"Constructor for the ScoreEval TypedDict. The score eval literal\n will be automatically casted to the eval code.\n\n Parameters\n ----------\n eval : ScoreEvalLiteral\n The score eval literal.\n notes : str\n Any additional notes from the evaluator regarding the score evaluation.\n\n Returns\n -------\n ScoreEval\n \"\"\"\n eval_str = str(eval.strip().lower())\n\n eval_code = 0\n match eval_str:\n case \"lower\":\n eval_code = -1\n case \"higher\":\n eval_code = 1\n\n return_data: ScoreEval = {\n \"eval\": eval,\n \"eval_code\": eval_code,\n \"notes\": notes.strip(),\n }\n\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.cast_score_eval","title":"cast_score_eval(score_eval_str) ","text":"Cast a string to ScoreEvalLiteral (if possible). Parameters: Name Type Description Default score_eval_str str The string to cast. required Returns: Type Description ScoreEvalLiteral Source code in evaluator/backend/custom_types.py def cast_score_eval(score_eval_str: str) -> ScoreEvalLiteral:\n \"\"\"Cast a string to ScoreEvalLiteral (if possible).\n\n Parameters\n ----------\n score_eval_str : str\n The string to cast.\n\n Returns\n -------\n ScoreEvalLiteral\n \"\"\"\n score_eval_str = score_eval_str.strip().lower()\n match score_eval_str:\n case \"lower\":\n return \"Lower\"\n case \"about right\":\n return \"About right\"\n case \"higher\":\n return \"Higher\"\n raise ValueError(f\"Error casting `{score_eval_str}` to ScoreEvalLiteral.\")\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_error_val","title":"create_error_val(inf_err, ext_err, json_err, other_err, notes) ","text":"Constructor for the ErrorEval TypedDict. 
Parameters: Name Type Description Default inf_err bool | str The inferred knowledge error indicator. required ext_err bool | str The external knowledge error indicator. required json_err bool | str The JSON formattign error indicator. required notes str Any additional notes from the evaluator regarding the error evaluation. required Source code in evaluator/backend/custom_types.py def create_error_val(\n inf_err: bool | str,\n ext_err: bool | str,\n json_err: bool | str,\n other_err: bool | str,\n notes: str,\n) -> ErrorEval:\n \"\"\"Constructor for the ErrorEval TypedDict.\n\n Parameters\n ----------\n inf_err : bool | str\n The inferred knowledge error indicator.\n ext_err : bool | str\n The external knowledge error indicator.\n json_err : bool | str\n The JSON formattign error indicator.\n notes : str\n Any additional notes from the evaluator regarding the error evaluation.\n \"\"\"\n if isinstance(inf_err, str):\n inf_err = cast_checkbox(inf_err)\n if isinstance(ext_err, str):\n ext_err = cast_checkbox(ext_err)\n if isinstance(json_err, str):\n json_err = cast_checkbox(json_err)\n if isinstance(other_err, str):\n other_err = cast_checkbox(other_err)\n\n return_data: ErrorEval = {\n \"inferred_knowledge_error\": inf_err,\n \"external_knowledge_error\": ext_err,\n \"json_format_error\": json_err,\n \"other_error\": other_err,\n \"notes\": notes.strip(),\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_reference_eval","title":"create_reference_eval(reference_relevancy, top_reference_retrieval, notes) ","text":"Constructor for the RefereceEval TypedDict. Parameters: Name Type Description Default reference_relevancy int Indicates how relevant the reference nodes were to the domain. required top_reference_retrieval bool Whether the top node retrieved was the most relevant. required notes str Any additional notes from the evaluator regarding the reference evaluation. required Returns: Type Description ReferenceEval Source code in evaluator/backend/custom_types.py def create_reference_eval(\n reference_relevancy: int, top_reference_retrieval: bool | str, notes: str\n) -> RefereceEval:\n \"\"\"Constructor for the RefereceEval TypedDict.\n\n Parameters\n ----------\n reference_relevancy : int\n Indicates how relevant the reference nodes were to the domain.\n top_reference_retrieval : bool\n Whether the top node retrieved was the most relevant.\n notes : str\n Any additional notes from the evaluator regarding the reference evaluation.\n\n Returns\n -------\n ReferenceEval\n \"\"\"\n if isinstance(top_reference_retrieval, str):\n top_reference_retrieval = cast_checkbox(top_reference_retrieval)\n\n return_data: RefereceEval = {\n \"reference_relevancy\": reference_relevancy,\n \"top_reference_retrieval\": top_reference_retrieval,\n \"notes\": notes.strip(),\n }\n\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_general_eval","title":"create_general_eval(relevancy, readability, reproducibility, confidence_rating, notes) ","text":"Constructor for the GeneralEval TypedDict. Parameters: Name Type Description Default relevancy int Indicates how relevant the generated domain was. required readability int Indicates how readable the generated domain was. required reproducibility int Indicates how reproduceable the domain steps are. required confidence_rating int Indicates how confident the evaluator is in the generated domain. 
required notes str Any additional notes from the evaluator regarding the general evaluation. required Returns: Type Description GeneralEval Source code in evaluator/backend/custom_types.py def create_general_eval(\n relevancy: int,\n readability: int,\n reproducibility: int,\n confidence_rating: int,\n notes: str,\n) -> GeneralEval:\n \"\"\"Constructor for the GeneralEval TypedDict.\n\n Parameters\n ----------\n relevancy : int\n Indicates how relevant the generated domain was.\n readability : int\n Indicates how readable the generated domain was.\n reproducibility : int\n Indicates how reproduceable the domain steps are.\n confidence_rating : int\n Indicates how confident the evaluator is in the generated domain.\n notes : str\n Any additional notes from the evaluator regarding the general evaluation.\n\n Returns\n -------\n GeneralEval\n \"\"\"\n return_data: GeneralEval = {\n \"relevancy\": relevancy,\n \"readability\": readability,\n \"reproducibility\": reproducibility,\n \"confidence_rating\": confidence_rating,\n \"notes\": notes.strip(),\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_misc_eval","title":"create_misc_eval(human_domain_rating, evaluator_confidence_rating, evaluator_familiarity_level, notes) ","text":"Constructor for the MiscEval TypedDict. Parameters: Name Type Description Default human_domain_rating int The high level human domain rating for the generated domain. required evaluator_confidence_rating int Indicates how confident the evaluator is in their evaluation. required evaluator_familiarity_level int Indicates how familiar the evaluator is with the paper content. required notes str Any additional notes from the evaluator regarding the miscellaneous evaluation. required Returns: Type Description MiscEval Source code in evaluator/backend/custom_types.py def create_misc_eval(\n human_domain_rating: int,\n evaluator_confidence_rating: int,\n evaluator_familiarity_level: int,\n notes: str,\n) -> MiscEval:\n \"\"\"Constructor for the MiscEval TypedDict.\n\n Parameters\n ----------\n human_domain_rating : int\n The high level human domain rating for the generated domain.\n evaluator_confidence_rating : int\n Indicates how confident the evaluator is in their evaluation.\n evaluator_familiarity_level: int\n Indicates how familiar the evaluator is with the paper content.\n notes : str\n Any additional notes from the evaluator regarding the miscellaneous evaluation.\n\n Returns\n -------\n MiscEval\n \"\"\"\n return_data: MiscEval = {\n \"human_domain_rating\": human_domain_rating,\n \"evaluator_confidence_rating\": evaluator_confidence_rating,\n \"evaluator_familiarity_level\": evaluator_familiarity_level,\n \"notes\": notes.strip(),\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_full_eval","title":"create_full_eval(score_eval, error_eval, reference_eval, general_eval, misc_eval) ","text":"Constructor for the EvalData TypedDict. 
Parameters: Name Type Description Default score_eval ScoreEval required error_eval ErrorEval required reference_eval RefereceEval required general_eval GeneralEval required misc_eval MiscEval required Source code in evaluator/backend/custom_types.py def create_full_eval(\n score_eval: ScoreEval,\n error_eval: ErrorEval,\n reference_eval: RefereceEval,\n general_eval: GeneralEval,\n misc_eval: MiscEval,\n) -> EvalData:\n \"\"\"Constructor for the EvalData TypedDict.\n\n Parameters\n ----------\n score_eval: ScoreEval\n error_eval: ErrorEval\n reference_eval: RefereceEval\n general_eval: GeneralEval\n misc_eval: MiscEval\n \"\"\"\n return_data: EvalData = {\n \"score_eval\": score_eval,\n \"error_eval\": error_eval,\n \"reference_eval\": reference_eval,\n \"general_eval\": general_eval,\n \"misc_eval\": misc_eval,\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.load_score_defaults","title":"load_score_defaults(filepath='./evaluator/backend/score_defaults.json') ","text":"Loads the score defaults JSON file. Parameters: Name Type Description Default filepath str The filepath to the score defaults JSON file. './evaluator/backend/score_defaults.json' Returns: Type Description EvalData | None The evaluation data with the default values or None on error. Source code in evaluator/backend/custom_types.py def load_score_defaults(\n filepath: str = \"./evaluator/backend/score_defaults.json\",\n) -> Optional[EvalData]:\n \"\"\"Loads the score defaults JSON file.\n\n Parameters\n ----------\n filepath : str, optional\n The filepath to the score defaults JSON file.\n\n Returns\n -------\n EvalData | None\n The evaluation data with the default values or None on error.\n \"\"\"\n naive_load_data = misc_fns.load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n eval_defaults = cast(EvalData, naive_load_data)\n return eval_defaults\n return None\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.default_eval","title":"default_eval() ","text":"Get a default EvalData. Returns: Type Description EvalData Source code in evaluator/backend/custom_types.py def default_eval() -> EvalData:\n \"\"\"Get a default EvalData.\n\n Returns\n -------\n EvalData\n \"\"\"\n eval_defaults = load_score_defaults()\n if eval_defaults is None:\n misc_fns.graceful_exit(1, \"Error loading score defaults.\")\n return eval_defaults\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.check_default_eval","title":"check_default_eval(val) ","text":"Checks if the EvalData is still the default. This helps to prevent saving erroneous save data. Parameters: Name Type Description Default val dict | EvalData The evaluation data to check. required Returns: Type Description bool True if still the default, False if different. Source code in evaluator/backend/custom_types.py def check_default_eval(val: dict | EvalData) -> bool:\n \"\"\"Checks if the EvalData is still the default. 
This\n helps to prevent saving erroneous save data.\n\n Parameters\n ----------\n val : dict | EvalData\n The evaluation data to check.\n\n Returns\n -------\n bool\n True if still the default, False if different.\n \"\"\"\n default_eval_dict = default_eval()\n diff = DeepDiff(\n default_eval_dict,\n val,\n ignore_order=True,\n ignore_string_case=True,\n ignore_nan_inequality=True,\n )\n if diff == {}:\n return True\n else:\n return False\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_run_state","title":"create_run_state(paper, domain, generated_domain, generated_file_path, human_curated_domain, param_set, reference_nodes, run_index, total_runs, already_evaluated, logger, eval_data) ","text":"Constructor for the RunState TypedDict. Parameters: Name Type Description Default paper str The paper for the current run state. required domain str The domain the current run is for. required generated_domain str | dict The generated domain for the current run. required generated_file_path str The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file). required human_curated_domain str The human curated domain string. required param_set str The parameter set string for the run. required reference_nodes str The retrieved reference node values. required run_index int The run index. required total_runs int The total number of runs to potentially evaluate. required already_evaluated bool Whether the user has already evaluated this run. required logger Logger The logger for the App. required eval_data EvalData The evaluation data for the run. required Source code in evaluator/backend/custom_types.py def create_run_state(\n paper: str,\n domain: str,\n generated_domain: str | dict,\n generated_file_path: str,\n human_curated_domain: str,\n param_set: str,\n reference_nodes: str,\n run_index: int,\n total_runs: int,\n already_evaluated: bool,\n logger: Logger,\n eval_data: EvalData,\n) -> RunState:\n \"\"\"Constructor for the RunState TypedDict.\n\n Parameters\n ----------\n paper: str\n The paper for the current run state.\n domain: str\n The domain the current run is for.\n generated_domain: str | dict\n The generated domain for the current run.\n generated_file_path: str\n The generated domain file path (points to the JSON file if valid JSON, otherwise points to the raw text file).\n human_curated_domain: str\n The human curated domain string.\n param_set: str\n The parameter set string for the run.\n reference_nodes: str\n The retrieved reference node values.\n run_index: int\n The run index.\n total_runs: int\n The total number of runs to potentially evaluate.\n already_evaluated: bool\n Whether the user has already evaluated this run.\n logger: Logger\n The logger for the App.\n eval_data: EvalData\n The evaluation data for the run.\n \"\"\"\n score = -1.0\n score_version = 0.0\n if isinstance(generated_domain, dict):\n # TODO : whenever the BCO score API endpoint is\n # created hit that here.\n generated_domain = json.dumps(generated_domain, indent=4)\n\n return_data: RunState = {\n \"paper\": paper,\n \"domain\": domain,\n \"generated_domain\": generated_domain,\n \"score\": score,\n \"score_version\": score_version,\n \"generated_file_path\": generated_file_path,\n \"human_curated_domain\": human_curated_domain,\n \"param_set\": param_set,\n \"reference_nodes\": reference_nodes,\n \"run_index\": run_index,\n \"total_runs\": total_runs,\n \"already_evaluated\": already_evaluated,\n \"logger\": logger,\n 
\"eval_data\": eval_data,\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_app_attributes","title":"create_app_attributes(logger, results_dir_path, bco_results_file_name, bco_results_data, user_results_file_name, user_results_data, users_file_name, users_data, generated_output_dir_root, generated_directory_paths, padding, font) ","text":"Constructor for the AppAttributes TypedDict. Parameters: Name Type Description Default logger Logger The App logger. required results_dir_path str The path to the directory to dump the evaluation results. required bco_results_file_name str The file name for the BCO results file. required bco_results_data dict The aggregates BCO results data. required user_results_file_name str The file name for the user evaluations results file. required user_results_data dict[str, dict[str, EvalData | None] | None] The user evaluation results. required users_file_name str The file name for the users file. required users_data dict The users data. required generated_output_dir_root str The root filepath to the generated domains directory to evaluate. required generated_directory_paths list[str] List of directory paths for all the papers. required padding int The default root padding to use for all the frontend components. required font str The default font to use for all the frontend components. required Source code in evaluator/backend/custom_types.py def create_app_attributes(\n logger: Logger,\n results_dir_path: str,\n bco_results_file_name: str,\n bco_results_data: dict,\n user_results_file_name: str,\n user_results_data: dict[str, dict[str, EvalData | None] | None],\n users_file_name: str,\n users_data: dict,\n generated_output_dir_root: str,\n generated_directory_paths: list[str],\n padding: int,\n font: str,\n) -> AppAttributes:\n \"\"\"Constructor for the AppAttributes TypedDict.\n\n Parameters\n ----------\n logger : Logger\n The App logger.\n results_dir_path : str\n The path to the directory to dump the evaluation results.\n bco_results_file_name : str\n The file name for the BCO results file.\n bco_results_data: dict\n The aggregates BCO results data.\n user_results_file_name: str\n The file name for the user evaluations results file.\n user_results_data: dict[str, dict[str, EvalData | None] | None]\n The user evaluation results.\n users_file_name: str\n The file name for the users file.\n users_data: dict\n The users data.\n generated_output_dir_root: str\n The root filepath to the generated domains directory to evaluate.\n generated_directory_paths: list[str]\n List of directory paths for all the papers.\n padding: int\n The default root padding to use for all the frontend components.\n font: str\n The default font to use for all the frontend components. 
\n \"\"\"\n return_data: AppAttributes = {\n \"logger\": logger,\n \"results_dir_path\": results_dir_path,\n \"bco_results_file_name\": bco_results_file_name,\n \"bco_results_data\": bco_results_data,\n \"user_results_file_name\": user_results_file_name,\n \"user_results_data\": user_results_data,\n \"users_file_name\": users_file_name,\n \"users_data\": users_data,\n \"generated_output_dir_root\": generated_output_dir_root,\n \"generated_directory_paths\": generated_directory_paths,\n \"padding\": padding,\n \"font\": font,\n }\n return return_data\n "},{"location":"evaluator-custom-types/#evaluator.backend.custom_types.create_app_state","title":"create_app_state(attributes, user_hash, new_user, resume_session=False) ","text":"Constructor for the AppState TypedDict. Parameters: Name Type Description Default attributes AppAttributes The app attributes to base the state off of. required user_hash str The user hash. required new_user bool New user flag. required resume_session bool Resume session flag. False Returns: Type Description AppState Source code in evaluator/backend/custom_types.py def create_app_state(\n attributes: AppAttributes,\n user_hash: str,\n new_user: bool,\n resume_session: bool = False,\n) -> AppState:\n \"\"\"Constructor for the AppState TypedDict.\n\n Parameters\n ----------\n attributes : AppAttributes\n The app attributes to base the state off of.\n user_hash: str\n The user hash.\n new_user: bool\n New user flag.\n resume_session: bool, optional\n Resume session flag.\n\n Returns\n -------\n AppState\n \"\"\"\n return_data: AppState = {\n \"logger\": attributes[\"logger\"],\n \"results_dir_path\": attributes[\"results_dir_path\"],\n \"bco_results_file_name\": attributes[\"bco_results_file_name\"],\n \"bco_results_data\": attributes[\"bco_results_data\"],\n \"user_results_file_name\": attributes[\"user_results_file_name\"],\n \"user_results_data\": attributes[\"user_results_data\"],\n \"users_file_name\": attributes[\"users_file_name\"],\n \"users_data\": attributes[\"users_data\"],\n \"generated_output_dir_root\": attributes[\"generated_output_dir_root\"],\n \"generated_directory_paths\": attributes[\"generated_directory_paths\"],\n \"padding\": attributes[\"padding\"],\n \"font\": attributes[\"font\"],\n \"user_hash\": user_hash,\n \"new_user\": new_user,\n \"resume_session\": resume_session,\n }\n return return_data\n "},{"location":"general-frame/","title":"General Frame","text":""},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame","title":"GeneralFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the general evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/general_frame.py class GeneralFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the general evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_gen_label = ctk.CTkLabel(\n master=self,\n text=\"General Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_gen_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.relevancy_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.relevancy_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.relevancy_var\n )\n self.relevancy_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.readability_label = ctk.CTkLabel(\n master=self,\n text=\"How readable is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.readability_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.readability_var\n )\n self.readability_button.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_label = ctk.CTkLabel(\n master=self,\n text=\"How reproducible is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.reproducibility_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.reproducibility_var\n )\n self.reproducibility_button.grid(\n row=4,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence rating for the domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.conf_label.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", 
EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.conf_var\n )\n self.conf_button.grid(\n row=5,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.general_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.general_notes_label.grid(\n row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.general_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.general_notes.grid(\n row=7,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button.configure(variable=self.relevancy_var)\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button.configure(variable=self.readability_var)\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button.configure(variable=self.reproducibility_var)\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button.configure(variable=self.conf_var)\n\n self.general_notes.delete(0.0, \"end\")\n self.general_notes.insert(\n 0.0, self.general_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> GeneralEval:\n \"\"\"Returns the general evaluations.\n\n Returns\n -------\n GeneralEval\n The general evaluation results.\n \"\"\"\n relevancy_val = self.relevancy_var.get()\n readability_var = self.readability_var.get()\n reproducibility_var = self.reproducibility_var.get()\n conf_var = self.conf_var.get()\n general_val = create_general_eval(\n relevancy=relevancy_val,\n readability=readability_var,\n reproducibility=reproducibility_var,\n confidence_rating=conf_var,\n notes=self.general_notes.get(0.0, \"end\"),\n )\n return general_val\n "},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/evaluation_frames/general_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_gen_label = ctk.CTkLabel(\n master=self,\n text=\"General Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_gen_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.relevancy_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.relevancy_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.relevancy_var\n )\n self.relevancy_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.readability_label = ctk.CTkLabel(\n master=self,\n text=\"How readable is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.readability_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.readability_var\n )\n self.readability_button.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_label = ctk.CTkLabel(\n master=self,\n text=\"How reproducible is the domain content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.reproducibility_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.reproducibility_var\n )\n self.reproducibility_button.grid(\n row=4,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence rating for the domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.conf_label.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 
2], variable=self.conf_var\n )\n self.conf_button.grid(\n row=5,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.general_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.general_notes_label.grid(\n row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.general_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.general_notes.grid(\n row=7,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/general_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.general_eval = self.run[\"eval_data\"][\"general_eval\"]\n\n self.relevancy_var = ctk.IntVar(\n value=self.general_eval.get(\"relevancy\", EVAL_DEFAULTS[\"relevancy\"])\n )\n self.relevancy_button.configure(variable=self.relevancy_var)\n\n self.readability_var = ctk.IntVar(\n value=self.general_eval.get(\"readability\", EVAL_DEFAULTS[\"readability\"])\n )\n self.readability_button.configure(variable=self.readability_var)\n\n self.reproducibility_var = ctk.IntVar(\n value=self.general_eval.get(\n \"reproducibility\", EVAL_DEFAULTS[\"reproducibility\"]\n )\n )\n self.reproducibility_button.configure(variable=self.reproducibility_var)\n\n self.conf_var = ctk.IntVar(\n value=self.general_eval.get(\n \"confidence_rating\", EVAL_DEFAULTS[\"confidence_rating\"]\n )\n )\n self.conf_button.configure(variable=self.conf_var)\n\n self.general_notes.delete(0.0, \"end\")\n self.general_notes.insert(\n 0.0, self.general_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"general-frame/#evaluator.frontend.components.evaluation_frames.general_frame.GeneralFrame.get_results","title":"get_results() ","text":"Returns the general evaluations. Returns: Type Description GeneralEval The general evaluation results. Source code in evaluator/frontend/components/evaluation_frames/general_frame.py def get_results(self) -> GeneralEval:\n \"\"\"Returns the general evaluations.\n\n Returns\n -------\n GeneralEval\n The general evaluation results.\n \"\"\"\n relevancy_val = self.relevancy_var.get()\n readability_var = self.readability_var.get()\n reproducibility_var = self.reproducibility_var.get()\n conf_var = self.conf_var.get()\n general_val = create_general_eval(\n relevancy=relevancy_val,\n readability=readability_var,\n reproducibility=reproducibility_var,\n confidence_rating=conf_var,\n notes=self.general_notes.get(0.0, \"end\"),\n )\n return general_val\n "},{"location":"grid-search/","title":"Grid Search","text":"Grid search class implementation. 
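Before the BcoGridSearch reference below, here is a minimal, self-contained sketch of the cartesian-product idea the grid search is built on. The parameter names and values here are purely illustrative and are not the project's API; the real field set is the full UserSelections combination shown in the source that follows.

from itertools import product

# Hypothetical, pared-down search space (illustrative values only).
llms = ["gpt-4-turbo", "gpt-3.5-turbo"]
chunking_configs = ["256 chunk size/20 chunk overlap", "1024 chunk size/20 chunk overlap"]
similarity_top_k = [1, 3]

# product() yields every combination, so the grid size is the product of the list lengths.
param_sets = [
    {"llm": llm, "chunking_config": chunking, "similarity_top_k": k}
    for llm, chunking, k in product(llms, chunking_configs, similarity_top_k)
]

print(len(param_sets))  # 2 * 2 * 2 = 8 combinations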
"},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch","title":"BcoGridSearch ","text":" Bases: BcoParameterSearch BCO grid search class. Subclass of BcoParameterSearch . Source code in parameter_search/grid_search.py class BcoGridSearch(BcoParameterSearch):\n \"\"\"BCO grid search class. Subclass of `BcoParameterSearch`.\"\"\"\n\n def __init__(self, search_space: SearchSpace):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n \"\"\"\n super().__init__(search_space)\n\n def _setup_logger(self, path: str = \"./logs\", name: str = \"grid-search\") -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n\n def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a cartesian product of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n Every comination of the parameter search space.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n\n if self._other_docs is None:\n base_selections[\"other_docs\"] = None\n else:\n for paper, other_docs in self._other_docs.items():\n if paper == os.path.basename(str(filepath)):\n base_selections[\"other_docs\"] = other_docs\n\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n base_selections[\"other_docs\"],\n )\n param_sets.append(user_selections)\n\n return param_sets\n "},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch.__init__","title":"__init__(search_space) ","text":"Constructor. Parameters: Name Type Description Default search_space SearchSpace The parameter search space. 
required Source code in parameter_search/grid_search.py def __init__(self, search_space: SearchSpace):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n \"\"\"\n super().__init__(search_space)\n "},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch._setup_logger","title":"_setup_logger(path='./logs', name='grid-search') ","text":"Sets up the logger. Parameters: Name Type Description Default path str File path for the logger. './logs' name str Name for the logger output. 'grid-search' Returns: Type Description Logger The grid search logger. Source code in parameter_search/grid_search.py def _setup_logger(self, path: str = \"./logs\", name: str = \"grid-search\") -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n "},{"location":"grid-search/#parameter_search.grid_search.BcoGridSearch._create_param_sets","title":"_create_param_sets() ","text":"Creates a cartesian product of the parameter space. Returns: Type Description list[UserSelections] Every comination of the parameter search space. Source code in parameter_search/grid_search.py def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a cartesian product of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n Every comination of the parameter search space.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n\n if self._other_docs is None:\n base_selections[\"other_docs\"] = None\n else:\n for paper, other_docs in self._other_docs.items():\n if paper == os.path.basename(str(filepath)):\n base_selections[\"other_docs\"] = other_docs\n\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n base_selections[\"other_docs\"],\n )\n param_sets.append(user_selections)\n\n return param_sets\n "},{"location":"in-progress/","title":"In-Progress Report 
Documentation","text":"The BCO standard describes comprehensive documentation on the complete specifications of a bioinformatics workflow. Unfortunately, this makes it difficult to create BCOs while work is still in progress. If a full paper describing the complete workflow for the project has not yet been completed, the in-progress mode can be used to create in progress documentation using the Aggregator tool. The Aggregator tool leverages the OpenAI gpt-4o-mini model to generate a plain text summary that follows a similar structure to the domains of a BioCompute Object (BCO). The in progress documentation aggregator can be run from the main.py entrypoint using the in-progress positional argument and the --path option. The available options for the in progress mode are as follows: --path : The path to the directory to process (required). --include : Comma delimited list of glob patterns to include (optional). --exclude : Comma delimited list of glob patterns to exclude (optional). --exclude-from-tree : Whether to exclude non-included files in the source tree (optional, store true argument). --include-priority : Whether to prioritize include or exclude patterns in the case of conflict (optional, store false argument). Here's an example output from the Aggregator tool when run on this project: "},{"location":"in-progress/#biocompute-object-documentation-for-the-bco-rag-project","title":"BioCompute Object Documentation for the BCO-RAG Project","text":""},{"location":"in-progress/#usability-domain","title":"Usability Domain","text":"The BCO-RAG project aims to provide an automated assistant for generating BioCompute Objects (BCOs) from existing biological research publications. This tool allows researchers to easily convert their publications into a standardized format, thus enhancing reproducibility and transparency in biological data analysis workflows. The primary use case is to reduce the overhead of retroactively documenting existing workflows used in research, making it easier for users to adhere to BCO standards while leveraging advanced language models for generation. "},{"location":"in-progress/#io-domain","title":"IO Domain","text":""},{"location":"in-progress/#input-files","title":"Input Files:","text":" - High resolution measurement PDF file located in
bco-rag/test_papers/High resolution measurement.pdf . "},{"location":"in-progress/#output-files","title":"Output Files:","text":" - Output directory structure will be created under
output/high_resolution_measurement/ containing: generated_domains/ subdirectory with generated domain files. - JSON and TXT files for each domain generated (e.g.,
usability-1-{hash}.json , io-1-{hash}.txt ). reference_sources/ subdirectory for tracking source references. output_map.json and output_map.tsv files that track generated domains and parameter sets. "},{"location":"in-progress/#description-domain","title":"Description Domain","text":""},{"location":"in-progress/#keywords","title":"Keywords:","text":" - BCO-RAG, BioCompute Object, automation, reproducibility, biological data analysis, retrieval-augmented generation, documentation standardization.
"},{"location":"in-progress/#workflow-steps","title":"Workflow Steps:","text":" - Load the PDF: Use PDF or directory reader to ingest the publication.
- Generate Domain: Execute
perform_query for each BCO domain including usability, IO, description, execution, parametric, and error domains. - Store Outputs: Save generated outputs to the specified output directory.
- Log Data: Keep track of input/output files and their relationships in
output_map.json . "},{"location":"in-progress/#execution-domain","title":"Execution Domain","text":"The BCO-RAG requires the following setup for execution: - Dependencies: Users must have Python 3.10 or higher installed. - Required Packages: Install dependencies specified in requirements.txt using pip install -r requirements.txt . - Environment Configuration: - Set the OpenAI API key in a .env file. - Set the Github personal access token if using Github options. "},{"location":"in-progress/#run-instructions","title":"Run Instructions:","text":""},{"location":"in-progress/#parametric-domain","title":"Parametric Domain","text":"The following parameters affect the computational workflow: - loader (str): Data loader used (e.g., PDFReader ). - chunking_config (str): Configuration for chunking strategy (e.g., 1024 chunk size/20 chunk overlap ). - embedding_model (str): Model used for embeddings (e.g., text-embedding-3-large ). - vector_store (str): Name of the vector store used (e.g., VectorStoreIndex ). - similarity_top_k (int): Number of top entries to retrieve during similarity search. - llm (str): Language model choice (e.g., gpt-4-turbo ). - git_data (Optional[GitData]): Includes repository info if GitHub is used. "},{"location":"in-progress/#examples-of-parameters","title":"Examples of Parameters:","text":" - Sample settings might include: LLM as
gpt-4 , embedding model as text-embedding-3-large , similarity_top_k as 3 , etc. "},{"location":"in-progress/#error-domain","title":"Error Domain","text":"The project tracks potential errors in the generated domains: - Inferred Knowledge Errors: Errors related to information that require inference based on external conditions not stated in the source material. - External Knowledge Errors: Errors arising from insufficient context provided for the domain's connections to external references. - JSON Formatting Errors: Issues arising if the generated output is not valid JSON. - Miscellaneous Errors: Any other discrepancies consistently tracked for documentation purposes. "},{"location":"in-progress/#evaluation","title":"Evaluation:","text":"For each output generated, the tool logs potential errors and evaluations of the output quality, ensuring that all relevant data is captured in the final documentation. "},{"location":"in-progress/#overall-functionality","title":"Overall Functionality:","text":"The BCO-RAG project automates the generation of a structured and standardized representation of computational research workflows, significantly aiding in data sharing and reproducibility within the biological research community. "},{"location":"installation/","title":"Installation and Setup","text":" - Prerequisites
- Quickstart
- Virtual Environment
- Create Log Directory
- OpenAI API Key
"},{"location":"installation/#prerequisites","title":"Prerequisites","text":"This directory requires at least Python 3.10 to setup. The code in this directory makes extensive use of an alternate way to indicate union type annotations as X | Y instead of Union[X, Y] from the Typing library. "},{"location":"installation/#quickstart","title":"Quickstart","text":"Getting started with the BCO-RAG assistant requires minimal setup. From a high level, this guide will walk you through: - Getting the code on your local machine.
- Setting up a virtual environment and downloading the project dependencies.
- Required environment variable(s).
- Starting up the assistant in its primary usage form.
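As a quick illustration of the union syntax mentioned in the prerequisites (the functions here are purely illustrative and not part of the project code):

from typing import Optional, Union

# Older typing spelling, valid on earlier Python versions.
def older_style(value: Optional[str]) -> Union[int, None]:
    return int(value) if value is not None else None

# Equivalent PEP 604 spelling used throughout this codebase; requires Python >= 3.10.
def newer_style(value: str | None) -> int | None:
    return int(value) if value is not None else None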
"},{"location":"installation/#clone-the-repository","title":"Clone the repository","text":"First, clone the repository to your machine: git clone git@github.com:biocompute-objects/bco-rag.git\n This example uses the ssh method, replace with HTTPS URL as needed. "},{"location":"installation/#virtual-environment","title":"Virtual Environment","text":"Create a virtual environment from with the bco-rag/ root directory: virtualenv env\n This example uses virtualenv to create the virtual environment, replace with venv or your preferred virtual environment handler as needed. To activate the virtual environment on Windows: env/Scripts/activate\n To activate the virtual environment on MacOS/Linux: source env/bin/activate\n Then install the project dependencies: (env) pip install -r requirements.txt\n "},{"location":"installation/#openai-api-key","title":"OpenAI API Key","text":"Create your .env file and add your OpenAI API key and Github personal access token (if using Github option). For example: OPENAI_API_KEY=<KEY>\nGITHUB_TOKEN=<TOKEN>\n If you are not planning on including Github repositories in the data ingestion process, you don't need to include a Github personal access token in your .env file. Additional information on obtaining API keys/tokens: - OpenAI API Key
- Github Personal Access Token
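For reference, one way the values in the .env file can be read at runtime is sketched below. This is a hedged illustration rather than the project's exact loading code, and it assumes the python-dotenv package (or an equivalent loader) is available in the environment.

import os
from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv()  # reads the .env file from the current working directory

openai_key = os.getenv("OPENAI_API_KEY")
github_token = os.getenv("GITHUB_TOKEN")  # only needed when using the Github option

if openai_key is None:
    raise RuntimeError("OPENAI_API_KEY is not set; see the .env example above.")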
"},{"location":"installation/#create-log-directory","title":"Create log Directory","text":"Within the root of the project, create the log directory: mkdir logs/\n "},{"location":"installation/#basic-usage","title":"Basic Usage","text":"The base one-shot approach can be run like so: (env) python main.py\n or (env) python main.py one-shot\n On startup, you will be prompted to choose the paper generate the BCO domains for. You can place any .pdf paper in the ./bco-rag/bcorag/test_papers directory for it to be included in this menu. The arrow keys or j /k can be used to navigate the menus. Press Enter to choose an option. Please choose the PDF file to index:\n\n -> High resolution measurement.pdf\n Exit\n After choosing the paper to index, you'll be prompted for the data loader to use. On any configuration menu step, there will be a link that will direct you to detailed documentation on the differences, strengths, and weaknesses between each of the options. Please choose one of the following Loaders.\n Documentation can be found at:\n https://biocompute-objects.github.io/bco-rag/options/#data-loader.\n\n -> SimpleDirectoryReader (default)\n PDFReader\n PDFMarker\n Exit\n After choosing the data loader, the chunking strategy has to be chosen: Please choose one of the following Chunking Configs.\n Documentation can be found at:\n https://biocompute-objects.github.io/bco-rag/options/#chunking-strategy.\n\n -> 256 chunk size/20 chunk overlap\n 512 chunk size/50 chunk overlap\n 1024 chunk size/20 chunk overlap (default)\n 2048 chunk size/50 chunk overlap\n semantic\n Exit\n After choosing the chunking strategy, the embedding model has to be chosen: Please choose one of the following Embedding Models.\n Documentation can be found at:\n https://biocompute-objects.github.io/bco-rag/options/#embedding-model.\n\n -> text-embedding-3-small (default)\n text-embedding-3-large\n text-embedding-ada-002\n Exit\n After choosing the embedding model, the vector store has to be chosen: Please choose one of the following Vector Stores.\n Documentation can be found at:\n https://biocompute-objects.github.io/bco-rag/options/#vector-store.\n\n -> VectorStoreIndex (default)\n Exit\n After choosing the vector store, the similarity top k value has to be chosen: Please choose one of the following Similarity Top Ks.\n Documentation can be found at:\n https://biocompute-objects.github.io/bco-rag/options/#similarity-top-k.\n\n -> 1 (default)\n 2\n 3\n 4\n 5\n Exit\n After choosing the similarity top k value, the LLM has to be chosen: Please choose one of the following Llms.\n Documentation can be found at:\n https://biocompute-objects.github.io/bco-rag/options/#llm-model.\n\n -> gpt-3.5-turbo\n gpt-4-turbo (default)\n gpt-4-turbo-preview\n gpt-4\n Exit\n Next, choose the run mode. The run mode will control the verbosity of the logging for each generated domain. Choosing debug mode will include an extensive logging of everything that is happening under the hood during ewach run. Choosing production mode will only include the minimum necessary logging, wuch as the user options and return responses. Please choose one of the following Modes.\n Documentation can be found at:\n https://biocompute-objects.github.io/bco-rag/options/#mode.\n\n -> debug\n production (default)\n Exit\n Next, you will be prompted to include a Github repository to include as supplemental knowledge for the retrieval step. In this example, we have pasted the URL to the repository for this project. 
If you have included a Github repository, you will then be prompted for more granular configuration options regarding how the repository will be ingested. These configuration options include which repository branch to index and optional directory/file extension filters. In this example, we are indexing the repository's main branch, excluding the output/ , logs/ , parameter_search/ , output/ , and evaluator/ directories. We are also excluding any files with the file extensions of .txt , .log , and .md . If you would like to include a Github repository enter the URL below. Enter \"x\" to exit or leave blank to skip.\n> https://github.com/biocompute-objects/bco-rag\nRepo branch to index (case sensitive):\n> main\nWould you like to include a directory filter?\nEnter a list of comma-delimited directories to either conditionally exclude or inclusively include. Or leave blank to skip.\n> output, logs, parameter_search, output, evaluator\nEnter \"include\" or \"exclude\" for the directory filter.\n> exclude\nWould you like to include a file extension filter?\nEnter a list of comma-delimited file extensions to either conditionally exclude or inclusively include. Or leave blank to skip.\n> .txt, .log, .md\nEnter \"include\" or \"exclude\" for the file extension filter.\n> exclude\n More extensive documentation that goes past this quick start guide can be found on the usage page. Once the configuration steps are completed, you can select which domains to generate. You can enter the shorthand code, denoted inside the [] brackets, or the full domain name and then pressing Enter . Which domain would you like to generate? Supported domains are:\n [u]sability\n [i]o\n [d]escription\n [e]xecution\n [p]arametric\n [err]or\n E[x]it\n\n>\n "},{"location":"intermediate-screen/","title":"Intermediate Screen","text":""},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen","title":"IntermediateScreen ","text":" Bases: CTkFrame Class for the intermediate screen for the user to choose to start from the beginning or to continue from last session. 
Source code in evaluator/frontend/components/intermediate_screen.py class IntermediateScreen(ctk.CTkFrame):\n \"\"\"Class for the intermediate screen for the user to choose\n to start from the beginning or to continue from last session.\n \"\"\"\n\n def __init__(\n self,\n master: ctk.CTk,\n on_start: Callable[[AppState], None],\n app_state: AppState,\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.on_start = on_start\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"] + 10,\n pady=self.state[\"padding\"] + 10,\n )\n\n self.welcome_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.state[\"font\"], 32, \"bold\")\n )\n self.welcome_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n self.start_new_button = ctk.CTkButton(\n master=self,\n text=\"Start From Beginning\",\n command=self._start_new,\n font=(self.state[\"font\"], 16),\n )\n self.start_new_button.grid(\n row=1, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n if self.state[\"new_user\"]:\n welcome_text = \"New User\"\n else:\n welcome_text = \"Welcome Back\"\n self.continue_button = ctk.CTkButton(\n master=self,\n text=\"Continue Last Session\",\n command=self._continue_last,\n font=(self.state[\"font\"], 16),\n )\n self.continue_button.grid(\n row=2,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] + 10,\n )\n self.welcome_label.configure(text=welcome_text)\n\n def _start_new(self) -> None:\n \"\"\"User chose to start a new session.\"\"\"\n self.state = set_resume_session(self.state, False)\n self.on_start(self.state)\n\n def _continue_last(self) -> None:\n \"\"\"User chose to continue from last session.\"\"\"\n self.state = set_resume_session(self.state, True)\n self.on_start(self.state)\n "},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen.__init__","title":"__init__(master, on_start, app_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/intermediate_screen.py def __init__(\n self,\n master: ctk.CTk,\n on_start: Callable[[AppState], None],\n app_state: AppState,\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.on_start = on_start\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"] + 10,\n pady=self.state[\"padding\"] + 10,\n )\n\n self.welcome_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.state[\"font\"], 32, \"bold\")\n )\n self.welcome_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n self.start_new_button = ctk.CTkButton(\n master=self,\n text=\"Start From Beginning\",\n command=self._start_new,\n font=(self.state[\"font\"], 16),\n )\n self.start_new_button.grid(\n row=1, column=0, padx=self.state[\"padding\"], pady=self.state[\"padding\"] + 10\n )\n\n if self.state[\"new_user\"]:\n welcome_text = \"New User\"\n else:\n welcome_text = \"Welcome Back\"\n self.continue_button = ctk.CTkButton(\n master=self,\n text=\"Continue Last Session\",\n command=self._continue_last,\n font=(self.state[\"font\"], 16),\n )\n self.continue_button.grid(\n row=2,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] + 10,\n )\n self.welcome_label.configure(text=welcome_text)\n "},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen._start_new","title":"_start_new() ","text":"User chose to start a new session. Source code in evaluator/frontend/components/intermediate_screen.py def _start_new(self) -> None:\n \"\"\"User chose to start a new session.\"\"\"\n self.state = set_resume_session(self.state, False)\n self.on_start(self.state)\n "},{"location":"intermediate-screen/#evaluator.frontend.components.intermediate_screen.IntermediateScreen._continue_last","title":"_continue_last() ","text":"User chose to continue from last session. Source code in evaluator/frontend/components/intermediate_screen.py def _continue_last(self) -> None:\n \"\"\"User chose to continue from last session.\"\"\"\n self.state = set_resume_session(self.state, True)\n self.on_start(self.state)\n "},{"location":"login-screen/","title":"Login Screen","text":""},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen","title":"LoginScreen ","text":" Bases: CTkFrame Class for the login screen. 
Source code in evaluator/frontend/components/login_screen.py class LoginScreen(ctk.CTkFrame):\n \"\"\"Class for the login screen.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTk,\n on_login: Callable[[str, str, AppAttributes], tuple[str, Optional[AppState]]],\n on_login_success: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n attributes: AppAttributes,\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.on_login = on_login\n self.on_login_success = on_login_success\n self.on_exit = on_exit\n self.attributes = attributes\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.attributes[\"padding\"] + 10,\n pady=self.attributes[\"padding\"] + 10,\n )\n\n self.login_label = ctk.CTkLabel(\n master=self, text=\"Login\", font=(self.attributes[\"font\"], 32, \"bold\")\n )\n self.login_label.grid(\n row=0,\n column=0,\n columnspan=2,\n pady=(self.attributes[\"padding\"], self.attributes[\"padding\"] + 10),\n )\n\n self.first_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"First name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.first_name_entry.grid(\n row=1,\n column=0,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.last_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"Last name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.last_name_entry.grid(\n row=1,\n column=1,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.login_button = ctk.CTkButton(\n master=self,\n text=\"Login\",\n command=self._login,\n font=(self.attributes[\"font\"], 16),\n )\n self.login_button.grid(\n row=2, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.exit_button = ctk.CTkButton(\n master=self,\n text=\"Exit\",\n command=self._exit_app,\n font=(self.attributes[\"font\"], 16),\n )\n self.exit_button.grid(\n row=3, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.error_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.attributes[\"font\"], 14), text_color=\"red\"\n )\n self.error_label.grid(\n row=4, column=0, columnspan=2, pady=(self.attributes[\"padding\"], 0)\n )\n\n def _login(self) -> None:\n \"\"\"Intermediate callback for the login button.\"\"\"\n return_str, state = self.on_login(\n self.first_name_entry.get(), self.last_name_entry.get(), self.attributes\n )\n if state is None:\n self.error_label.configure(text=return_str)\n return\n self.on_login_success(state)\n\n def _exit_app(self) -> NoReturn:\n \"\"\"Intermediate callback for the exit button.\"\"\"\n self.on_exit()\n "},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen.__init__","title":"__init__(master, on_login, on_login_success, on_exit, attributes, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/login_screen.py def __init__(\n self,\n master: ctk.CTk,\n on_login: Callable[[str, str, AppAttributes], tuple[str, Optional[AppState]]],\n on_login_success: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n attributes: AppAttributes,\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.on_login = on_login\n self.on_login_success = on_login_success\n self.on_exit = on_exit\n self.attributes = attributes\n\n self.grid_rowconfigure(0, weight=1)\n self.grid_columnconfigure(0, weight=1)\n self.grid(\n row=0,\n column=0,\n padx=self.attributes[\"padding\"] + 10,\n pady=self.attributes[\"padding\"] + 10,\n )\n\n self.login_label = ctk.CTkLabel(\n master=self, text=\"Login\", font=(self.attributes[\"font\"], 32, \"bold\")\n )\n self.login_label.grid(\n row=0,\n column=0,\n columnspan=2,\n pady=(self.attributes[\"padding\"], self.attributes[\"padding\"] + 10),\n )\n\n self.first_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"First name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.first_name_entry.grid(\n row=1,\n column=0,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.last_name_entry = ctk.CTkEntry(\n master=self,\n placeholder_text=\"Last name\",\n font=(self.attributes[\"font\"], 16),\n )\n self.last_name_entry.grid(\n row=1,\n column=1,\n padx=self.attributes[\"padding\"],\n pady=self.attributes[\"padding\"],\n )\n\n self.login_button = ctk.CTkButton(\n master=self,\n text=\"Login\",\n command=self._login,\n font=(self.attributes[\"font\"], 16),\n )\n self.login_button.grid(\n row=2, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.exit_button = ctk.CTkButton(\n master=self,\n text=\"Exit\",\n command=self._exit_app,\n font=(self.attributes[\"font\"], 16),\n )\n self.exit_button.grid(\n row=3, column=0, rowspan=2, columnspan=3, pady=self.attributes[\"padding\"]\n )\n\n self.error_label = ctk.CTkLabel(\n master=self, text=\"\", font=(self.attributes[\"font\"], 14), text_color=\"red\"\n )\n self.error_label.grid(\n row=4, column=0, columnspan=2, pady=(self.attributes[\"padding\"], 0)\n )\n "},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen._login","title":"_login() ","text":"Intermediate callback for the login button. Source code in evaluator/frontend/components/login_screen.py def _login(self) -> None:\n \"\"\"Intermediate callback for the login button.\"\"\"\n return_str, state = self.on_login(\n self.first_name_entry.get(), self.last_name_entry.get(), self.attributes\n )\n if state is None:\n self.error_label.configure(text=return_str)\n return\n self.on_login_success(state)\n "},{"location":"login-screen/#evaluator.frontend.components.login_screen.LoginScreen._exit_app","title":"_exit_app() ","text":"Intermediate callback for the exit button. Source code in evaluator/frontend/components/login_screen.py def _exit_app(self) -> NoReturn:\n \"\"\"Intermediate callback for the exit button.\"\"\"\n self.on_exit()\n "},{"location":"login/","title":"Login","text":"Handles the backend for the login process. "},{"location":"login/#evaluator.backend.login.login","title":"login(first_name, last_name, attributes) ","text":"Login entry point. Parameters: Name Type Description Default first_name str First name entered by the user. required last_name str Last name entered by the user. required attributes AppAttributes The current app attributes. 
required Returns: Type Description (str, AppState | None) A string containing the user hash on success or an error message on error and the current app state on success or None on error. Source code in evaluator/backend/login.py def login(\n first_name: str, last_name: str, attributes: AppAttributes\n) -> tuple[str, Optional[AppState]]:\n \"\"\"Login entry point.\n\n Parameters\n ----------\n first_name : str\n First name entered by the user.\n last_name : str\n Last name entered by the user.\n attributes : AppAttributes\n The current app attributes.\n\n Returns\n -------\n (str, AppState | None)\n A string containing the user hash on success or an\n error message on error and the current app state\n on success or None on error.\n \"\"\"\n if not first_name and not last_name:\n return \"Error: First and last name are required.\", None\n elif not first_name:\n return \"Error: First name is required.\", None\n elif not last_name:\n return \"Error: Last name is required.\", None\n\n first_name = first_name.strip().lower()\n last_name = last_name.strip().lower()\n\n user_hash = _generate_user_hash(first_name, last_name)\n\n if _check_user_existence(user_hash, attributes):\n attributes[\"logger\"].info(f\"Found existing user for {last_name}, {first_name}\")\n new_user = False\n else:\n new_user = True\n\n app_state = create_app_state(\n attributes=attributes, user_hash=user_hash, new_user=new_user\n )\n if new_user:\n app_state = create_new_user(\n app_state=app_state, first_name=first_name, last_name=last_name\n )\n\n log_state(app_state, \"app\")\n\n return user_hash, app_state\n "},{"location":"login/#evaluator.backend.login._check_user_existence","title":"_check_user_existence(user_hash, attributes) ","text":"Checks if the user already exists or not. Parameters: Name Type Description Default user_hash str The user's MD5 hash. required attributes AppAttributes The current app attributes. required Returns: Type Description bool True if the user exists, False otherwise. Source code in evaluator/backend/login.py def _check_user_existence(user_hash: str, attributes: AppAttributes) -> bool:\n \"\"\"Checks if the user already exists or not.\n\n Parameters\n ----------\n user_hash : str\n The user's MD5 hash.\n attributes : AppAttributes\n The current app attributes.\n\n Returns\n -------\n bool\n True if the user exists, False otherwise.\n \"\"\"\n if user_hash in attributes[\"users_data\"]:\n return True\n else:\n return False\n "},{"location":"login/#evaluator.backend.login._generate_user_hash","title":"_generate_user_hash(first_name, last_name) ","text":"Generates the user's MD5 hash. Parameters: Name Type Description Default first_name str The user's first name. required last_name str The user's last name. required Returns: Type Description str The user hash. Source code in evaluator/backend/login.py def _generate_user_hash(first_name: str, last_name: str) -> str:\n \"\"\"Generates the user's MD5 hash.\n\n Parameters\n ----------\n first_name : str\n The user's first name.\n last_name : str\n The user's last name.\n\n Returns\n -------\n str\n The user hash.\n \"\"\"\n name_list = [first_name, last_name]\n name_list.sort()\n name_str = \"_\".join(name_list)\n hash_hex = md5(name_str.encode(\"utf-8\")).hexdigest()\n return hash_hex\n "},{"location":"misc_functions/","title":"Utils","text":"Miscellaneous util functions. "},{"location":"misc_functions/#bcorag.misc_functions.graceful_exit","title":"graceful_exit(exit_code=0, error_msg=None) ","text":"Gracefully exits the program with an exit code. Parameters: Name Type Description Default exit_code int The exit code. 
0 error_msg str | None The error message to print before exiting. None Source code in bcorag/misc_functions.py def graceful_exit(exit_code: int = 0, error_msg: Optional[str] = None) -> NoReturn:\n \"\"\"Gracefully exits the program with an exit code.\n\n Parameters\n ----------\n exit_code : int, optional\n The exit code.\n error_msg : str | None, optional\n The error message to print before exiting.\n \"\"\"\n if exit_code != 0:\n if error_msg is not None:\n print(f\"{error_msg}\")\n print(f\"exit code: {exit_code}\")\n print(\"Exiting...\")\n logging.info(f\"Exiting with status code {exit_code}.\")\n logging.info(\n \"---------------------------------- RUN END ----------------------------------\"\n )\n sys.exit(exit_code)\n "},{"location":"misc_functions/#bcorag.misc_functions.load_json","title":"load_json(filepath) ","text":"Loads a JSON file and returns the deserialized data (or an empty dict if the file doesn't exist). Parameters: Name Type Description Default filepath str File path to the JSON file to load. required Returns: Type Description dict | None The deserialized JSON data or None if the file doesn't exist. Source code in bcorag/misc_functions.py def load_json(filepath: str) -> Optional[dict]:\n \"\"\"Loads a JSON file and returns the deserialized data (or\n an empty dict if the file doesn't exist).\n\n Parameters\n ----------\n filepath : str\n File path to the JSON file to load.\n\n Returns\n -------\n dict | None\n The deserialized JSON data or None if the file doesn't exist.\n \"\"\"\n if not os.path.isfile(filepath):\n return None\n with open(filepath, \"r\") as f:\n data = json.load(f)\n return data\n "},{"location":"misc_functions/#bcorag.misc_functions.load_config_data","title":"load_config_data(filepath='./conf.json') ","text":"Loads the config JSON object file. Parameters: Name Type Description Default filepath str File path to the config JSON file. './conf.json' Returns: Type Description ConfigObject | None Casted ConfigObject or None on some type of error. Source code in bcorag/misc_functions.py def load_config_data(filepath: str = \"./conf.json\") -> Optional[ConfigObject]:\n \"\"\"Loads the config JSON object file.\n\n Parameters\n ----------\n filepath : str, optional\n File path to the config JSON file.\n\n Returns\n -------\n ConfigObject | None\n Casted ConfigObject or None on some type of error.\n \"\"\"\n naive_load_data = load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n config_object = cast(ConfigObject, naive_load_data)\n return config_object\n return None\n "},{"location":"misc_functions/#bcorag.misc_functions.load_output_tracker","title":"load_output_tracker(filepath) ","text":"Loads the JSON output tracker file. Parameters: Name Type Description Default filepath str File path to the JSON file to load. required Returns: Type Description OutputTrackerFile | None Casted OutputTrackerFile or None on some type of error. 
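As a quick usage sketch (not part of the module itself), these loaders are typically combined with graceful_exit so that a missing or malformed config file stops the run cleanly; the ./bcorag/conf.json path below is the one the option picker uses, and the key access at the end assumes the ConfigObject layout shown elsewhere in these docs.

```python
# Minimal sketch: load the tool configuration and exit gracefully if it
# cannot be read or parsed.
from bcorag import misc_functions as misc_fns

config = misc_fns.load_config_data("./bcorag/conf.json")
if config is None:
    misc_fns.graceful_exit(1, "Error: could not load ./bcorag/conf.json")
print(config["paper_directory"])  # keys come from the ConfigObject TypedDict
```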
Source code in bcorag/misc_functions.py def load_output_tracker(filepath: str) -> Optional[OutputTrackerFile]:\n \"\"\"Loads the JSON output tracker file.\n\n Parameters\n ----------\n filepath : str\n File path to the JSON file to load.\n\n Returns\n -------\n OutputTrackerFile | None\n Casted OutputTrackerFile or None on some type of error.\n \"\"\"\n naive_load_data = load_json(filepath)\n if naive_load_data is None:\n return None\n if isinstance(naive_load_data, dict):\n output_tracker_data = cast(OutputTrackerFile, naive_load_data)\n return output_tracker_data\n return None\n "},{"location":"misc_functions/#bcorag.misc_functions.write_json","title":"write_json(output_path, data) ","text":"Writes JSON out to the output path. Will create the file if it doesn't exist. Parameters: Name Type Description Default output_path str The output file path. required data dict | list | OutputTrackerFile The data to dump. required Returns: Type Description bool Whether the process was successful. Source code in bcorag/misc_functions.py def write_json(output_path: str, data: dict | list | OutputTrackerFile) -> bool:\n \"\"\"Writes JSON out to the output path. Will create the file if it doesn't exist.\n\n Parameters\n ----------\n output_path : str\n The output file path.\n data : dict | list | OutputTrackerFile\n The data to dump.\n\n Returns\n -------\n bool\n Whether the process was successful.\n \"\"\"\n try:\n with open(output_path, \"w\") as f:\n json.dump(data, f, indent=4)\n return True\n except Exception as e:\n logging.error(f\"Failed to dump JSON to output path '{output_path}'.\\n{e}\")\n return False\n "},{"location":"misc_functions/#bcorag.misc_functions.dump_output_file_map_tsv","title":"dump_output_file_map_tsv(output_path, data) ","text":"Dumps the OutputTrackerFile object into a TSV table for better human readability. Parameters: Name Type Description Default output_path str The output file path. required data OutputTrackerFile The OutputTrackerFile object to format for a TSV file. 
required Source code in bcorag/misc_functions.py def dump_output_file_map_tsv(output_path: str, data: OutputTrackerFile):\n \"\"\"Dumps the OutputTrackerFile object into a TSV table for better\n human readability.\n\n Parameters\n ----------\n output_path : str\n The output file path.\n data: OutputTrackerFile\n The OutputTrackerFile object to format for a TSV file.\n \"\"\"\n with open(output_path, mode=\"w\", newline=\"\") as out_file:\n tsv_writer = csv.writer(out_file, delimiter=\"\\t\")\n tsv_writer.writerow(\n [\n \"timestamp\",\n \"domain\",\n \"txt_file\",\n \"json_file\",\n \"node_source_file\",\n \"hash_string\",\n \"index\",\n \"loader\",\n \"vector_store\",\n \"llm\",\n \"embedding_model\",\n \"similarity_top_k\",\n \"chunking_config\",\n \"git_user\",\n \"git_repo\",\n \"git_branch\",\n \"directory_filter\",\n \"file_ext_filter\",\n \"elapsed_time\",\n \"version\",\n ]\n )\n domain: DomainKey\n for domain in get_args(DomainKey):\n domain_entry_list = data[domain]\n for entry_set in domain_entry_list:\n for entry in entry_set[\"entries\"][\"runs\"]:\n row = [\n entry[\"timestamp\"],\n domain,\n os.path.basename(entry[\"txt_file\"]),\n os.path.basename(entry[\"json_file\"]),\n os.path.basename(entry[\"source_node_file\"]),\n entry_set[\"hash_str\"],\n entry[\"index\"],\n entry_set[\"entries\"][\"params\"][\"loader\"],\n entry_set[\"entries\"][\"params\"][\"vector_store\"],\n entry_set[\"entries\"][\"params\"][\"llm\"],\n entry_set[\"entries\"][\"params\"][\"embedding_model\"],\n entry_set[\"entries\"][\"params\"][\"similarity_top_k\"],\n entry_set[\"entries\"][\"params\"][\"chunking_config\"],\n entry_set[\"entries\"][\"params\"][\"git_user\"],\n entry_set[\"entries\"][\"params\"][\"git_repo\"],\n entry_set[\"entries\"][\"params\"][\"git_branch\"],\n entry_set[\"entries\"][\"params\"][\"directory_git_filter\"],\n entry_set[\"entries\"][\"params\"][\"file_ext_git_filter\"],\n entry[\"elapsed_time\"],\n entry[\"version\"],\n ]\n tsv_writer.writerow(row)\n "},{"location":"misc_functions/#bcorag.misc_functions.dump_string","title":"dump_string(output_path, data) ","text":"Dumps a string to a text file. Parameters: Name Type Description Default output_path str The output file path. required data str The string to dump. required Source code in bcorag/misc_functions.py def dump_string(output_path: str, data: str):\n \"\"\"Dumps a string to a text file.\n\n Parameters\n ----------\n output_path : str\n The output file path.\n data: str\n The string to dump.\n \"\"\"\n check_dir(os.path.split(output_path)[0])\n with open(output_path, \"w\") as f:\n f.write(data)\n "},{"location":"misc_functions/#bcorag.misc_functions.check_dir","title":"check_dir(path) ","text":"Checks whether a directory exists and if it doesn't, creates it. Note, this really only works for checking/creating the last level directory. Will fail if there are issues in the parent level directories in the path. Parameters: Name Type Description Default path str Directory filepath to check. required Source code in bcorag/misc_functions.py def check_dir(path: str):\n \"\"\"Checks whether a directory exists and if it doesn't, creates it. Note, this\n really only works for checking/creating the last level directory. 
Will fail if\n there are issues in the parent level directories in the path.\n\n Parameters\n ----------\n path : str\n Directory filepath to check.\n \"\"\"\n if not os.path.isdir(path):\n os.mkdir(path)\n "},{"location":"misc_functions/#bcorag.misc_functions.setup_root_logger","title":"setup_root_logger(log_path, name='bcorag') ","text":"Configures the root logger. Parameters: Name Type Description Default log_path str The filepath to the log handler. required name str The name of the root logger. 'bcorag' Returns: Type Description Logger The root logger. Source code in bcorag/misc_functions.py def setup_root_logger(log_path: str, name: str = \"bcorag\") -> logging.Logger:\n \"\"\"Configures the root logger.\n\n Parameters\n ----------\n log_path : str\n The filepath to the log handler.\n name : str, optional\n The name of the root logger.\n\n Returns\n -------\n logging.Logger\n The root logger.\n \"\"\"\n logger = logging.getLogger(name)\n logger.setLevel(logging.DEBUG)\n handler = logging.FileHandler(filename=log_path, encoding=\"utf-8\", mode=\"w\")\n formatter = logging.Formatter(\n \"%(asctime)s - %(levelname)s - %(name)s - %(message)s\"\n )\n handler.setFormatter(formatter)\n logger.addHandler(handler)\n return logger\n "},{"location":"misc_functions/#bcorag.misc_functions.setup_document_logger","title":"setup_document_logger(name, parent_logger='bcorag') ","text":"Configures a document specific logger. Parameters: Name Type Description Default name str The name of the document to setup the logger for. required parent_logger str Name of the parent logger to setup under. 'bcorag' Returns: Type Description Logger The document logger. Source code in bcorag/misc_functions.py def setup_document_logger(name: str, parent_logger: str = \"bcorag\") -> logging.Logger:\n \"\"\"Configures a document specific logger.\n\n Parameters\n ----------\n name : str\n The name of the document to setup the logger for.\n parent_logger : str, optional\n Name of the parent logger to setup under.\n\n Returns\n -------\n logging.Logger\n The document logger.\n \"\"\"\n document_logger_name = f\"{parent_logger}.{name}\"\n return logging.getLogger(document_logger_name)\n "},{"location":"misc_functions/#bcorag.misc_functions.create_timestamp","title":"create_timestamp() ","text":"Creates a current timestamp. Returns: Type Description str The current timestamp as a string. Source code in bcorag/misc_functions.py def create_timestamp() -> str:\n \"\"\"Creates a current timestamp.\n\n Returns\n -------\n str\n The current timestamp as a string.\n \"\"\"\n timestamp = datetime.datetime.now(pytz.timezone(TIMEZONE)).strftime(\n TIMESTAMP_FORMAT\n )\n return timestamp\n "},{"location":"misc_functions/#bcorag.misc_functions.extract_repo_data","title":"extract_repo_data(url) ","text":"Extracts the repository information from the repo URL. Parameters: Name Type Description Default url str The Github repository URL. required Returns: Type Description (str, str) | None Returns the tuple containing the extracted github user and repo or None on failure to parse the URL. 
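For illustration (the URL below is a made-up example, not from the project), the helper documented above can be used like this:

```python
# Hypothetical usage of extract_repo_data; the URL is only an example.
from bcorag.misc_functions import extract_repo_data

parsed = extract_repo_data("https://github.com/example-user/example-repo")
if parsed is None:
    print("Could not parse the repository URL.")
else:
    user, repo = parsed
    print(user, repo)  # -> example-user example-repo
```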
Source code in bcorag/misc_functions.py def extract_repo_data(url: str) -> Optional[tuple[str, str]]:\n \"\"\"Extracts the repository information from the repo URL.\n\n Parameters\n ----------\n url : str\n The Github repository URL.\n\n Returns\n -------\n (str, str) | None\n Returns the tuple containing the extracted github user\n and repo or None on failure to parse the URL.\n \"\"\"\n url = url.strip().lower()\n pattern = r\"https://github\\.com/([^/]+)/([^/]+)\"\n match = re.match(pattern, url)\n if match is None:\n return None\n user = str(match.groups()[0])\n repo = str(match.groups()[1])\n return user, repo\n "},{"location":"misc_functions/#bcorag.misc_functions.get_file_list","title":"get_file_list(path, filetype='pdf') ","text":"Gets the files from a glob pattern. Parameters: Name Type Description Default path str The file path to the target directory. required filetype str The file type to capture. 'pdf' Returns: Type Description list[str] List of the file paths found from the glob pattern. Source code in bcorag/misc_functions.py def get_file_list(path: str, filetype: str = \"pdf\") -> list[str]:\n \"\"\"Gets the files from a glob pattern.\n\n Parameters\n ----------\n path : str\n The file path to the target directory.\n filetype : str, optional\n The file type to capture.\n\n Returns\n -------\n list[str]\n List of the file paths found from the glob pattern.\n \"\"\"\n target_files = glob.glob(os.path.join(path, f\"*.{filetype}\"))\n return target_files\n "},{"location":"miscellaneous-frame/","title":"Miscellaneous Frame","text":""},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame","title":"MiscFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the miscellaneous evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py class MiscFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the miscellaneous evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=20)\n self.grid_rowconfigure(8, weight=1)\n\n self.main_misc_label = ctk.CTkLabel(\n master=self,\n text=\"Miscellaneous Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_misc_label.grid(\n row=0,\n column=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"n\",\n )\n\n self.human_domain_rating_label = ctk.CTkLabel(\n master=self,\n text=\"What would you rate the human curated domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.human_domain_rating_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.human_domain_rating_var\n )\n self.human_domain_rating_button.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence in your evaluation?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_conf_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_conf_button.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_fam_label = ctk.CTkLabel(\n master=self,\n text=\"What is your familiarity with the paper content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_fam_label.grid(\n row=6,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_fam_button.grid(\n row=7,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.misc_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.misc_notes_label.grid(\n row=2,\n column=1,\n padx=(0, 
self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.misc_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.misc_notes.grid(\n row=3,\n rowspan=6,\n column=1,\n padx=(0, self.state[\"padding\"] // 2),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button.configure(variable=self.human_domain_rating_var)\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button.configure(variable=self.evaluator_conf_var)\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button.configure(variable=self.evaluator_fam_var)\n\n self.misc_notes.delete(0.0, \"end\")\n self.misc_notes.insert(0.0, self.misc_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"]))\n\n def get_results(self) -> MiscEval:\n \"\"\"Returns the miscellaneous evaluations.\n\n Returns\n -------\n MiscEval\n The miscellaneous evaluation results.\n \"\"\"\n human_domain_rating = self.human_domain_rating_var.get()\n evaluator_conf_rating = self.evaluator_conf_var.get()\n evaluator_familiarity_level = self.evaluator_fam_var.get()\n misc_eval = create_misc_eval(\n human_domain_rating=human_domain_rating,\n evaluator_confidence_rating=evaluator_conf_rating,\n evaluator_familiarity_level=evaluator_familiarity_level,\n notes=self.misc_notes.get(0.0, \"end\"),\n )\n return misc_eval\n "},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=20)\n self.grid_rowconfigure(8, weight=1)\n\n self.main_misc_label = ctk.CTkLabel(\n master=self,\n text=\"Miscellaneous Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_misc_label.grid(\n row=0,\n column=0,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"n\",\n )\n\n self.human_domain_rating_label = ctk.CTkLabel(\n master=self,\n text=\"What would you rate the human curated domain?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.human_domain_rating_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.human_domain_rating_var\n )\n self.human_domain_rating_button.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_conf_label = ctk.CTkLabel(\n master=self,\n text=\"What is your confidence in your evaluation?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_conf_label.grid(\n row=4,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_conf_button.grid(\n row=5,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.evaluator_fam_label = ctk.CTkLabel(\n master=self,\n text=\"What is your familiarity with the paper content?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.evaluator_fam_label.grid(\n row=6,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.evaluator_conf_var\n )\n self.evaluator_fam_button.grid(\n row=7,\n column=0,\n padx=(self.state[\"padding\"], 0),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"] // 2),\n sticky=\"w\",\n )\n\n self.misc_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.misc_notes_label.grid(\n row=2,\n column=1,\n padx=(0, self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n 
self.misc_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.misc_notes.grid(\n row=3,\n rowspan=6,\n column=1,\n padx=(0, self.state[\"padding\"] // 2),\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.misc_eval = self.run[\"eval_data\"][\"misc_eval\"]\n\n self.human_domain_rating_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"human_domain_rating\", EVAL_DEFAULTS[\"human_domain_rating\"]\n )\n )\n self.human_domain_rating_button.configure(variable=self.human_domain_rating_var)\n\n self.evaluator_conf_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_confidence_rating\",\n EVAL_DEFAULTS[\"evaluator_confidence_rating\"],\n )\n )\n self.evaluator_conf_button.configure(variable=self.evaluator_conf_var)\n\n self.evaluator_fam_var = ctk.IntVar(\n value=self.misc_eval.get(\n \"evaluator_familiarity_level\",\n EVAL_DEFAULTS[\"evaluator_familiarity_level\"],\n )\n )\n self.evaluator_fam_button.configure(variable=self.evaluator_fam_var)\n\n self.misc_notes.delete(0.0, \"end\")\n self.misc_notes.insert(0.0, self.misc_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"]))\n "},{"location":"miscellaneous-frame/#evaluator.frontend.components.evaluation_frames.miscellaneous_frame.MiscFrame.get_results","title":"get_results() ","text":"Returns the miscellaneous evaluations. Returns: Type Description MiscEval The miscellaneous evaluation results. Source code in evaluator/frontend/components/evaluation_frames/miscellaneous_frame.py def get_results(self) -> MiscEval:\n \"\"\"Returns the miscellaneous evaluations.\n\n Returns\n -------\n MiscEval\n The miscellaneous evaluation results.\n \"\"\"\n human_domain_rating = self.human_domain_rating_var.get()\n evaluator_conf_rating = self.evaluator_conf_var.get()\n evaluator_familiarity_level = self.evaluator_fam_var.get()\n misc_eval = create_misc_eval(\n human_domain_rating=human_domain_rating,\n evaluator_confidence_rating=evaluator_conf_rating,\n evaluator_familiarity_level=evaluator_familiarity_level,\n notes=self.misc_notes.get(0.0, \"end\"),\n )\n return misc_eval\n "},{"location":"miscellaneous/","title":"Utils","text":""},{"location":"miscellaneous/#evaluator.backend.miscellaneous.exit_app","title":"exit_app() ","text":"Gracefully exits the app. Source code in evaluator/backend/miscellaneous.py def exit_app() -> NoReturn:\n \"\"\"Gracefully exits the app.\"\"\"\n misc_functions.graceful_exit(0)\n "},{"location":"miscellaneous/#evaluator.backend.miscellaneous.log_state","title":"log_state(state, state_type) ","text":"Logs the app state. Parameters: Name Type Description Default state AppState or RunState The state to log. required state_type app or run The type of state being logged. 
required Source code in evaluator/backend/miscellaneous.py def log_state(state: AppState | RunState, state_type: Literal[\"app\", \"run\"]) -> None:\n \"\"\"Logs the app state.\n\n Parameters\n ----------\n state : AppState or RunState\n The state to log.\n state_type : \"app\" or \"run\"\n The type of state being logged.\n \"\"\"\n app_state_flag = True if state_type.lower().strip() == \"app\" else False\n log_str = \"App state:\\n\" if app_state_flag else \"Run state:\\n\"\n\n if app_state_flag:\n app_state = cast(AppState, state)\n app_key: AppStateKey\n for app_key in get_args(AppStateKey):\n if app_key == \"logger\":\n continue\n log_str += f\"\\t{app_key}: {app_state[app_key]}\\n\"\n else:\n run_state = cast(RunState, state)\n run_key: RunStateKey\n for run_key in get_args(RunStateKey):\n if run_key in {\n \"generated_domain\",\n \"human_curated_domain\",\n \"reference_nodes\",\n \"param_set\",\n \"logger\",\n \"eval_data\"\n }:\n continue\n log_str += f\"\\t{run_key}: {run_state[run_key]}\\n\"\n\n state[\"logger\"].info(log_str)\n "},{"location":"option-picker/","title":"Option Picker","text":"Simple CLI interface for choosing one of the pre-selected baseline testing papers. Will automatically grab any PDF file in the ../../papers/ directory. "},{"location":"option-picker/#bcorag.option_picker.initialize_picker","title":"initialize_picker(filetype='pdf') ","text":"Kicks off the initial pipeline step where the user picks their PDF file to index and chooses the data loader from a pre-set list. Parameters: Name Type Description Default filetype str The filetype to filter on, this project was built to handle PDF files so it is highly unlikely you will want to override this default. 'pdf' Returns: Type Description UserSelections | None The user selections or None if the user chose to exit or an error occurred. Source code in bcorag/option_picker.py def initialize_picker(filetype: str = \"pdf\") -> Optional[UserSelections]:\n \"\"\"Kicks off the initial pipeline step where the user picks their\n PDF file to index and chooses the data loader from a pre-set list.\n\n Parameters\n ----------\n filetype : str, optional\n The filetype to filter on, this project was built to handle PDF\n files so it is highly unlikely you will want to override this default.\n\n Returns\n -------\n UserSelections | None\n The user selections or None if the user chose to exit or an error occurred.\n \"\"\"\n\n presets = misc_fns.load_config_data(\"./bcorag/conf.json\")\n if presets is None or isinstance(presets, list):\n print(f\"Error reading config file. 
Got type `{type(presets)}` for `presets`.\")\n misc_fns.graceful_exit()\n\n # set base keys\n return_data: UserSelections = { # type: ignore\n f\"{option}\": None for option in presets[\"options\"].keys()\n }\n\n target_file_information = _file_picker(presets[\"paper_directory\"], filetype)\n if target_file_information is None:\n return None\n return_data[\"filename\"] = target_file_information[0]\n return_data[\"filepath\"] = target_file_information[1]\n\n option: OptionKey\n for option in get_args(OptionKey):\n target_option = _create_picker(\n option,\n presets[\"options\"][option][\"documentation\"],\n presets[\"options\"][option][\"list\"],\n presets[\"options\"][option].get(\"default\", None),\n )\n if target_option is None:\n return None\n return_data[option] = int(target_option) if option in {\"similarity_top_k\"} else target_option # type: ignore\n\n repo_data = _repo_picker()\n if repo_data == 0:\n return None\n if repo_data is None:\n return_data[\"git_data\"] = None\n else:\n return_data[\"git_data\"] = repo_data\n\n in_progress_docs_path = _in_progress_docs()\n if in_progress_docs_path:\n return_data[\"other_docs\"] = [in_progress_docs_path]\n\n return return_data\n "},{"location":"option-picker/#bcorag.option_picker._file_picker","title":"_file_picker(path, filetype='pdf') ","text":"Create the CLI menu to pick the PDF file from the papers directory. Parameters: Name Type Description Default path str The path to the directory to display the CLI menu for. required filetype str The filetype to filter on, this project was build to handle PDF files so it is highly unlikely you will want to override this default. 'pdf' Returns: Type Description (str, str) | None Returns the name and path of the selected file or None if the user selects exit. Source code in bcorag/option_picker.py def _file_picker(path: str, filetype: str = \"pdf\") -> Optional[Tuple[str, str]]:\n \"\"\"Create the CLI menu to pick the PDF file from the papers directory.\n\n Parameters\n ----------\n path : str\n The path to the directory to display the CLI menu for.\n filetype : str, optional\n The filetype to filter on, this project was build to handle PDF\n files so it is highly unlikely you will want to override this default.\n\n Returns\n -------\n (str, str) | None\n Returns the name and path of the selected file or None if the user selects exit.\n \"\"\"\n target_files = misc_fns.get_file_list(path, filetype)\n pick_options = [os.path.basename(filename) for filename in target_files]\n pick_options.append(EXIT_OPTION)\n pick_title = \"Please choose the PDF file to index:\"\n option, _ = pick(pick_options, pick_title, indicator=\"->\")\n option = str(option)\n if option == EXIT_OPTION:\n return None\n return str(option), f\"{path}{option}\"\n "},{"location":"option-picker/#bcorag.option_picker._repo_picker","title":"_repo_picker() ","text":"Allows the user to input a github repository link to be included in the indexing. Returns: Type Description GitData | None | 0 Returns parsed repo information from the link, None if the user skips this step, or 0 (exit status) if the user chooses to exit. 
Source code in bcorag/option_picker.py def _repo_picker() -> Optional[GitData] | Literal[0]:\n \"\"\"Allows the user to input a github repository link to be included in the indexing.\n\n Returns\n -------\n GitData | None | 0\n Returns parsed repo information from the link, None if the user skips this step,\n or 0 (exit status) if the user chooses to exit.\n \"\"\"\n\n while True:\n\n url_prompt = 'If you would like to include a Github repository enter the URL below. Enter \"x\" to exit or leave blank to skip.\\n> '\n url = input(url_prompt)\n if not url or url is None:\n print(\"Skipping Github repo...\")\n return None\n elif url == \"x\":\n return 0\n\n match = misc_fns.extract_repo_data(url)\n if match is None:\n print(\"Error parsing repository URL.\")\n continue\n user = match[0]\n repo = match[1]\n\n branch = input(\"Repo branch to index (case sensitive):\\n> \")\n if not branch:\n branch = \"main\"\n\n git_filters: list[GitFilters] = []\n\n directory_filter_prompt = \"Would you like to include a directory filter?\"\n directory_filter_prompt += \"\\nEnter a list of comma-delimited directories to either conditionally exclude or inclusively include. \"\n directory_filter_prompt += \"Or leave blank to skip.\\n> \"\n directory_filter_val = input(directory_filter_prompt)\n if directory_filter_val and directory_filter_val is not None:\n directories = [\n dir.strip() for dir in directory_filter_val.split(\",\") if dir.strip()\n ]\n directory_filter_condition_prompt = (\n 'Enter \"include\" or \"exclude\" for the directory filter.\\n> '\n )\n directory_filter_condition_val = input(directory_filter_condition_prompt)\n directory_filter_type = (\n GithubRepositoryReader.FilterType.INCLUDE\n if directory_filter_condition_val.lower().strip() == \"include\"\n else GithubRepositoryReader.FilterType.EXCLUDE\n )\n directory_filter = create_git_filters(directory_filter_type, GitFilter.DIRECTORY, value=directories)\n git_filters.append(directory_filter)\n\n file_ext_filter_prompt = \"Would you like to include a file extension filter?\"\n file_ext_filter_prompt += \"\\nEnter a list of comma-delimited file extensions to either conditionally exclude or inclusively include. \"\n file_ext_filter_prompt += \"Or leave blank to skip.\\n> \"\n file_ext_filter_val = input(file_ext_filter_prompt)\n if file_ext_filter_val and file_ext_filter_val is not None:\n file_exts = [\n ext.strip() for ext in file_ext_filter_val.split(\",\") if ext.strip()\n ]\n file_ext_filter_condition_prompt = (\n 'Enter \"include\" or \"exclude\" for the file extension filter.\\n> '\n )\n file_ext_filter_condition_val = input(file_ext_filter_condition_prompt)\n file_ext_filter_type = (\n GithubRepositoryReader.FilterType.INCLUDE\n if file_ext_filter_condition_val.lower().strip() == \"include\"\n else GithubRepositoryReader.FilterType.EXCLUDE\n )\n file_ext_filter = create_git_filters(file_ext_filter_type, GitFilter.FILE_EXTENSION, value=file_exts)\n git_filters.append(file_ext_filter)\n\n return_data = create_git_data(user, repo, branch, git_filters)\n return return_data\n "},{"location":"option-picker/#bcorag.option_picker._create_picker","title":"_create_picker(title_keyword, documentation, option_list, default=None) ","text":"Creates a general picker CLI based on a list of options and the functionality to optionally mark one option as the default. Parameters: Name Type Description Default title_keyword str The keyword to use for the picker title. required documentation str Link to the documentation for the option. 
required option_list list[str] The list of options to display in the picker menu. required default str | None The option to mark one option as the default. None Returns: Type Description str | None The chosen option of None if the user selected to exit. Source code in bcorag/option_picker.py def _create_picker(\n title_keyword: str,\n documentation: str,\n option_list: list[str],\n default: Optional[str] = None,\n) -> Optional[str]:\n \"\"\"Creates a general picker CLI based on a list of options and the\n functionality to optionally mark one option as the default.\n\n Parameters\n ----------\n title_keyword : str\n The keyword to use for the picker title.\n documentation : str\n Link to the documentation for the option.\n option_list : list[str]\n The list of options to display in the picker menu.\n default : str | None, optional\n The option to mark one option as the default.\n\n Returns\n -------\n str | None\n The chosen option of None if the user selected to exit.\n \"\"\"\n pick_title = f\"Please choose one of the following {title_keyword.replace('_', ' ').title()}s.\\nDocumentation can be found at:\\n{documentation}.\"\n pick_options = [\n f\"{option} (default)\" if option == default else option for option in option_list\n ]\n pick_options.append(EXIT_OPTION)\n option, _ = pick(pick_options, pick_title, indicator=\"->\")\n option = str(option)\n if option == EXIT_OPTION:\n return None\n if \" (default)\" in option:\n option = option.replace(\" (default)\", \"\")\n return option\n "},{"location":"option-picker/#bcorag.option_picker._in_progress_docs","title":"_in_progress_docs() ","text":"Checks if in progress documentation is found. Returns: Type Description str or None The file path to the in progress documentation to include or None if the user chose not to include or no documentation was found. Source code in bcorag/option_picker.py def _in_progress_docs() -> Optional[str]:\n \"\"\"Checks if in progress documentation is found.\n\n Returns\n -------\n str or None\n The file path to the in progress documentation to include or None\n if the user chose not to include or no documentation was found.\n \"\"\"\n in_progress_docs_path = os.path.join(os.getcwd(), \"aggregator\", \"summary.md\")\n if os.path.isfile(in_progress_docs_path):\n prompt = \"Found summary.md, include it in the vector store? (y/n)\\n> \"\n answer = input(prompt)\n answer = answer.strip().lower()\n if answer == \"y\":\n return in_progress_docs_path\n return None\n "},{"location":"options/","title":"Usage","text":" - Preliminary Steps
- Startup
- Generate Domains
- Options
- Data Loader
- Chunking Strategy
- Embedding Model
- Vector Store
- Similarity Top K
- LLM Model
- Mode
- Github Repository
"},{"location":"options/#preliminary-steps","title":"Preliminary Steps","text":"Make sure the setup steps in the Installation and Setup documentation are complete. "},{"location":"options/#startup","title":"Startup","text":"From within the rag/ directory, start the project like so: (env) python main.py\n On startup, you will be prompted to choose some configuration options. More details on the specifics of each option are documented in the Options section. "},{"location":"options/#generate-domains","title":"Generate Domains","text":"After your configurations selections are confirmed, you'll be asked which domain you would like to generate. You can enter either the one letter shortcode for each domain or the full domain name. A new output subdirectory will be created in the output/ directory named after the PDF file. Each domain will have at least one output file on each generation. The code will attempt to serialize the return response into a valid JSON object. Regardless if the JSON serialization succeeds, the raw return response will be dumped in a text file. More detailed information about the output behaviour and structure can be found in the output structure documentation. "},{"location":"options/#options","title":"Options","text":"The option picker interface can be navigated with the j or down arrow keys for the next option, k or up arrow key for the previous option, and the Enter key to choose the option. If you choose the Exit option at any step in the process the program will exit with a status code of 0 . "},{"location":"options/#data-loader","title":"Data Loader","text":"The data loader (or Reader) is one of the key abstraction concepts in the LlamaIndex library. Data loaders handle the data ingestion and formatting into Document objects, which will eventually be chunked into Node objects by the vector store. At a high level, Documents are a generic container for the data source. By default, Documents store the text (and/or images) from the data source, a dictionary of annotations containing the metadata, and a dictionary of relationships to other Documents and Nodes. A Node represents a \"chunk\" of a source Document. Aside from the built in generic data loaders, LLamaIndex hosts an open source hub for various community built data loaders for a variety of data sources. Different data loaders differ in how they create and structure the resulting Documents. Depending on the specialization of the data loader in relation to the structure of the raw data source, this can have a significant impact on the overall performance of the downstream pipeline steps. The currently supported data loaders are: SimpleDirectoryReader (default): This is a built-in data loader provided directly by the LlamaIndex library. It is the most generic option and is not specialized in any specific file type. PDFReader : This is an external data loader from LlamaHub that is specialized to PDF files. PDFMarker : The PDF marker converts the PDF file to clean markdown before ingesting. "},{"location":"options/#chunking-strategy","title":"Chunking Strategy","text":"The chunking strategy is the specific technique to split the Documents into Nodes. The chunking strategy chosen should influence downstream configuration choices, specifically the embedding model and similarity top k parameter selections. Recent research has shown that chunking optimization in RAG systems can have more of an impact on performance then most other parameter configurations, making it one of the most important configuration options. 
There are two general chunking strategies that this tool currently supports: fixed size chunking and semantic chunking. Fixed size chunking strategies involve pre-setting the chunk_size and chunk_overlap parameters. The chunk_size controls the granularity of the chunks (or Nodes) by setting the token limit per chunk. For example, a chunk size of 256 will create more granular chunks, and as a result, more Nodes. However, vital information might not be among the top retrieved chunks, especially if the similarity-top-k parameter is not scaled accordingly. Conversely, a chunk size of 2048 is more likely to encompass relevant information at the cost of increased noise and a loss of specificity. With fixed size chunking strategies, it is important to scale the similarity-top-k parameter appropriately and to choose an embedding model that both supports (and performs well on) the chosen chunk size. The semantic chunking supported by this tool involves using a semantic splitter to adaptively pick the breakpoint in-between sentences using embedding similarity. This ensures that a chunk contains sentences that are semantically related to each other. Note, semantic chunking introduces non-trivial overhead in terms of computational resources and API calls. Especially for very large documents, expect worse runtime performance. There is also a possibility that the semantic splitter creates chunks that are too large for your chosen embedding model. While this bug is not specifically addressed right now, it will probably have to be addressed with a custom second level safety net splitter eventually. The currently supported chunking strategies are: 256 chunk size/20 chunk overlap : Fixed chunking strategy with 256 tokens and a 20 token overlap between chunks. 512 chunk size/50 chunk overlap : Fixed chunking strategy with 512 tokens and a 50 token overlap between chunks. 1024 chunk size/20 chunk overlap (default): Fixed chunking strategy with 1024 tokens and a 20 token overlap between chunks. 2048 chunk size/50 chunk overlap : Fixed chunking strategy with 2048 tokens and a 50 token overlap between chunks. semantic : Semantic chunking based on adaptive chunk splitting. Note: There are known bugs with the semantic chunker, see here. "},{"location":"options/#embedding-model","title":"Embedding Model","text":"The embedding model is responsible for converting the text into a numerical representation, or embedding. The embedding model is used to transform both the query and the chunked nodes into embeddings which are then compared to find the most similar nodes relating to the query during the information retrieval process. Different embedding models can significantly impact the performance of the RAG pipeline. Additionally, different embedding models perform optimally on different chunk sizes, so the embedding model choice should ideally be harmonized with the chosen chunking strategy. The currently supported embedding models are: text-embedding-3-small (default): This is one of OpenAI's newest embedding models, designed for highly efficient embedding. text-embedding-3-large : This is the other new OpenAI embedding model, designed for maximum performance with support for embeddings up to 3,072 dimensions. text-embedding-ada-002 : This is an older OpenAI embedding model, generally not recommended outside testing purposes as it is less efficient and less powerful than both the text-embedding-3-small and text-embedding-3-large models. Currently, only OpenAI embedding models are supported. 
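As a rough illustration (not taken from this repository, and subject to llama-index version differences), selecting one of the embedding models listed above typically looks like this in LlamaIndex:

```python
# Illustrative sketch: wiring an OpenAI embedding model into LlamaIndex.
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

# text-embedding-3-small is the tool's default embedding model choice.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
```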
Further documentation on the embedding models can be found here and information on pricing can be found here. "},{"location":"options/#vector-store","title":"Vector Store","text":"The vector store handles the indexing, retrieval, and storage process. The indexing process is the method by which a vector store chunks, embeds, organizes, and stores the resulting embeddings of the chunked documents in the vector store. This process can vary depending on the specific implementation of the vector store chosen. The specific chunking strategy chosen can have a significant impact on the retrieval process and affects your embedding model choice (different embedding models perform optimally on different chunk sizes). The retrieval process first converts the query into a vector embedding and then performs a dense search operation to rank all the embeddings by how semantically similar they are to the query. Once the ranking is complete, the vector store returns, or retrieves, the most similar embeddings. The number of chosen retrievals to send to the LLM is controlled by the similarity_top_k parameter. Different vector stores also support different metadata filtering methods that allow for filtering the candidate set of documents based on certain metadata before performing the semantic search. Aside from the built-in generic vector stores, LlamaIndex hosts an open source hub for various other vector store options. The currently supported vector stores are: VectorStoreIndex (default): This is the default built-in vector store provided directly by the LlamaIndex library. While it does support metadata filtering, by default it does not perform any metadata filtering. "},{"location":"options/#similarity-top-k","title":"Similarity Top K","text":"The similarity_top_k parameter in the similarity search process refers to the number of nodes to return as a result of the semantic retrieval process. When the semantic search process is performed, the node embeddings are ranked by how semantically similar they are to the query embedding. After the ranking process is completed, the top k most similar embeddings are sent to the LLM along with the query. Larger values will result in more input tokens. Note: The similarity_top_k parameter here is unrelated to the top k parameter for large language models which limits the model's vocabulary sampling set when considering the next word to generate. "},{"location":"options/#llm-model","title":"LLM Model","text":"The currently supported LLM models are: gpt-3.5-turbo : This is the least powerful of the OpenAI offerings, but offers the fastest performance at a low cost. gpt-4-turbo (default): This is the default model and is OpenAI's newest offering. As of writing, this model currently points to the gpt-4-turbo-2024-04-09 model. gpt-4-turbo-preview : As of writing, this model currently points to the gpt-4-0125-preview model. Generally not recommended outside of testing purposes as the gpt-4-turbo offers better performance at the same cost. gpt-4 : This is the most powerful model, but also the most expensive. Currently, only OpenAI LLM models are supported. Further documentation on the specific LLM models can be found here and information on pricing can be found here. "},{"location":"options/#mode","title":"Mode","text":"The mode option has no effect on the RAG performance, but controls how much extra information is included in the run log. 
Choosing the debug mode will include extensive logging of everything that is happening during each run. Choosing the production mode will only include the necessary logging, such as the user options and return responses. "},{"location":"options/#github-repository","title":"Github Repository","text":"After choosing the configuration options, you also have the choice to provide a Github repository URL to include in the indexing process. The URL provided will automatically be parsed for the repository owner and repository name information. This will supplement the PDF data ingestion to provide more specific output for workflow specific steps in the description and parametric domains. If a github URL is entered, you'll be asked to confirm the branch of the repo to index (if none is entered, will default to main ). You will also have the choice to specify directory and file extension filters. For each filter, you will have the option to specify whether to conditionally exclude certain directories and file types or to inclusively include certain directories and file types. For directory filters, specify the directory paths to include in the filter. For file type filters, specify the file extensions; for example, .txt, .md with the include filter type will only include text and markdown files. Note, the filters are important for large repositories and will have a significant impact on the runtime performance of the indexing process and the quality of the retrieval step. Indexing repositories with large output, log, or data files can incur significant performance overhead and additionally can lower output quality by polluting the retrieval step with noise. "},{"location":"output-map-types/","title":"Output Map Types","text":"The output map custom types. "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerGitFilter","title":"OutputTrackerGitFilter ","text":" Bases: TypedDict Parsed git filter TypedDict used for output map formatting. Attributes: Name Type Description filter tuple[str, list[str]] Tuple representing the filter type (include or exclude) and the filter values. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerGitFilter(TypedDict):\n \"\"\"Parsed git filter TypedDict used for output map formatting.\n\n Attributes\n ----------\n filter : tuple[str, list[str]]\n Tuple representing the filter type (include or exclude) and the filter values.\n \"\"\"\n\n filter: tuple[str, list[str]]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerParamSet","title":"OutputTrackerParamSet ","text":" Bases: TypedDict Parameter set for a run. Attributes: Name Type Description loader str The data loader used for the run. vector_store str The vector store used for the run. llm str The LLM name used for the run. embedding_model str The embedding model used for the run. similarity_top_k int The similarity top k value used for the run. chunking_config str The chunking strategy used for the run. git_user Optional[str] The user who owns the github repository included in the document ingestion for the run (if applicable). git_repo Optional[str] The github repository included in the document ingestion for the run (if applicable). git_branch Optional[str] The github repository branch indexed during the document ingestion for the run (if applicable). directory_git_filter Optional[OutputTrackerGitFilter] The directory filter used for indexing the github repository (if applicable). 
file_ext_git_filter Optional[OutputTrackerGitFilter] The file extension filter used for indexing the github repository (if applicable). other_docs Optional[list[str]] The file path to any additional documentation included in the documents. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerParamSet(TypedDict):\n \"\"\"Parameter set for a run.\n\n Attributes\n ----------\n loader : str\n The data loader used for the run.\n vector_store : str\n The vector store used for the run.\n llm : str\n The LLM name used for the run.\n embedding_model : str\n The embedding model used for the run.\n similarity_top_k : int\n The similarity top k value used for the run.\n chunking_config : str\n The chunking strategy used for the run.\n git_user : Optional[str]\n The user who owns the github repository included in the document ingestion for the run (if applicable).\n git_repo : Optional[str]\n The github repository included in the document ingestion for the run (if applicable).\n git_branch : Optional[str]\n The github repository branch indexed during the document ingestion for the run (if applicable).\n directory_git_filter : Optional[OutputTrackerGitFilter]\n The directory filter used for indexing the github repository (if applicable).\n file_ext_git_filter : Optional[OutputTrackerGitFilter]\n The file extension filter used for indexing the github repository (if applicable).\n other_docs : Optional[list[str]]\n The file path to any additional documentation included in the documents.\n \"\"\"\n\n loader: str\n vector_store: str\n llm: str\n embedding_model: str\n similarity_top_k: int\n chunking_config: str\n git_user: Optional[str]\n git_repo: Optional[str]\n git_branch: Optional[str]\n directory_git_filter: Optional[OutputTrackerGitFilter]\n file_ext_git_filter: Optional[OutputTrackerGitFilter]\n other_docs: Optional[list[str]]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerRunsEntry","title":"OutputTrackerRunsEntry ","text":" Bases: TypedDict Specific file data under a parameter set. Attributes: Name Type Description index int The index for the run (the index represents the run number for that specific domain parameter set). timestamp str The timestamp for the run. txt_file str File path to the raw output dump text file. json_file str File path to the JSON output file. source_node_file str File path to the source node text file. elapsed_time float The elapsed time (in seconds) for how long the domain generation took. version str The version of the bcorag tool used. 
Source code in bcorag/custom_types/output_map_types.py class OutputTrackerRunsEntry(TypedDict):\n \"\"\"Specific file data under a parameter set.\n\n Attributes\n ----------\n index : int\n The index for the run (the index represents the run number for that specific domain parameter set).\n timestamp : str\n The timestamp for the run.\n txt_file : str\n File path to the raw output dump text file.\n json_file : str\n File path to the JSON output file.\n source_node_file : str\n File path to the source node text file.\n elapsed_time : float\n The elapsed time (in seconds) for how long the domain generation took.\n version : str\n The version of the bcorag tool used.\n \"\"\"\n\n index: int\n timestamp: str\n txt_file: str\n json_file: str\n source_node_file: str\n elapsed_time: float\n version: str\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerEntry","title":"OutputTrackerEntry ","text":" Bases: TypedDict Entry in the output map under a specific domain hash string. Attributes: Name Type Description curr_index int The most recent run index. params OutputTrackerParamSet The parameter set for the run. runs list[OutputTrackerRunsEntry] The list of runs for this parameter set. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerEntry(TypedDict):\n \"\"\"Entry in the output map under a specific domain hash string.\n\n Attributes\n ----------\n curr_index : int\n The most recent run index.\n params : OutputTrackerParamSet\n The parameter set for the run.\n runs : list[OutputTrackerRunsEntry]\n The list of runs for this parameter set.\n \"\"\"\n\n curr_index: int\n params: OutputTrackerParamSet\n runs: list[OutputTrackerRunsEntry]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerDomainEntry","title":"OutputTrackerDomainEntry ","text":" Bases: TypedDict Entry for a specific domain. Note: this isn't the most ideal way to do this. Ideally the hash string itself for the parameter set would be the key instead of forcing the OutputTrackerDomainField to be kept as a list of objects. However, there doesn't seem to be a good way to do this in a pythonic way while enforcing type safety with static type checkers. As they currently exist, TypedDict's require all keys are specified at the time of creating the definition. I would rather not specify regular dictionaries with extensive and verbose type annotations and I expect these map output files are likely to be small enough that serious linear runtime complexity won't cause issues. Attributes: Name Type Description hash_str str The hash of the parameter set used for run collision identification. entries OutputTrackerEntry The run objects. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerDomainEntry(TypedDict):\n \"\"\"Entry for a specific domain.\n\n *Note*: this isn't the most ideal way to do this. Ideally\n the hash string itself for the parameter set would be the\n key instead of forcing the OutputTrackerDomainField to be\n kept as a list of objects. However, there doesn't seem to\n be a good way to do this in a pythonic way while enforcing\n type safety with static type checkers. As they currently\n exist, TypedDict's require all keys are specified at the\n time of creating the definition. 
I would rather not specify\n regular dictionaries with extensive and verbose type annotations\n and I expect these map output files are likely to be small enough\n that serious linear runtime complexity won't cause issues.\n\n Attributes\n ----------\n hash_str : str\n The hash of the parameter set used for run collision identification.\n entries : OutputTrackerEntry\n The run objects.\n \"\"\"\n\n hash_str: str\n entries: OutputTrackerEntry\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.OutputTrackerFile","title":"OutputTrackerFile ","text":" Bases: TypedDict Top level schema for the output file. Attributes: Name Type Description usability list[OutputTrackerDomainEntry] The output map for the usability domain. io list[OutputTrackerDomainEntry] The output map for the io domain. description list[OutputTrackerDomainEntry] The output map for the description domain. execution list[OutputTrackerDomainEntry] The output map for the execution domain. parametric list[OutputTrackerDomainEntry] The output map for the parametric domain. error list[OutputTrackerDomainEntry] The output map for the error domain. Source code in bcorag/custom_types/output_map_types.py class OutputTrackerFile(TypedDict):\n \"\"\"Top level schema for the output file.\n\n Attributes\n ----------\n usability : list[OutputTrackerDomainEntry]\n The output map for the usability domain.\n io : list[OutputTrackerDomainEntry]\n The output map for the io domain.\n description : list[OutputTrackerDomainEntry]\n The output map for the description domain.\n execution : list[OutputTrackerDomainEntry]\n The output map for the execution domain.\n parametric : list[OutputTrackerDomainEntry]\n The output map for the parametric domain.\n error : list[OutputTrackerDomainEntry]\n The output map for the error domain.\n \"\"\"\n\n usability: list[OutputTrackerDomainEntry]\n io: list[OutputTrackerDomainEntry]\n description: list[OutputTrackerDomainEntry]\n execution: list[OutputTrackerDomainEntry]\n parametric: list[OutputTrackerDomainEntry]\n error: list[OutputTrackerDomainEntry]\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_git_filter","title":"create_output_tracker_git_filter(filter) ","text":"Constructor for the OutputTrackerGitFilter TypedDict. Parameters: Name Type Description Default filter tuple[str, list[str]] required Returns: Type Description OutputTrackerGitFilter Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_git_filter(\n filter: tuple[str, list[str]]\n) -> OutputTrackerGitFilter:\n \"\"\"Constructor for the `OutputTrackerGitFilter` TypedDict.\n\n Parameters\n ----------\n filter : tuple[str, list[str]]\n\n Returns\n -------\n OutputTrackerGitFilter\n \"\"\"\n return_data: OutputTrackerGitFilter = {\"filter\": filter}\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_param_set","title":"create_output_tracker_param_set(loader, vector_store, llm, embedding_model, similarity_top_k, chunking_config, git_user, git_repo, git_branch, directory_git_filter=None, file_ext_git_filter=None, other_docs=None) ","text":"Constructor for the OutputTrackerParamSet TypedDict. Parameters: Name Type Description Default loader str The data loader used for the run. required vector_store str The vector store used for the run. required llm str The LLM name used for the run. required embedding_model str The embedding model used for the run. 
required similarity_top_k int The similarity top k value used for the run. required chunking_config str The chunking strategy used for the run. required git_user Optional[str] The user who owns the github repository included in the document ingestion for the run (if applicable). required git_repo Optional[str] The github repository included in the document ingestion for the run (if applicable). required git_branch Optional[str] The github repository branch indexed during the document ingestion for the run (if applicable). required directory_git_filter Optional[OutputTrackerGitFilter] The directory filter used for indexing the github repository (if applicable). None file_ext_git_filter Optional[OutputTrackerGitFilter] The file extension filter used for indexing the github repository (if applicable). None other_docs Optional[list[str]] The file path to any additional documentation included in the documents. None Returns: Type Description OutputTrackerParamSet Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_param_set(\n loader: str,\n vector_store: str,\n llm: str,\n embedding_model: str,\n similarity_top_k: int,\n chunking_config: str,\n git_user: Optional[str],\n git_repo: Optional[str],\n git_branch: Optional[str],\n directory_git_filter: Optional[OutputTrackerGitFilter] = None,\n file_ext_git_filter: Optional[OutputTrackerGitFilter] = None,\n other_docs: Optional[list[str]] = None\n) -> OutputTrackerParamSet:\n \"\"\"Constructor for the `OutputTrackerParamSet` TypedDict.\n\n Parameters\n ----------\n loader : str\n The data loader used for the run.\n vector_store : str\n The vector store used for the run.\n llm : str\n The LLM name used for the run.\n embedding_model : str\n The embedding model used for the run.\n similarity_top_k : int\n The similarity top k value used for the run.\n chunking_config : str\n The chunking strategy used for the run.\n git_user : Optional[str]\n The user who owns the github repository included in the document ingestion for the run (if applicable).\n git_repo : Optional[str]\n The github repository included in the document ingestion for the run (if applicable).\n git_branch : Optional[str]\n The github repository branch indexed during the document ingestion for the run (if applicable).\n directory_git_filter : Optional[OutputTrackerGitFilter], optional\n The directory filter used for indexing the github repository (if applicable).\n file_ext_git_filter : Optional[OutputTrackerGitFilter], optional\n The file extension filter used for indexing the github repository (if applicable).\n other_docs : Optional[list[str]]\n The file path to any additional documentation included in the documents.\n\n Returns\n -------\n OutputTrackerParamSet\n \"\"\"\n return_data: OutputTrackerParamSet = {\n \"loader\": loader,\n \"vector_store\": vector_store,\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n \"git_user\": git_user,\n \"git_repo\": git_repo,\n \"git_branch\": git_branch,\n \"directory_git_filter\": directory_git_filter,\n \"file_ext_git_filter\": file_ext_git_filter,\n \"other_docs\": other_docs,\n }\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_runs_entry","title":"create_output_tracker_runs_entry(index, timestamp, txt_file, json_file, source_node_file, elapsed_time, version=__version__) ","text":"Constructor for the OutputTrackerRunsEntry TypedDict. 
Parameters: Name Type Description Default index int The index for the run (the index represents the run number for that specific domain parameter set). required timestamp str The timestamp for the run. required txt_file str File path to the raw output dump text file. required json_file str File path to the JSON output file. required source_node_file str File path to the source node text file. required elapsed_time float The elapsed time (in seconds) for how long the domain generation took. required version str The version of the bcorag tool used. __version__ Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_runs_entry(\n index: int,\n timestamp: str,\n txt_file: str,\n json_file: str,\n source_node_file: str,\n elapsed_time: float,\n version: str = __version__,\n) -> OutputTrackerRunsEntry:\n \"\"\"Constructor for the `OutputTrackerRunsEntry` TypedDict.\n\n Parameters\n ----------\n index : int\n The index for the run (the index represents the run number for that specific domain parameter set).\n timestamp : str\n The timestamp for the run.\n txt_file : str\n File path to the raw output dump text file.\n json_file : str\n File path to the JSON output file.\n source_node_file : str\n File path to the source node text file.\n elapsed_time : float\n The elapsed time (in seconds) for how long the domain generation took.\n version : str, optional\n The version of the `bcorag` tool used.\n \"\"\"\n return_data: OutputTrackerRunsEntry = {\n \"index\": index,\n \"timestamp\": timestamp,\n \"txt_file\": txt_file,\n \"json_file\": json_file,\n \"source_node_file\": source_node_file,\n \"elapsed_time\": elapsed_time,\n \"version\": version,\n }\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_entry","title":"create_output_tracker_entry(curr_index, params, runs) ","text":"Constructor for the OutputTrackerEntry TypedDict. Parameters: Name Type Description Default curr_index int The most recent run index. required params OutputTrackerParamSet The parameter set for the run. required runs list[OutputTrackerRunsEntry] The list of runs for this parameter set. required Returns: Type Description OutputTrackerEntry Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_entry(\n curr_index: int, params: OutputTrackerParamSet, runs: list[OutputTrackerRunsEntry]\n) -> OutputTrackerEntry:\n \"\"\"Constructor for the `OutputTrackerEntry` TypedDict.\n\n Parameters\n ----------\n curr_index : int\n The most recent run index.\n params : OutputTrackerParamSet\n The parameter set for the run.\n runs : list[OutputTrackerRunsEntry]\n The list of runs for this parameter set.\n\n Returns\n -------\n OutputTrackerEntry\n \"\"\"\n return_data: OutputTrackerEntry = {\n \"curr_index\": curr_index,\n \"params\": params,\n \"runs\": runs,\n }\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.create_output_tracker_domain_entry","title":"create_output_tracker_domain_entry(hash_str, entries) ","text":"Constructor for the OutputTrackerDomainEntry TypedDict. Parameters: Name Type Description Default hash_str str The hash of the parameter set used for run collision identification. required entries OutputTrackerEntry The run objects. 
required Returns: Type Description OutputTrackerDomainEntry Source code in bcorag/custom_types/output_map_types.py def create_output_tracker_domain_entry(\n hash_str: str, entries: OutputTrackerEntry\n) -> OutputTrackerDomainEntry:\n \"\"\"Constructor for the `OutputTrackerDomainEntry` TypedDict.\n\n Parameters\n ----------\n hash_str : str\n The hash of the parameter set used for run collision identification.\n entries : OutputTrackerEntry\n The run objects.\n\n Returns\n -------\n OutputTrackerDomainEntry\n \"\"\"\n return_data: OutputTrackerDomainEntry = {\"hash_str\": hash_str, \"entries\": entries}\n return return_data\n "},{"location":"output-map-types/#bcorag.custom_types.output_map_types.default_output_tracker_file","title":"default_output_tracker_file() ","text":"Creates an empty, default output tracker file instance. Returns: Type Description OutputTrackerFile Source code in bcorag/custom_types/output_map_types.py def default_output_tracker_file() -> OutputTrackerFile:\n \"\"\"Creates an empty, default output tracker file instance.\n\n Returns\n -------\n OutputTrackerFile\n \"\"\"\n return_data: OutputTrackerFile = {\n \"usability\": [],\n \"io\": [],\n \"description\": [],\n \"execution\": [],\n \"parametric\": [],\n \"error\": [],\n }\n return return_data\n "},{"location":"output-structure/","title":"Output Structure","text":" - Output Directory
- Generated Content
- Output Maps
"},{"location":"output-structure/#output-directory","title":"Output Directory","text":"All output files and sub-directories will be placed within the output/ directory at the root of this repository. When starting up a run for a PDF file, a new sub-directory will be created with the name of the PDF file. For example, if the paper being indexed is named High resolution measurement.pdf , the output directory created will be at the path output/high_resolution_measurement/ (lowercased, with whitespace replaced by underscores). Within that sub-directory will be two more sub-directories, generated_domains/ and reference_sources/ , and two files, output_map.json and output_map.tsv . "},{"location":"output-structure/#generated-content","title":"Generated Content","text":"Output filenames contain three components: Domain - the corresponding BioCompute domain. Index - the run number for the domain under that parameter set (used to delineate between hash collisions). Parameter Set Hash - used to uniquely identify parameter sets for a run. The filename formats are as follows: {domain}-{index}-{parameter set hash}.json\n{domain}-{index}-{parameter set hash}.txt\n When generating a domain, the tool will attempt to serialize the LLM generated domain response into a valid JSON object. If successful, a JSON file will be created within the generated_domains/ sub-directory. Whether or not the JSON serialization is successful, the raw response message will be dumped into a text file in the generated_domains/ sub-directory. A key component of any RAG pipeline is the retrieval process. In order to accurately capture the state of the tool when generating a domain, we capture the referenced sources that were retrieved based on the standardized domain queries. These are stored in the reference_sources/ sub-directory and follow the same filename format as the output text files. "},{"location":"output-structure/#output-maps","title":"Output Maps","text":"Along with the generated content output, an output_map.json file is generated (or updated) to keep track of the parameter sets for each run. As a convenience for human-readability, the JSON output map is also dumped as a TSV file (however, the TSV file is not used for tracking at all by the code). 
"},{"location":"output-structure/#map-structure","title":"Map Structure","text":"{\n \"{domain}\": [\n {\n \"hash_str\": \"{parameter set hash}\",\n \"entries\": {\n \"curr_index\": \"{current run index}\",\n \"params\": {\n \"loader\": \"{data loader used}\",\n \"vector_store\": \"{vector store used}\",\n \"llm\": \"{llm used}\",\n \"embedding_model\": \"{embedding model used}\",\n \"similarity_top_k\": \"{similarity top k selected}\",\n \"chunking_config\": \"{chunking strategy used for node parsing}\",\n \"git_user\": \"{github user (or org) that owns the github repo used (if applicable)}\",\n \"git_repo\": \"{github repo indexed (if applicable)}\",\n \"git_branch\": \"{github branch to index (if applicable)}\",\n \"directory_git_filter\": \"{the directory filters included, if applicable}\",\n \"fiel_ext_filter\": \"{the file extension filters included, if applicable}\"\n },\n \"runs\": [\n {\n \"index\": \"{index for this run}\",\n \"timestamp\": \"{timestamp of the run}\",\n \"txt_file\": \"{filepath to the raw txt dump}\",\n \"json_file\": \"{filepath to the serialized JSON response (if applicable)}\",\n \"source_node_file\": \"{filepath to the retrieved nodes file}\",\n \"elapsed_time\": \"{elapsed time in seconds to generate the domain}\",\n \"version\": \"{version of the tool that was used}\"\n }\n ]\n }\n }\n ]\n}\n "},{"location":"parameter-custom-types/","title":"Types","text":""},{"location":"parameter-custom-types/#parameter_search.custom_types._AvailFilters","title":"_AvailFilters ","text":" Bases: TypedDict Internal class for the available parameter set. Source code in parameter_search/custom_types.py class _AvailFilters(TypedDict):\n \"\"\"Internal class for the available parameter set.\"\"\"\n\n loader: list[str]\n chunking_config: list[str]\n embedding_model: list[str]\n vector_store: list[str]\n similarity_top_k: list[int]\n llm: list[str]\n mode: list[str]\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.GitDataFileConfig","title":"GitDataFileConfig ","text":" Bases: TypedDict Git data instance for a file. Attributes: Name Type Description filename str The file (paper) to associate this github repository info with. git_info GitData The github repository information for document ingestion. Source code in parameter_search/custom_types.py class GitDataFileConfig(TypedDict):\n \"\"\"Git data instance for a file.\n\n Attributes\n ----------\n filename : str\n The file (paper) to associate this github repository info with.\n git_info : GitData\n The github repository information for document ingestion.\n \"\"\"\n\n filename: str\n git_info: GitData\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.SearchSpace","title":"SearchSpace ","text":" Bases: TypedDict Search space used for hyperparameter search. Attributes: Name Type Description filenames list[str] The file (paper) name's to process. loader list[str] The list of available data loaders to test. chunking_config list[str] The chunking strategies to test. embedding_model list[str] The embedding models to test. vector_store list[str] The vector stores to test. similarity_top_k list[int] The similarity top k values to test. llm list[str] The LLMs to test. git_data Optional[list[GitDataFileConfig]] The git data information. other_docs Optional[dict[str, list[str]]] Any other documents to include (keys are the paper name to associate with). 
Source code in parameter_search/custom_types.py class SearchSpace(TypedDict):\n \"\"\"Search space used for hyperparameter search.\n\n Attributes\n ----------\n filenames : list[str]\n The file (paper) name's to process.\n loader : list[str]\n The list of available data loaders to test.\n chunking_config : list[str]\n The chunking strategies to test.\n embedding_model : list[str]\n The embedding models to test.\n vector_store : list[str]\n The vector stores to test.\n similarity_top_k : list[int]\n The similarity top k values to test.\n llm : list[str]\n The LLMs to test.\n git_data : Optional[list[GitDataFileConfig]]\n The git data information.\n other_docs : Optional[dict[str, list[str]]]\n Any other documents to include (keys are the paper name to\n associate with).\n \"\"\"\n\n filenames: list[str]\n loader: list[str]\n chunking_config: list[str]\n embedding_model: list[str]\n vector_store: list[str]\n similarity_top_k: list[int]\n llm: list[str]\n git_data: Optional[list[GitDataFileConfig]]\n other_docs: Optional[dict[str, list[str]]]\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.create_git_data_file_config","title":"create_git_data_file_config(filename, git_info) ","text":"Constructor for the GitDataFileConfig TypedDict. Parameters: Name Type Description Default filename str The file (paper) to associate this github repository info with. required git_info GitData The github repository information for document ingestion. required Returns: Type Description GitDataFileConfig Source code in parameter_search/custom_types.py def create_git_data_file_config(filename: str, git_info: GitData) -> GitDataFileConfig:\n \"\"\"Constructor for the GitDataFileConfig TypedDict.\n\n Parameters\n ----------\n filename : str\n The file (paper) to associate this github repository info with.\n git_info : GitData\n The github repository information for document ingestion.\n\n Returns\n -------\n GitDataFileConfig\n \"\"\"\n return_data: GitDataFileConfig = {\"filename\": filename, \"git_info\": git_info}\n return return_data\n "},{"location":"parameter-custom-types/#parameter_search.custom_types.init_search_space","title":"init_search_space(filenames=None, loader=None, chunking_config=None, embedding_model=None, vector_store=None, similarity_top_k=None, llm=None, git_data=None, other_docs=None) ","text":"Creates a search space instance. Parameters: Name Type Description Default filenames list[str] | str | None The filenames to test over for the search space (if None , defaults to all the filenames in the bcorag/test_papers/ directory). Note, many files can increase run time significantly as a full parameter search will be executed on each paper sequentially. None loader list[str] | str | None The data loaders for the search space (if None , defaults to the full list as defined in the conf.json list). None chunking_config list[str] | str | or None The chunking strategies for the search space (if None , defaults to the full list as defined in the conf.json list). None embedding_model list[str] | str | or None The embedding model for the search space (if None , defaults to the full list as defined in the conf.json list). None vector_store list[str] | str | or None The vector store for the search space (if None , defaults to the full list as defined in the conf.json list). None similarity_top_k list[int] | int | or None The similarity top k for the search space (if None , defaults to the full list as defined in the conf.json list). 
None llm list[str] | str | or None The llm for the search space (if None , defaults to the full list as defined in the conf.json list). None git_data list[GitDataFileConfig] | GitDataFileConfig | None The git data for each file (if None , assumes no git data for any files). None other_docs dict[str, list[str]] | None The other documents to include (if None , assumes no other docs for any files). None Returns: Type Description SearchSpace The search space grid. Source code in parameter_search/custom_types.py def init_search_space(\n filenames: Optional[list[str] | str] = None,\n loader: Optional[list[str] | str] = None,\n chunking_config: Optional[list[str] | str] = None,\n embedding_model: Optional[list[str] | str] = None,\n vector_store: Optional[list[str] | str] = None,\n similarity_top_k: Optional[list[int] | int] = None,\n llm: Optional[list[str] | str] = None,\n git_data: Optional[list[GitDataFileConfig]] = None,\n other_docs: Optional[dict[str, list[str]]] = None,\n) -> SearchSpace:\n \"\"\"Creates a search space instance.\n\n Parameters\n ----------\n filenames : list[str] | str | None, optional\n The filenames to test over for the search space (if `None`,\n defaults to all the filenames in the `bcorag/test_papers/`\n directory). Note, many files can increase run time\n significantly as a full parameter search will be executed\n on each paper sequentially.\n loader : list[str] | str | None, optional\n The data loaders for the search space (if `None`, defaults to\n the full list as defined in the `conf.json` list).\n chunking_config : list[str] | str | or None, optional\n The chunking strategies for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n embedding_model : list[str] | str | or None, optional\n The embedding model for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n vector_store : list[str] | str | or None, optional\n The vector store for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n similarity_top_k : list[int] | int | or None, optional\n The similarity top k for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n llm : list[str] | str | or None, optional\n The llm for the search space (if `None`, defaults\n to the full list as defined in the `conf.json` list).\n git_data : list[GitDataFileConfig] | GitDataFileConfig | None, optional\n The git data for each file (if `None`, assumes no git data for\n any files).\n other_docs : dict[str, list[str]] | None, optional\n The other documents to include (if `None`, assumes no other docs\n for any files).\n\n Returns\n -------\n SearchSpace\n The search space grid.\n \"\"\"\n\n def _validate_options(\n option: OptionKey, option_list: list[str] | list[int]\n ) -> bool:\n if not set(option_list) <= set(_avail_options[option]):\n return False\n return True\n\n match filenames:\n case list():\n filenames_space: list[str] = filenames\n case str():\n filenames_space = [filenames]\n case None:\n filenames_space = get_file_list(\"./bcorag/test_papers\", \"*.pdf\")\n case _:\n graceful_exit(1, \"Invalid type for filenames\")\n for file in filenames_space:\n if not os.path.isfile(file):\n graceful_exit(1, f\"Invalid file `{file}`\")\n\n match loader:\n case list():\n loader_space: list[str] = loader\n if not _validate_options(\"loader\", loader_space):\n graceful_exit(1, \"Invalid or undefined loader in search space\")\n case str():\n loader_space = 
[loader]\n if not _validate_options(\"loader\", loader_space):\n graceful_exit(1, \"Invalid or undefined loader in search space\")\n case None:\n loader_space = _avail_options[\"loader\"]\n case _:\n graceful_exit(1, \"Invalid type specified for loader\")\n\n match chunking_config:\n case list():\n chunking_space: list[str] = chunking_config\n if not _validate_options(\"chunking_config\", chunking_space):\n graceful_exit(\n 1, \"Invalid or undefined chunking strategy in search space\"\n )\n case str():\n chunking_space = [chunking_config]\n if not _validate_options(\"chunking_config\", chunking_space):\n graceful_exit(\n 1, \"Invalid or undefined chunking strategy in search space\"\n )\n case None:\n chunking_space = _avail_options[\"chunking_config\"]\n case _:\n graceful_exit(1, \"Invalid type specified for chunking_config\")\n\n match embedding_model:\n case list():\n embedding_model_space: list[str] = embedding_model\n if not _validate_options(\"embedding_model\", embedding_model_space):\n graceful_exit(1, \"Invalid or undefined embedding model in search space\")\n case str():\n embedding_model_space = [embedding_model]\n if not _validate_options(\"embedding_model\", embedding_model_space):\n graceful_exit(1, \"Invalid or undefined embedding model in search space\")\n case None:\n embedding_model_space = _avail_options[\"embedding_model\"]\n case _:\n graceful_exit(1, \"Invalid type specified for embedding_model\")\n\n match vector_store:\n case list():\n vector_store_space: list[str] = vector_store\n if not _validate_options(\"vector_store\", vector_store_space):\n graceful_exit(1, \"Invalid or undefined vector store in search space\")\n case str():\n vector_store_space = [vector_store]\n if not _validate_options(\"vector_store\", vector_store_space):\n graceful_exit(1, \"Invalid or undefined vector store in search space\")\n case None:\n vector_store_space = _avail_options[\"vector_store\"]\n case _:\n graceful_exit(1, \"Invalid type specified for vector_store\")\n\n match similarity_top_k:\n case list():\n similarity_top_k_space: list[int] = similarity_top_k\n case int():\n similarity_top_k_space = [similarity_top_k]\n case None:\n similarity_top_k_space = _avail_options[\"similarity_top_k\"]\n case _:\n graceful_exit(1, \"Invalid type for similarity top k\")\n\n match llm:\n case list():\n llm_space: list[str] = llm\n if not _validate_options(\"llm\", llm_space):\n graceful_exit(1, \"Invalid or undefined llm in search space\")\n case str():\n llm_space = [llm]\n if not _validate_options(\"llm\", llm_space):\n graceful_exit(1, \"Invalid or undefined llm in search space\")\n case None:\n llm_space = _avail_options[\"llm\"]\n case _:\n graceful_exit(1, \"Invalid type for llm\")\n\n match git_data:\n case list():\n git_data_space: list[GitDataFileConfig] | None = git_data\n case None:\n git_data_space = None\n\n match other_docs:\n case None:\n other_docs_space = None\n case _:\n if all(isinstance(item, list) for item in list(other_docs.values())):\n other_docs_space = other_docs\n else:\n graceful_exit(1, \"Invalid other docs search space foramt.\")\n\n return_data: SearchSpace = {\n \"filenames\": filenames_space,\n \"loader\": loader_space,\n \"chunking_config\": chunking_space,\n \"embedding_model\": embedding_model_space,\n \"vector_store\": vector_store_space,\n \"similarity_top_k\": similarity_top_k_space,\n \"llm\": llm_space,\n \"git_data\": git_data_space,\n \"other_docs\": other_docs_space,\n }\n\n return return_data\n "},{"location":"parameter-search-abc/","title":"Parent 
Class","text":"Parameter search base class. "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch","title":"BcoParameterSearch ","text":" Bases: ABC Parent class that lays the foundation for the specific parameter search classes. This class shouldn't be instantiated directly. Attributes: Name Type Description _files list[str] The files search space. _loaders list[str] The data loaders search space. _chunking_configs list[str] The chunking strategies search space. _embedding_models list[str] The embedding models search space. _vector_stores list[str] The vector stores search space. _similarity_top_k list[int] The similarity top k search space. _llms list[str] The LLM search space. _git_data Optional[list[GitDataFileConfig]] The git data to associate with test runs. _verbose bool Parameter search verbosity mode. _logger Logger The logger to use. backoff_time int | float The backoff time between runs. Uses exponential backoff time. delay_reset int The amount of runs in between resetting the backoff time. Source code in parameter_search/parameter_search.py class BcoParameterSearch(ABC):\n \"\"\"Parent class that lays the foundation for the specific parameter\n search classes. This class shouldn't be instantiated directly.\n\n Attributes\n ----------\n _files : list[str]\n The files search space.\n _loaders : list[str]\n The data loaders search space.\n _chunking_configs : list[str]\n The chunking strategies search space.\n _embedding_models : list[str]\n The embedding models search space.\n _vector_stores : list[str]\n The vector stores search space.\n _similarity_top_k : list[int]\n The similarity top k search space.\n _llms : list[str]\n The LLM search space.\n _git_data : Optional[list[GitDataFileConfig]]\n The git data to associate with test runs.\n _verbose : bool\n Parameter search verbosity mode.\n _logger : logging.Logger\n The logger to use.\n backoff_time : int | float\n The backoff time between runs. Uses exponential backoff time.\n delay_reset : int\n The amount of runs in between resetting the backoff time. \n \"\"\"\n\n def __init__(\n self,\n search_space: SearchSpace,\n verbose: bool = True,\n ):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n verbose : bool, optional\n The verbosity level. 
False for no output, True for running output.\n \"\"\"\n\n self._files: list[str] = search_space[\"filenames\"]\n self._loaders: list[str] = search_space[\"loader\"]\n self._chunking_configs: list[str] = search_space[\"chunking_config\"]\n self._embedding_models: list[str] = search_space[\"embedding_model\"]\n self._vector_stores: list[str] = search_space[\"vector_store\"]\n self._similarity_top_k: list[int] = search_space[\"similarity_top_k\"]\n self._llms: list[str] = search_space[\"llm\"]\n self._git_data: Optional[list[GitDataFileConfig]] = search_space[\"git_data\"]\n self._other_docs: Optional[dict[str, list[str]]] = search_space[\"other_docs\"]\n self._verbose: bool = verbose\n self._logger = self._setup_logger()\n self.backoff_time: int | float = STANDARD_BACKOFF\n self.delay_reset = 3\n\n def train(self):\n \"\"\"Starts the generation workflow.\"\"\"\n\n param_sets = self._create_param_sets()\n for idx, param_set in enumerate(param_sets):\n\n self._log_output(\n f\"------------ Param Set {idx + 1}/{len(param_sets)} ------------\"\n )\n self._log_output(param_set)\n t0 = time.time()\n\n t1 = time.time()\n bco_rag = self._create_bcorag(param_set)\n self._log_output(f\"RAG created, elapsed time: {time.time() - t1}\")\n\n t2 = time.time()\n self._generate_domains(bco_rag)\n self._log_output(\n f\"Domains generated, total elapsed time: {time.time() - t2}\"\n )\n\n self._log_output(f\"Sleeping for {self.backoff_time}...\")\n time.sleep(self.backoff_time)\n if idx % self.delay_reset == 0:\n self.backoff_time = STANDARD_BACKOFF\n else:\n self.backoff_time *= 2 + random.uniform(0, 1)\n\n self._log_output(f\"Param set elapsed time: {time.time() - t0}\")\n\n @abstractmethod\n def _setup_logger(self, path: str, name: str) -> Logger:\n \"\"\"Sets up the logger.\"\"\"\n pass\n\n @abstractmethod\n def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a list of parameter sets.\"\"\"\n pass\n\n def _generate_domains(self, bcorag: BcoRag):\n \"\"\"Performs the bcorag query on each domain.\n\n Parameters\n ----------\n bcorag : BcoRag\n The setup BcoRag instance.\n \"\"\"\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n\n t0 = time.time()\n with supress_stdout():\n bcorag.perform_query(domain)\n self._log_output(f\"\\t{domain.upper()} domain generated, elapsed time: {time.time() - t0}\")\n\n def _create_bcorag(\n self, user_selections: UserSelections\n ) -> BcoRag:\n \"\"\"Creates the BcoRag instance.\n\n Parameters\n ----------\n user_selections : UserSelections\n The parameter set.\n\n Returns\n -------\n BcoRag\n The instantiated BcoRag instance.\n \"\"\"\n bcorag = BcoRag(user_selections)\n return bcorag\n\n def _log_output(self, message: str | UserSelections):\n \"\"\"Handles output. If the logger was passed in handles logging, if\n verbose is `True` handles printing (only info level logging).\n\n Parameters\n ----------\n message : str | UserSelections\n The message or param set to log and/or print.\n \"\"\"\n if self._verbose:\n if isinstance(message, str):\n print(message)\n elif isinstance(message, dict):\n pprint.pprint(message)\n if self._logger is not None:\n self._logger.info(message)\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch.__init__","title":"__init__(search_space, verbose=True) ","text":"Constructor. Parameters: Name Type Description Default search_space SearchSpace The parameter search space. required verbose bool The verbosity level. False for no output, True for running output. 
True Source code in parameter_search/parameter_search.py def __init__(\n self,\n search_space: SearchSpace,\n verbose: bool = True,\n):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n verbose : bool, optional\n The verbosity level. False for no output, True for running output.\n \"\"\"\n\n self._files: list[str] = search_space[\"filenames\"]\n self._loaders: list[str] = search_space[\"loader\"]\n self._chunking_configs: list[str] = search_space[\"chunking_config\"]\n self._embedding_models: list[str] = search_space[\"embedding_model\"]\n self._vector_stores: list[str] = search_space[\"vector_store\"]\n self._similarity_top_k: list[int] = search_space[\"similarity_top_k\"]\n self._llms: list[str] = search_space[\"llm\"]\n self._git_data: Optional[list[GitDataFileConfig]] = search_space[\"git_data\"]\n self._other_docs: Optional[dict[str, list[str]]] = search_space[\"other_docs\"]\n self._verbose: bool = verbose\n self._logger = self._setup_logger()\n self.backoff_time: int | float = STANDARD_BACKOFF\n self.delay_reset = 3\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch.train","title":"train() ","text":"Starts the generation workflow. Source code in parameter_search/parameter_search.py def train(self):\n \"\"\"Starts the generation workflow.\"\"\"\n\n param_sets = self._create_param_sets()\n for idx, param_set in enumerate(param_sets):\n\n self._log_output(\n f\"------------ Param Set {idx + 1}/{len(param_sets)} ------------\"\n )\n self._log_output(param_set)\n t0 = time.time()\n\n t1 = time.time()\n bco_rag = self._create_bcorag(param_set)\n self._log_output(f\"RAG created, elapsed time: {time.time() - t1}\")\n\n t2 = time.time()\n self._generate_domains(bco_rag)\n self._log_output(\n f\"Domains generated, total elapsed time: {time.time() - t2}\"\n )\n\n self._log_output(f\"Sleeping for {self.backoff_time}...\")\n time.sleep(self.backoff_time)\n if idx % self.delay_reset == 0:\n self.backoff_time = STANDARD_BACKOFF\n else:\n self.backoff_time *= 2 + random.uniform(0, 1)\n\n self._log_output(f\"Param set elapsed time: {time.time() - t0}\")\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._setup_logger","title":"_setup_logger(path, name) abstractmethod ","text":"Sets up the logger. Source code in parameter_search/parameter_search.py @abstractmethod\ndef _setup_logger(self, path: str, name: str) -> Logger:\n \"\"\"Sets up the logger.\"\"\"\n pass\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._create_param_sets","title":"_create_param_sets() abstractmethod ","text":"Creates a list of parameter sets. Source code in parameter_search/parameter_search.py @abstractmethod\ndef _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a list of parameter sets.\"\"\"\n pass\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._generate_domains","title":"_generate_domains(bcorag) ","text":"Performs the bcorag query on each domain. Parameters: Name Type Description Default bcorag BcoRag The setup BcoRag instance. 
required Source code in parameter_search/parameter_search.py def _generate_domains(self, bcorag: BcoRag):\n \"\"\"Performs the bcorag query on each domain.\n\n Parameters\n ----------\n bcorag : BcoRag\n The setup BcoRag instance.\n \"\"\"\n\n domain: DomainKey\n for domain in get_args(DomainKey):\n\n t0 = time.time()\n with supress_stdout():\n bcorag.perform_query(domain)\n self._log_output(f\"\\t{domain.upper()} domain generated, elapsed time: {time.time() - t0}\")\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._create_bcorag","title":"_create_bcorag(user_selections) ","text":"Creates the BcoRag instance. Parameters: Name Type Description Default user_selections UserSelections The parameter set. required Returns: Type Description BcoRag The instantiated BcoRag instance. Source code in parameter_search/parameter_search.py def _create_bcorag(\n self, user_selections: UserSelections\n) -> BcoRag:\n \"\"\"Creates the BcoRag instance.\n\n Parameters\n ----------\n user_selections : UserSelections\n The parameter set.\n\n Returns\n -------\n BcoRag\n The instantiated BcoRag instance.\n \"\"\"\n bcorag = BcoRag(user_selections)\n return bcorag\n "},{"location":"parameter-search-abc/#parameter_search.parameter_search.BcoParameterSearch._log_output","title":"_log_output(message) ","text":"Handles output. If the logger was passed in handles logging, if verbose is True handles printing (only info level logging). Parameters: Name Type Description Default message str | UserSelections The message or param set to log and/or print. required Source code in parameter_search/parameter_search.py def _log_output(self, message: str | UserSelections):\n \"\"\"Handles output. If the logger was passed in handles logging, if\n verbose is `True` handles printing (only info level logging).\n\n Parameters\n ----------\n message : str | UserSelections\n The message or param set to log and/or print.\n \"\"\"\n if self._verbose:\n if isinstance(message, str):\n print(message)\n elif isinstance(message, dict):\n pprint.pprint(message)\n if self._logger is not None:\n self._logger.info(message)\n "},{"location":"parameter-search/","title":"Parameter Search","text":" - Search Space
- Grid Search
- Random Search
If you want to test multiple parameter sets and/or papers, the BcoRag tool has an accompanying wrapper tool that implements a similar concept to hyperparameter tuning, offering grid and random parameter set search capabilities. "},{"location":"parameter-search/#search-space","title":"Search Space","text":"The parameter search tool uses a custom data type called a SearchSpace , which is defined as such: class SearchSpace(TypedDict):\n \"\"\"Search space used for parameter searches.\"\"\"\n\n filenames: list[str]\n loader: list[str]\n chunking_config: list[str]\n embedding_model: list[str]\n vector_store: list[str]\n similarity_top_k: list[int]\n llm: list[str]\n git_data: Optional[list[GitDataFileConfig]]\n The SearchSpace type has a corresponding initialization function to help with creating a search space. The init_search_space function is defined as such: def init_search_space(\n filenames: Optional[list[str] | str] = None,\n loader: Optional[list[str] | str] = None,\n chunking_config: Optional[list[str] | str] = None,\n embedding_model: Optional[list[str] | str] = None,\n vector_store: Optional[list[str] | str] = None,\n similarity_top_k: Optional[list[int] | int] = None,\n llm: Optional[list[str] | str] = None,\n git_data: Optional[list[GitDataFileConfig]] = None,\n) -> SearchSpace:\n \"\"\"Creates a search space instance.\n\n Parameters\n ----------\n filenames : list[str], str, or None (default: None)\n The filenames to test over for the search space (if None,\n defaults to all the filenames in the `bcorag/test_papers/`\n directory). Note, many files can increase run time\n significantly as a full parameter search will be executed\n on each paper sequentially.\n loader : list[str], str, or None (default: None)\n The data loaders for the search space (if None, defaults to\n the full list as defined in the conf.json list).\n chunking_config : list[str], str, or None (default: None)\n The chunking strategies for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n embedding_model : list[str], str, or None (default: None)\n The embedding model for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n vector_store : list[str], str, or None (default: None)\n The vector store for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n similarity_top_k : list[int], int, or None (default: None)\n The similarity top k for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n llm : list[str], str, or None (default: None)\n The llm for the search space (if None, defaults\n to the full list as defined in the conf.json list).\n git_data : list[GitDataFileConfig], GitDataFileConfig or None (default: None)\n The git data for each file (if None, assumes no git data for\n any files).\n\n Returns\n -------\n SearchSpace\n The search space grid.\n \"\"\"\n # initialization function\n "},{"location":"parameter-search/#grid-search","title":"Grid Search","text":"A grid search can be run from the main.py entrypoint using the grid-search positional argument like so: (env) python main.py grid-search\n This will run a grid search with the default parameter search space defined in the _create_search_space function. 
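A custom search space can also be constructed programmatically and passed to one of the search classes documented below. The snippet below is a minimal sketch rather than the tool's own entrypoint code: the import paths are assumed from the module paths shown in the source listings, and the paper path is only illustrative.

from parameter_search.custom_types import init_search_space
from parameter_search.random_search import BcoRandomSearch

# Narrow the space to one paper, one LLM, and two similarity_top_k values;
# any option left as None falls back to the full list defined in conf.json.
search_space = init_search_space(
    filenames="./bcorag/test_papers/High resolution measurement.pdf",
    llm="gpt-4-turbo",
    similarity_top_k=[3, 5],
)

# Randomly sample and evaluate 5 parameter sets from the space.
BcoRandomSearch(search_space, subset_size=5).train()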
"},{"location":"parameter-search/#random-search","title":"Random Search","text":"A random search can be run from the main.py entrypoint using the random-search positional argument like so: (env) python main.py random-search\n This will run a random search with the default parameter search space defined in the _create_search_space function using a parameter subset value of 5 . "},{"location":"prompts/","title":"Prompts","text":"The standardized query prompts. QUERY_PROMPT : The standard wrapper used for each prompt. _TOP_LEVEL_SCHEMA : The entire top level 2791 object schema. SUPPLEMENT_PROMPT : Supplementary prompt for the domains that require the top level schema. USABILITY_DOMAIN : The usability domain specific prompt and schema. IO_DOMAIN : The IO domain specific prompt and schema. DESCRIPTION_DOMAIN : The description domain specific prompt and schema. EXECUTION_DOMAIN : The execution domain specific prompt and schema. PARAMETRIC_DOMAIN : The parametric domain specific prompt and schema. ERROR_DOMAIN : The error domain specific prompt and schema. DOMAIN_MAP : The domain map for the BcoRag object. "},{"location":"random-search/","title":"Random Search","text":"Random search class. "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch","title":"BcoRandomSearch ","text":" Bases: BcoParameterSearch BCO random search class. Subclass of BcoParameterSearch . Source code in parameter_search/random_search.py class BcoRandomSearch(BcoParameterSearch):\n \"\"\"BCO random search class. Subclass of `BcoParameterSearch`.\"\"\"\n\n def __init__(self, search_space: SearchSpace, subset_size: int = 5):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n subset_size : int (default: 5)\n The number of parameter sets to search.\n \"\"\"\n super().__init__(search_space)\n self.subset_size = subset_size\n\n def _setup_logger(\n self, path: str = \"./logs\", name: str = \"random-search\"\n ) -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n\n def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a random subset of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n A random subset of the search space combinations.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n 
repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n\n if self._other_docs is None:\n base_selections[\"other_docs\"] = None\n else:\n for paper, other_docs in self._other_docs.items():\n if paper == os.path.basename(str(filepath)):\n base_selections[\"other_docs\"] = other_docs\n\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n base_selections[\"other_docs\"],\n )\n param_sets.append(user_selections)\n\n if self.subset_size > len(param_sets):\n self.subset_size = len(param_sets)\n\n param_subset = random.sample(param_sets, self.subset_size)\n\n return param_subset\n "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch.__init__","title":"__init__(search_space, subset_size=5) ","text":"Constructor. Parameters: Name Type Description Default search_space SearchSpace The parameter search space. required subset_size int (default: 5) The number of parameter sets to search. 5 Source code in parameter_search/random_search.py def __init__(self, search_space: SearchSpace, subset_size: int = 5):\n \"\"\"Constructor.\n\n Parameters\n ----------\n search_space : SearchSpace\n The parameter search space.\n subset_size : int (default: 5)\n The number of parameter sets to search.\n \"\"\"\n super().__init__(search_space)\n self.subset_size = subset_size\n "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch._setup_logger","title":"_setup_logger(path='./logs', name='random-search') ","text":"Sets up the logger. Parameters: Name Type Description Default path str File path for the logger. './logs' name str Name for the logger output. 'random-search' Returns: Type Description Logger The grid search logger. Source code in parameter_search/random_search.py def _setup_logger(\n self, path: str = \"./logs\", name: str = \"random-search\"\n) -> Logger:\n \"\"\"Sets up the logger.\n\n Parameters\n ----------\n path : str, optional\n File path for the logger.\n name : str, optional\n Name for the logger output.\n\n Returns\n -------\n Logger\n The grid search logger.\n \"\"\"\n check_dir(path)\n if not name.endswith(\".log\"):\n name = f\"{name}.log\"\n return setup_root_logger(os.path.join(path, name))\n "},{"location":"random-search/#parameter_search.random_search.BcoRandomSearch._create_param_sets","title":"_create_param_sets() ","text":"Creates a random subset of the parameter space. Returns: Type Description list[UserSelections] A random subset of the search space combinations. 
Source code in parameter_search/random_search.py def _create_param_sets(self) -> list[UserSelections]:\n \"\"\"Creates a random subset of the parameter space.\n\n Returns\n -------\n list[UserSelections]\n A random subset of the search space combinations.\n \"\"\"\n param_sets: list[UserSelections] = []\n\n for (\n llm,\n embedding_model,\n filepath,\n loader,\n chunking_config,\n vector_store,\n similarity_top_k,\n ) in product(\n self._llms,\n self._embedding_models,\n self._files,\n self._loaders,\n self._chunking_configs,\n self._vector_stores,\n self._similarity_top_k,\n ):\n base_selections = {\n \"llm\": llm,\n \"embedding_model\": embedding_model,\n \"filename\": os.path.basename(str(filepath)),\n \"filepath\": filepath,\n \"vector_store\": vector_store,\n \"loader\": loader,\n \"mode\": \"production\",\n \"similarity_top_k\": similarity_top_k,\n \"chunking_config\": chunking_config,\n }\n\n if self._git_data is None:\n base_selections[\"git_data\"] = None\n else:\n for git_data in self._git_data:\n if git_data[\"filename\"] == filepath or git_data[\n \"filename\"\n ] == os.path.basename(str(filepath)):\n base_selections[\"git_data\"] = create_git_data(\n user=git_data[\"git_info\"][\"user\"],\n repo=git_data[\"git_info\"][\"repo\"],\n branch=git_data[\"git_info\"][\"branch\"],\n filters=git_data[\"git_info\"][\"filters\"],\n )\n\n if self._other_docs is None:\n base_selections[\"other_docs\"] = None\n else:\n for paper, other_docs in self._other_docs.items():\n if paper == os.path.basename(str(filepath)):\n base_selections[\"other_docs\"] = other_docs\n\n user_selections = create_user_selections(\n base_selections[\"llm\"],\n base_selections[\"embedding_model\"],\n base_selections[\"filename\"],\n base_selections[\"filepath\"],\n base_selections[\"vector_store\"],\n base_selections[\"loader\"],\n base_selections[\"mode\"],\n base_selections[\"similarity_top_k\"],\n base_selections[\"chunking_config\"],\n base_selections[\"git_data\"],\n base_selections[\"other_docs\"],\n )\n param_sets.append(user_selections)\n\n if self.subset_size > len(param_sets):\n self.subset_size = len(param_sets)\n\n param_subset = random.sample(param_sets, self.subset_size)\n\n return param_subset\n "},{"location":"reference-frame/","title":"Reference Frame","text":""},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame","title":"ReferenceFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the reference evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py class ReferenceFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the reference evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_ref_label = ctk.CTkLabel(\n master=self,\n text=\"Reference Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_ref_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.ref_eval_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant are the reference nodes?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.ref_eval_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.ref_eval_var\n )\n self.ref_eval_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n )\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Top reference is most relevant?\",\n variable=self.top_ref_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.top_ref_checkbox.grid(\n row=3,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.ref_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.ref_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.ref_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button.configure(variable=self.ref_eval_var)\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox.configure(variable=self.top_ref_var)\n\n self.ref_notes.delete(0.0, \"end\")\n self.ref_notes.insert(\n 0.0, self.reference_eval.get(\"notes\", 
EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> RefereceEval:\n \"\"\"Returns the reference evaluations.\n\n Returns\n -------\n ReferenceEval\n The reference evaluation results.\n \"\"\"\n ref_eval_score = self.ref_eval_var.get()\n top_ref_val = self.top_ref_var.get()\n notes = self.ref_notes.get(0.0, \"end\")\n ref_eval = create_reference_eval(\n reference_relevancy=ref_eval_score,\n top_reference_retrieval=top_ref_val,\n notes=notes,\n )\n return ref_eval\n "},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_ref_label = ctk.CTkLabel(\n master=self,\n text=\"Reference Evaluation\",\n font=(self.state[\"font\"], 28, \"bold\"),\n )\n self.main_ref_label.grid(\n row=0, columnspan=2, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.ref_eval_label = ctk.CTkLabel(\n master=self,\n text=\"How relevant are the reference nodes?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.ref_eval_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 4),\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button = ctk.CTkSegmentedButton(\n master=self, values=[-1, 0, 1, 2], variable=self.ref_eval_var\n )\n self.ref_eval_button.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n )\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox = ctk.CTkCheckBox(\n master=self,\n text=\"Top reference is most relevant?\",\n variable=self.top_ref_var,\n onvalue=\"on\",\n offvalue=\"off\",\n )\n self.top_ref_checkbox.grid(\n row=3,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.ref_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.ref_notes_label.grid(\n row=4,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.ref_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.ref_notes.grid(\n row=5,\n columnspan=2,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n "},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. 
required Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.reference_eval = self.run[\"eval_data\"][\"reference_eval\"]\n\n self.ref_eval_var = ctk.IntVar(\n value=self.reference_eval.get(\n \"reference_relevancy\", EVAL_DEFAULTS[\"reference_relevancy\"]\n )\n )\n self.ref_eval_button.configure(variable=self.ref_eval_var)\n\n self.top_ref_var = ctk.StringVar(\n value=reverse_cast_checkbox(\n self.reference_eval.get(\n \"top_reference_retrieval\", EVAL_DEFAULTS[\"top_reference_retrieval\"]\n )\n )\n )\n self.top_ref_checkbox.configure(variable=self.top_ref_var)\n\n self.ref_notes.delete(0.0, \"end\")\n self.ref_notes.insert(\n 0.0, self.reference_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"reference-frame/#evaluator.frontend.components.evaluation_frames.reference_frame.ReferenceFrame.get_results","title":"get_results() ","text":"Returns the reference evaluations. Returns: Type Description ReferenceEval The reference evaluation results. Source code in evaluator/frontend/components/evaluation_frames/reference_frame.py def get_results(self) -> RefereceEval:\n \"\"\"Returns the reference evaluations.\n\n Returns\n -------\n ReferenceEval\n The reference evaluation results.\n \"\"\"\n ref_eval_score = self.ref_eval_var.get()\n top_ref_val = self.top_ref_var.get()\n notes = self.ref_notes.get(0.0, \"end\")\n ref_eval = create_reference_eval(\n reference_relevancy=ref_eval_score,\n top_reference_retrieval=top_ref_val,\n notes=notes,\n )\n return ref_eval\n "},{"location":"score-frame/","title":"Score Frame","text":""},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame","title":"ScoreFrame ","text":" Bases: CTkFrame , EvaluationBaseFrame Class for the score evaluation frame. 
Source code in evaluator/frontend/components/evaluation_frames/score_frame.py class ScoreFrame(ctk.CTkFrame, EvaluationBaseFrame):\n \"\"\"Class for the score evaluation frame.\"\"\"\n\n def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.score_eval = run_state[\"eval_data\"][\"score_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_score_label = ctk.CTkLabel(\n master=self, text=\"Score Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_score_label.grid(\n row=0, columnspan=3, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.score_label = ctk.CTkLabel(\n master=self, text=\"Score:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_text = ctk.CTkLabel(master=self, font=(self.state[\"font\"], 16))\n self.score_text.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_label = ctk.CTkLabel(\n master=self, text=\"Score version:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_version_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_text = ctk.CTkLabel(\n master=self, font=(self.state[\"font\"], 16)\n )\n self.score_version_text.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_eval_label = ctk.CTkLabel(\n master=self,\n text=\"Should the score be higher or lower?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.score_eval_label.grid(\n row=2,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_eval_var = ctk.StringVar(value=self.score_eval[\"eval\"])\n self.score_eval_button = ctk.CTkSegmentedButton(\n master=self,\n values=[\"Lower\", \"About right\", \"Higher\"],\n variable=self.score_eval_var,\n )\n self.score_eval_button.grid(\n row=3,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_notes_label.grid(\n row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.score_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.score_notes.grid(\n row=7,\n column=0,\n columnspan=3,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n self.update_state(app_state=self.state, run_state=self.run)\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.score_eval = self.run[\"eval_data\"][\"score_eval\"]\n\n 
self.score_text.configure(text=f\"{self.run['score']}\")\n self.score_version_text.configure(text=f\"{self.run['score_version']}\")\n\n self.score_eval_var = ctk.StringVar(\n value=self.score_eval.get(\"eval\", EVAL_DEFAULTS[\"eval\"])\n )\n self.score_eval_button.configure(variable=self.score_eval_var)\n\n self.score_notes.delete(0.0, \"end\")\n self.score_notes.insert(\n 0.0, self.score_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n\n def get_results(self) -> ScoreEval:\n \"\"\"Returns the score evaluations.\n\n Returns\n -------\n ScoreEval\n The score evaluation results.\n \"\"\"\n score_eval_button_val = cast_score_eval(self.score_eval_var.get())\n score_eval = create_score_eval(\n eval=score_eval_button_val, notes=self.score_notes.get(0.0, \"end\")\n )\n return score_eval\n "},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame.__init__","title":"__init__(master, app_state, run_state, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/evaluation_frames/score_frame.py def __init__(\n self, master: ctk.CTkFrame, app_state: AppState, run_state: RunState, **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n self.state = app_state\n self.run = run_state\n self.score_eval = run_state[\"eval_data\"][\"score_eval\"]\n\n self.grid_columnconfigure(0, weight=1)\n self.grid_rowconfigure(1, weight=1)\n\n self.main_score_label = ctk.CTkLabel(\n master=self, text=\"Score Evaluation\", font=(self.state[\"font\"], 28, \"bold\")\n )\n self.main_score_label.grid(\n row=0, columnspan=3, padx=self.state[\"padding\"], pady=self.state[\"padding\"]\n )\n\n self.score_label = ctk.CTkLabel(\n master=self, text=\"Score:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_label.grid(\n row=2,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_text = ctk.CTkLabel(master=self, font=(self.state[\"font\"], 16))\n self.score_text.grid(\n row=2,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_label = ctk.CTkLabel(\n master=self, text=\"Score version:\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_version_label.grid(\n row=3,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_version_text = ctk.CTkLabel(\n master=self, font=(self.state[\"font\"], 16)\n )\n self.score_version_text.grid(\n row=3,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=self.state[\"padding\"] // 4,\n sticky=\"w\",\n )\n\n self.score_eval_label = ctk.CTkLabel(\n master=self,\n text=\"Should the score be higher or lower?\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.score_eval_label.grid(\n row=2,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_eval_var = ctk.StringVar(value=self.score_eval[\"eval\"])\n self.score_eval_button = ctk.CTkSegmentedButton(\n master=self,\n values=[\"Lower\", \"About right\", \"Higher\"],\n variable=self.score_eval_var,\n )\n self.score_eval_button.grid(\n row=3,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"] // 4,\n )\n\n self.score_notes_label = ctk.CTkLabel(\n master=self, text=\"Notes\", font=(self.state[\"font\"], 16, \"bold\")\n )\n self.score_notes_label.grid(\n 
row=6,\n column=0,\n padx=self.state[\"padding\"],\n pady=(self.state[\"padding\"] // 2, self.state[\"padding\"] // 4),\n sticky=\"w\",\n )\n\n self.score_notes = ctk.CTkTextbox(\n master=self, wrap=\"word\", font=(self.state[\"font\"], 18)\n )\n self.score_notes.grid(\n row=7,\n column=0,\n columnspan=3,\n padx=self.state[\"padding\"] // 2,\n pady=(self.state[\"padding\"] // 4, self.state[\"padding\"]),\n sticky=\"nsew\",\n )\n\n self.update_state(app_state=self.state, run_state=self.run)\n "},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame.update_state","title":"update_state(app_state, run_state) ","text":"Update the component state. Parameters: Name Type Description Default app_state AppState The updated app state. required run_state RunState The updated run state. required Source code in evaluator/frontend/components/evaluation_frames/score_frame.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Update the component state.\n\n Parameters\n ----------\n app_state : AppState\n The updated app state.\n run_state : RunState\n The updated run state.\n \"\"\"\n self.run = run_state\n self.state = app_state\n self.score_eval = self.run[\"eval_data\"][\"score_eval\"]\n\n self.score_text.configure(text=f\"{self.run['score']}\")\n self.score_version_text.configure(text=f\"{self.run['score_version']}\")\n\n self.score_eval_var = ctk.StringVar(\n value=self.score_eval.get(\"eval\", EVAL_DEFAULTS[\"eval\"])\n )\n self.score_eval_button.configure(variable=self.score_eval_var)\n\n self.score_notes.delete(0.0, \"end\")\n self.score_notes.insert(\n 0.0, self.score_eval.get(\"notes\", EVAL_DEFAULTS[\"notes\"])\n )\n "},{"location":"score-frame/#evaluator.frontend.components.evaluation_frames.score_frame.ScoreFrame.get_results","title":"get_results() ","text":"Returns the score evaluations. Returns: Type Description ScoreEval The score evaluation results. Source code in evaluator/frontend/components/evaluation_frames/score_frame.py def get_results(self) -> ScoreEval:\n \"\"\"Returns the score evaluations.\n\n Returns\n -------\n ScoreEval\n The score evaluation results.\n \"\"\"\n score_eval_button_val = cast_score_eval(self.score_eval_var.get())\n score_eval = create_score_eval(\n eval=score_eval_button_val, notes=self.score_notes.get(0.0, \"end\")\n )\n return score_eval\n "},{"location":"sidebar/","title":"Sidebar","text":""},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar","title":"SideBar ","text":" Bases: CTkFrame Class for the navigation sidebar. 
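Two small pieces of logic in the listing that follows are easy to miss inside the widget code: the previous/next buttons are enabled or disabled purely from the current run index and the total run count, and a UI scaling choice such as "110%" is converted into a numeric widget-scaling factor. The snippet below restates those rules as standalone functions so they can be read and sanity-checked without constructing any widgets; it is an illustrative distillation, not part of the package.

```python
def nav_button_states(run_index: int, total_runs: int) -> tuple[str, str]:
    """Return the (previous, next) button states for a given run position."""
    prev_state = "normal" if run_index > 0 else "disabled"
    next_state = "normal" if run_index < total_runs - 1 else "disabled"
    return prev_state, next_state


def scaling_factor(new_scaling: str) -> float:
    """Convert a menu value such as '110%' into a widget scaling factor."""
    return int(new_scaling.replace("%", "")) / 100


assert nav_button_states(0, 10) == ("disabled", "normal")  # first run
assert nav_button_states(9, 10) == ("normal", "disabled")  # last run
assert scaling_factor("110%") == 1.1
```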
Source code in evaluator/frontend/components/sidebar.py class SideBar(ctk.CTkFrame):\n \"\"\"Class for the navigation sidebar.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs,\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n self.save = on_save\n self.exit = on_exit\n\n self.sidebar_frame = ctk.CTkFrame(master=master, width=140, corner_radius=0)\n self.sidebar_frame.grid(row=0, column=1, sticky=\"nsew\")\n self.sidebar_frame.grid_rowconfigure(7, weight=1)\n\n padding = self.state[\"padding\"]\n half_padding = padding // 2\n\n self.navigate_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Navigate\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.navigate_label.grid(\n row=0, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.prev_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Previous\",\n command=self._previous,\n state=(\"normal\" if self.run[\"run_index\"] > 0 else \"disabled\"),\n )\n self.prev_button.grid(row=1, column=0, padx=padding, pady=half_padding)\n\n self.next_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Next\",\n command=self._next,\n state=(\n \"normal\"\n if self.run[\"run_index\"] < self.run[\"total_runs\"] - 1\n else \"disabled\"\n ),\n )\n self.next_button.grid(row=2, column=0, padx=padding, pady=half_padding)\n\n self.run_counter_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.run_counter_label.grid(row=3, column=0, padx=padding, pady=half_padding)\n\n self.already_evaluated_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Already Evaluated\" if self.run[\"already_evaluated\"] else \"\",\n font=(self.state[\"font\"], 16, \"bold\"),\n text_color=\"red\",\n )\n self.already_evaluated_label.grid(\n row=4, column=0, padx=padding, pady=half_padding\n )\n\n self.save_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Save\", command=self._save\n )\n self.save_button.grid(row=5, column=0, padx=padding, pady=half_padding)\n\n self.exit_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Exit\", command=self._exit\n )\n self.exit_button.grid(row=6, column=0, padx=padding, pady=half_padding)\n\n self.appearance_label = ctk.CTkLabel(\n self.sidebar_frame,\n text=\"Appearance\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.appearance_label.grid(\n row=8, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.appearance_option_menu = ctk.CTkOptionMenu(\n self.sidebar_frame,\n values=[\"System\", \"Light\", \"Dark\"],\n command=self._change_appearance_mode,\n )\n self.appearance_option_menu.grid(\n row=9, column=0, padx=padding, pady=half_padding\n )\n\n self.scaling_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"UI Scaling\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.scaling_label.grid(row=10, column=0, padx=padding, pady=half_padding)\n\n self.scaling_option_menu = ctk.CTkOptionMenu(\n master=self.sidebar_frame,\n values=[\"70%\", \"80%\", \"90%\", \"100%\", \"110%\", \"120%\", \"130%\"],\n command=self._change_scaling_value,\n )\n self.scaling_option_menu.grid(\n row=11, column=0, padx=padding, pady=(half_padding, 
padding)\n )\n\n def update_state(self, run_state: RunState) -> None:\n \"\"\"Updates the run state for consistency.\"\"\"\n self.run = run_state\n self.run_counter_label.configure(\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\"\n )\n self.already_evaluated_label.configure(\n text=\"Already evaluated\" if self.run[\"already_evaluated\"] else \"\"\n )\n\n def _previous(self) -> None:\n \"\"\"Callback for the previous button press.\"\"\"\n new_run_index = self.run[\"run_index\"] - 1\n if new_run_index == 0:\n self.prev_button.configure(state=\"disabled\")\n else:\n self.prev_button.configure(state=\"normal\")\n self.next_button.configure(state=\"normal\")\n self.navigate(-1, new_run_index, self.state)\n\n def _next(self) -> None:\n \"\"\"Callback for the next button press.\"\"\"\n new_run_index = self.run[\"run_index\"] + 1\n if new_run_index >= self.run[\"total_runs\"] - 1:\n self.next_button.configure(state=\"disabled\")\n else:\n self.next_button.configure(state=\"normal\")\n self.prev_button.configure(state=\"normal\")\n self.navigate(1, new_run_index, self.state)\n\n def _change_appearance_mode(self, new_appearance_mode: str) -> None:\n \"\"\"Changes the UI color mode.\"\"\"\n ctk.set_appearance_mode(new_appearance_mode)\n\n def _change_scaling_value(self, new_scaling: str) -> None:\n \"\"\"Changes the UI scaling.\"\"\"\n new_scaling_val = int(new_scaling.replace(\"%\", \"\")) / 100\n ctk.set_widget_scaling(new_scaling_val)\n\n def _save(self) -> None:\n \"\"\"Calls the save state function.\"\"\"\n self.save(self.state)\n\n def _exit(self) -> NoReturn:\n \"\"\"Calls the exit function.\"\"\"\n self.save(self.state)\n self.exit()\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar.__init__","title":"__init__(master, app_state, run_state, navigate, on_save, on_exit, **kwargs) ","text":"Constructor. 
Source code in evaluator/frontend/components/sidebar.py def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs,\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n self.save = on_save\n self.exit = on_exit\n\n self.sidebar_frame = ctk.CTkFrame(master=master, width=140, corner_radius=0)\n self.sidebar_frame.grid(row=0, column=1, sticky=\"nsew\")\n self.sidebar_frame.grid_rowconfigure(7, weight=1)\n\n padding = self.state[\"padding\"]\n half_padding = padding // 2\n\n self.navigate_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Navigate\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.navigate_label.grid(\n row=0, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.prev_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Previous\",\n command=self._previous,\n state=(\"normal\" if self.run[\"run_index\"] > 0 else \"disabled\"),\n )\n self.prev_button.grid(row=1, column=0, padx=padding, pady=half_padding)\n\n self.next_button = ctk.CTkButton(\n master=self.sidebar_frame,\n text=\"Next\",\n command=self._next,\n state=(\n \"normal\"\n if self.run[\"run_index\"] < self.run[\"total_runs\"] - 1\n else \"disabled\"\n ),\n )\n self.next_button.grid(row=2, column=0, padx=padding, pady=half_padding)\n\n self.run_counter_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.run_counter_label.grid(row=3, column=0, padx=padding, pady=half_padding)\n\n self.already_evaluated_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"Already Evaluated\" if self.run[\"already_evaluated\"] else \"\",\n font=(self.state[\"font\"], 16, \"bold\"),\n text_color=\"red\",\n )\n self.already_evaluated_label.grid(\n row=4, column=0, padx=padding, pady=half_padding\n )\n\n self.save_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Save\", command=self._save\n )\n self.save_button.grid(row=5, column=0, padx=padding, pady=half_padding)\n\n self.exit_button = ctk.CTkButton(\n master=self.sidebar_frame, text=\"Exit\", command=self._exit\n )\n self.exit_button.grid(row=6, column=0, padx=padding, pady=half_padding)\n\n self.appearance_label = ctk.CTkLabel(\n self.sidebar_frame,\n text=\"Appearance\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.appearance_label.grid(\n row=8, column=0, padx=padding, pady=(padding, half_padding)\n )\n\n self.appearance_option_menu = ctk.CTkOptionMenu(\n self.sidebar_frame,\n values=[\"System\", \"Light\", \"Dark\"],\n command=self._change_appearance_mode,\n )\n self.appearance_option_menu.grid(\n row=9, column=0, padx=padding, pady=half_padding\n )\n\n self.scaling_label = ctk.CTkLabel(\n master=self.sidebar_frame,\n text=\"UI Scaling\",\n font=(self.state[\"font\"], 16, \"bold\"),\n )\n self.scaling_label.grid(row=10, column=0, padx=padding, pady=half_padding)\n\n self.scaling_option_menu = ctk.CTkOptionMenu(\n master=self.sidebar_frame,\n values=[\"70%\", \"80%\", \"90%\", \"100%\", \"110%\", \"120%\", \"130%\"],\n command=self._change_scaling_value,\n )\n self.scaling_option_menu.grid(\n row=11, column=0, padx=padding, pady=(half_padding, padding)\n )\n 
"},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar.update_state","title":"update_state(run_state) ","text":"Updates the run state for consistency. Source code in evaluator/frontend/components/sidebar.py def update_state(self, run_state: RunState) -> None:\n \"\"\"Updates the run state for consistency.\"\"\"\n self.run = run_state\n self.run_counter_label.configure(\n text=f\"Run: {self.run['run_index'] + 1} / {self.run['total_runs']}\"\n )\n self.already_evaluated_label.configure(\n text=\"Already evaluated\" if self.run[\"already_evaluated\"] else \"\"\n )\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._previous","title":"_previous() ","text":"Callback for the previous button press. Source code in evaluator/frontend/components/sidebar.py def _previous(self) -> None:\n \"\"\"Callback for the previous button press.\"\"\"\n new_run_index = self.run[\"run_index\"] - 1\n if new_run_index == 0:\n self.prev_button.configure(state=\"disabled\")\n else:\n self.prev_button.configure(state=\"normal\")\n self.next_button.configure(state=\"normal\")\n self.navigate(-1, new_run_index, self.state)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._next","title":"_next() ","text":"Callback for the next button press. Source code in evaluator/frontend/components/sidebar.py def _next(self) -> None:\n \"\"\"Callback for the next button press.\"\"\"\n new_run_index = self.run[\"run_index\"] + 1\n if new_run_index >= self.run[\"total_runs\"] - 1:\n self.next_button.configure(state=\"disabled\")\n else:\n self.next_button.configure(state=\"normal\")\n self.prev_button.configure(state=\"normal\")\n self.navigate(1, new_run_index, self.state)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._change_appearance_mode","title":"_change_appearance_mode(new_appearance_mode) ","text":"Changes the UI color mode. Source code in evaluator/frontend/components/sidebar.py def _change_appearance_mode(self, new_appearance_mode: str) -> None:\n \"\"\"Changes the UI color mode.\"\"\"\n ctk.set_appearance_mode(new_appearance_mode)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._change_scaling_value","title":"_change_scaling_value(new_scaling) ","text":"Changes the UI scaling. Source code in evaluator/frontend/components/sidebar.py def _change_scaling_value(self, new_scaling: str) -> None:\n \"\"\"Changes the UI scaling.\"\"\"\n new_scaling_val = int(new_scaling.replace(\"%\", \"\")) / 100\n ctk.set_widget_scaling(new_scaling_val)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._save","title":"_save() ","text":"Calls the save state function. Source code in evaluator/frontend/components/sidebar.py def _save(self) -> None:\n \"\"\"Calls the save state function.\"\"\"\n self.save(self.state)\n "},{"location":"sidebar/#evaluator.frontend.components.sidebar.SideBar._exit","title":"_exit() ","text":"Calls the exit function. Source code in evaluator/frontend/components/sidebar.py def _exit(self) -> NoReturn:\n \"\"\"Calls the exit function.\"\"\"\n self.save(self.state)\n self.exit()\n "},{"location":"state/","title":"State","text":"Handles all app and run state changes. "},{"location":"state/#evaluator.backend.state.create_new_user","title":"create_new_user(app_state, first_name, last_name) ","text":"Creates a new user. Parameters: Name Type Description Default app_state AppState The current app state. required first_name str The user's first name. required last_name str The user's last name. 
required Returns: Type Description AppState The updated app state. Source code in evaluator/backend/state.py def create_new_user(app_state: AppState, first_name: str, last_name: str) -> AppState:\n \"\"\"Creates a new user.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n first_name : str\n The user's first name.\n last_name : str\n The user's last name.\n\n Returns\n -------\n AppState\n The updated app state.\n \"\"\"\n app_state[\"logger\"].info(f\"Creating new user for {last_name}, {first_name}\")\n app_state[\"users_data\"][app_state[\"user_hash\"]] = {\n \"first_name\": first_name,\n \"last_name\": last_name,\n }\n app_state[\"user_results_data\"][app_state[\"user_hash\"]] = None\n return app_state\n "},{"location":"state/#evaluator.backend.state.set_resume_session","title":"set_resume_session(app_state, resume_session) ","text":"Sets the resume session boolean. Parameters: Name Type Description Default app_state AppState The current app state. required resume_session bool The resume_session value to set. required Returns: Type Description AppState The updated app state. Source code in evaluator/backend/state.py def set_resume_session(app_state: AppState, resume_session: bool) -> AppState:\n \"\"\"Sets the resume session boolean.\n\n Parameters\n ----------\n app_state : AppState\n The current app state.\n resume_session : bool\n The resume_session value to set.\n\n Returns\n -------\n AppState\n The updated app state.\n \"\"\"\n app_state[\"resume_session\"] = resume_session\n return app_state\n "},{"location":"state/#evaluator.backend.state.save_state","title":"save_state(app_state) ","text":"Saves the state. Parameters: Name Type Description Default app_state AppState The app state to save. required Source code in evaluator/backend/state.py def save_state(app_state: AppState) -> None:\n \"\"\"Saves the state.\n\n Parameters\n ----------\n app_state : AppState\n The app state to save.\n \"\"\"\n app_state[\"logger\"].info(\"Writing data...\")\n misc_fns.write_json(\n output_path=os.path.join(\n app_state[\"results_dir_path\"], app_state[\"bco_results_file_name\"]\n ),\n data=app_state[\"bco_results_data\"],\n )\n misc_fns.write_json(\n output_path=os.path.join(\n app_state[\"results_dir_path\"], app_state[\"user_results_file_name\"]\n ),\n data=app_state[\"user_results_data\"],\n )\n misc_fns.write_json(\n output_path=os.path.join(\n app_state[\"results_dir_path\"], app_state[\"users_file_name\"]\n ),\n data=app_state[\"users_data\"],\n )\n "},{"location":"state/#evaluator.backend.state.submit_eval_state","title":"submit_eval_state(app_state, run_state) ","text":"Updates the app state with the submitted evaluation data. If the eval state is the default eval state this function will silently not perform the update. Parameters: Name Type Description Default app_state AppState The app state to update. required run_state RunState The run state to update from. required Returns: Type Description AppState The updated app state. Source code in evaluator/backend/state.py def submit_eval_state(app_state: AppState, run_state: RunState) -> AppState:\n \"\"\"Updates the app state with the submitted evaluation data. 
If the\n eval state is the default eval state this function will silently not\n perform the update.\n\n Parameters\n ----------\n app_state : AppState\n The app state to update.\n run_state : RunState\n The run state to update from.\n\n Returns\n -------\n AppState\n The updated app state.\n \"\"\"\n if not check_default_eval(run_state[\"eval_data\"]):\n\n user_hash = app_state[\"user_hash\"]\n file_name = os.path.basename(run_state[\"generated_file_path\"])\n file_eval = run_state[\"eval_data\"]\n\n ## update the users evaluation data file\n\n if user_hash not in app_state[\"user_results_data\"]:\n misc_fns.graceful_exit(\n 1,\n f\"Error: User hash `{user_hash}` not found in user results data on submit eval.\",\n )\n\n user_data = app_state[\"user_results_data\"][user_hash]\n if user_data is None:\n user_data = {}\n\n user_data = cast(dict[str, Optional[EvalData]], user_data)\n user_data[file_name] = file_eval\n\n app_state[\"user_results_data\"][user_hash] = user_data\n\n ## update the evaluations data file\n # TODO \n\n app_state[\"logger\"].info(\"Eval state updated...\")\n\n else:\n\n app_state[\"logger\"].info(\"Default eval set detected, not updating.\")\n\n return app_state\n "},{"location":"state/#evaluator.backend.state.load_run_state","title":"load_run_state(run_index, total_runs, app_state) ","text":"Create run state. TODO : This function is messy, should be cleaned up at some point. Parameters: Name Type Description Default run_index int The run index to load from. required total_runs int The total number of potential evaluation runs. required app_state AppState The current app state. required Returns: Type Description RunState The run state for the run at the specified index. Source code in evaluator/backend/state.py def load_run_state(run_index: int, total_runs: int, app_state: AppState) -> RunState:\n \"\"\"Create run state.\n\n TODO : This function is messy, should be cleaned up at some point.\n\n Parameters\n ----------\n run_index : int\n The run index to load from.\n total_runs : int\n The total number of potential evaluation runs.\n app_state : AppState\n The current app state.\n\n Returns\n -------\n RunState \n The run state for the run at the specified index.\n \"\"\"\n current_run = 0\n\n for directory in app_state[\"generated_directory_paths\"]:\n\n current_paper = os.path.basename(directory)\n\n output_map = misc_fns.load_json(os.path.join(directory, \"output_map.json\"))\n if output_map is None:\n misc_fns.graceful_exit(\n 1, f\"Error: Output map not found in directory `{directory}`\"\n )\n\n for domain in output_map:\n for domain_param_set in output_map[domain]:\n for domain_run in domain_param_set[\"entries\"][\"runs\"]:\n\n if current_run == run_index:\n\n generated_domain_path = str(domain_run[\"json_file\"])\n generated_domain: dict | str | None = None\n if os.path.isfile(generated_domain_path):\n generated_domain = misc_fns.load_json(generated_domain_path)\n if generated_domain is None:\n misc_fns.graceful_exit(\n 1,\n f\"Unable to load generated JSON data at `{generated_domain_path}`.\",\n )\n else:\n generated_domain_path = domain_run[\"txt_file\"]\n raw_txt = open(generated_domain_path, \"r\").read()\n generated_domain = f\"Failed JSON serialization. 
Raw text output:\\n\\n{raw_txt}\"\n\n domain = os.path.basename(generated_domain_path.split(\"-\")[0])\n\n human_curated_path = os.path.join(\n app_state[\"generated_output_dir_root\"],\n \"human_curated\",\n f\"{os.path.basename(directory)}.json\",\n )\n if not os.path.isfile(human_curated_path):\n misc_fns.graceful_exit(\n 1,\n f\"Human curated BCO file not found at filepath `{human_curated_path}`.\",\n )\n human_curated_json = misc_fns.load_json(human_curated_path)\n if human_curated_json is None:\n misc_fns.graceful_exit(\n 1,\n f\"Unable to load human curated JSON at path `{human_curated_path}`.\",\n )\n human_curated_domain_formatted_json = {\n f\"{domain}_domain\": human_curated_json[f\"{domain}_domain\"]\n }\n human_curated_domain = json.dumps(\n human_curated_domain_formatted_json, indent=4\n )\n\n param_set = json.dumps(\n domain_param_set[\"entries\"][\"params\"], indent=4\n )\n\n reference_nodes = open(\n domain_run[\"source_node_file\"], \"r\"\n ).read()\n\n already_evaluated = False\n eval_data = default_eval()\n if (\n app_state[\"user_results_data\"][app_state[\"user_hash\"]]\n is not None\n ):\n user_eval_data = app_state[\"user_results_data\"][\n app_state[\"user_hash\"]\n ]\n if (user_eval_data is not None) and (\n os.path.basename(generated_domain_path)\n in user_eval_data\n ):\n user_file_eval = user_eval_data[\n os.path.basename(generated_domain_path)\n ]\n if user_file_eval is not None:\n already_evaluated = True\n eval_data = user_file_eval\n\n run_state = create_run_state(\n paper=current_paper,\n domain=domain,\n generated_domain=generated_domain,\n generated_file_path=generated_domain_path,\n human_curated_domain=human_curated_domain,\n param_set=param_set,\n reference_nodes=reference_nodes,\n run_index=run_index,\n total_runs=total_runs,\n already_evaluated=already_evaluated,\n logger=app_state[\"logger\"],\n eval_data=eval_data,\n )\n\n log_state(run_state, \"run\")\n return run_state\n\n current_run += 1\n\n misc_fns.graceful_exit(1, f\"Failed to load run state for run index `{run_index}`.\")\n "},{"location":"tab-view/","title":"Tab View","text":""},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView","title":"TabView ","text":" Bases: CTkTabview , EvaluationBaseFrame Class for the view page tab view. 
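As the listing that follows shows, the tab view performs no evaluation logic of its own on submit: it asks each evaluation frame for its results and combines them into a single EvalData object via create_full_eval. Below is a minimal sketch of that aggregation pattern using plain dicts and a duck-typed frame protocol in place of the project's typed helpers; all names in the sketch are illustrative only.

```python
from typing import Protocol


class EvaluationFrame(Protocol):
    """Anything that can report its portion of the evaluation."""

    def get_results(self) -> dict: ...


def collect_eval(frames: dict[str, EvaluationFrame]) -> dict:
    """Combine each frame's results into a single evaluation payload."""
    return {name: frame.get_results() for name, frame in frames.items()}


class FakeScoreFrame:
    def get_results(self) -> dict:
        return {"eval": "About right", "notes": ""}


class FakeReferenceFrame:
    def get_results(self) -> dict:
        return {"reference_relevancy": 1, "top_reference_retrieval": True, "notes": ""}


eval_data = collect_eval(
    {"score_eval": FakeScoreFrame(), "reference_eval": FakeReferenceFrame()}
)
print(eval_data)
```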
Source code in evaluator/frontend/components/tab_view.py class TabView(ctk.CTkTabview, EvaluationBaseFrame):\n \"\"\"Class for the view page tab view.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n on_submit: Callable,\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.on_submit = on_submit\n\n self.add(\"Compare JSON\")\n self.add(\"Source Nodes\")\n self.add(\"Parameter Set\")\n self.add(\"Evaluate\")\n\n self._create_compare_json_tab()\n self._create_source_node_tab()\n self._create_parameter_set_tab()\n self._create_evaluate_tab()\n\n self.update_state(app_state=self.state, run_state=self.run)\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Loads the run data and updates the state.\n\n Parameters\n ----------\n run_state : RunState\n The run to laod.\n \"\"\"\n self.run = run_state\n self.state = app_state\n\n self.left_json_text.configure(state=\"normal\")\n self.left_json_text.delete(0.0, \"end\")\n self.left_json_text.insert(0.0, self.run[\"human_curated_domain\"])\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_json_text.configure(state=\"normal\")\n self.right_json_text.delete(0.0, \"end\")\n self.right_json_text.insert(\"0.0\", self.run[\"generated_domain\"])\n self.right_json_text.configure(state=\"disabled\")\n\n self.source_node_text.configure(state=\"normal\")\n self.source_node_text.delete(0.0, \"end\")\n self.source_node_text.insert(0.0, self.run[\"reference_nodes\"])\n self.source_node_text.configure(state=\"disabled\")\n\n self.parameter_set_text.configure(state=\"normal\")\n self.parameter_set_text.delete(0.0, \"end\")\n self.parameter_set_text.insert(\"0.0\", self.run[\"param_set\"])\n self.parameter_set_text.configure(state=\"disabled\")\n\n self.score_frame.update_state(app_state=self.state, run_state=self.run)\n self.err_frame.update_state(app_state=self.state, run_state=self.run)\n self.ref_frame.update_state(app_state=self.state, run_state=self.run)\n self.general_frame.update_state(app_state=self.state, run_state=self.run)\n self.misc_frame.update_state(app_state=self.state, run_state=self.run)\n\n def get_results(self) -> EvalData:\n \"\"\"Returns the score evaluations.\"\"\"\n score_eval = self.score_frame.get_results()\n error_eval = self.err_frame.get_results()\n reference_eval = self.ref_frame.get_results()\n general_eval = self.general_frame.get_results()\n misc_eval = self.misc_frame.get_results()\n eval_data = create_full_eval(\n score_eval=score_eval,\n error_eval=error_eval,\n reference_eval=reference_eval,\n general_eval=general_eval,\n misc_eval=misc_eval,\n )\n return eval_data\n\n def _create_evaluate_tab(self) -> None:\n \"\"\"Creates the evaluate tab view.\"\"\"\n self.evaluate_frame = self.tab(\"Evaluate\")\n self.evaluate_frame.grid_columnconfigure((0, 1, 2), weight=1)\n self.evaluate_frame.grid_rowconfigure((0, 1), weight=1)\n\n self.score_frame = ScoreFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.score_frame.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.err_frame = ErrorFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.err_frame.grid(\n row=0,\n column=1,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.ref_frame = ReferenceFrame(\n 
master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.ref_frame.grid(\n row=0,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.general_frame = GeneralFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.general_frame.grid(\n row=1,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.misc_frame = MiscFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.misc_frame.grid(\n row=1,\n column=1,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.submit_button = ctk.CTkButton(\n master=self.evaluate_frame, text=\"Submit\", command=self.on_submit\n )\n self.submit_button.grid(\n row=6,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"se\",\n )\n\n def _create_compare_json_tab(self) -> None:\n \"\"\"Creates the compare JSON tab view.\"\"\"\n self.compare_frame = self.tab(\"Compare JSON\")\n self.compare_frame.grid_columnconfigure(0, weight=1)\n self.compare_frame.grid_columnconfigure(1, weight=1)\n self.compare_frame.grid_rowconfigure(0, weight=0)\n self.compare_frame.grid_rowconfigure(1, weight=1)\n\n self.left_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Human Curated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.left_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=0, sticky=\"w\"\n )\n\n self.left_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.left_json_text.grid(\n row=1,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Generated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.right_label.grid(\n row=0,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=0,\n sticky=\"w\",\n )\n\n self.right_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.right_json_text.grid(\n row=1,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.right_json_text.configure(state=\"disabled\")\n\n def _create_source_node_tab(self) -> None:\n \"\"\"Creates the source node tab.\"\"\"\n self.source_node_frame = self.tab(\"Source Nodes\")\n self.source_node_frame.grid_columnconfigure(0, weight=1)\n self.source_node_frame.grid_rowconfigure(0, weight=1)\n\n self.source_node_text = ctk.CTkTextbox(\n master=self.source_node_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.source_node_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.source_node_text.configure(state=\"disabled\")\n\n def _create_parameter_set_tab(self) -> None:\n \"\"\"Creates the parameter set tab.\"\"\"\n self.parameter_set_frame = self.tab(\"Parameter Set\")\n self.parameter_set_frame.grid_columnconfigure(0, weight=1)\n self.parameter_set_frame.grid_rowconfigure(0, weight=1)\n\n self.parameter_set_text = ctk.CTkTextbox(\n master=self.parameter_set_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n 
self.parameter_set_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.parameter_set_text.configure(state=\"disabled\")\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView.__init__","title":"__init__(master, app_state, run_state, on_submit, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/tab_view.py def __init__(\n self,\n master: ctk.CTkFrame,\n app_state: AppState,\n run_state: RunState,\n on_submit: Callable,\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.on_submit = on_submit\n\n self.add(\"Compare JSON\")\n self.add(\"Source Nodes\")\n self.add(\"Parameter Set\")\n self.add(\"Evaluate\")\n\n self._create_compare_json_tab()\n self._create_source_node_tab()\n self._create_parameter_set_tab()\n self._create_evaluate_tab()\n\n self.update_state(app_state=self.state, run_state=self.run)\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView.update_state","title":"update_state(app_state, run_state) ","text":"Loads the run data and updates the state. Parameters: Name Type Description Default run_state RunState The run to laod. required Source code in evaluator/frontend/components/tab_view.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Loads the run data and updates the state.\n\n Parameters\n ----------\n run_state : RunState\n The run to laod.\n \"\"\"\n self.run = run_state\n self.state = app_state\n\n self.left_json_text.configure(state=\"normal\")\n self.left_json_text.delete(0.0, \"end\")\n self.left_json_text.insert(0.0, self.run[\"human_curated_domain\"])\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_json_text.configure(state=\"normal\")\n self.right_json_text.delete(0.0, \"end\")\n self.right_json_text.insert(\"0.0\", self.run[\"generated_domain\"])\n self.right_json_text.configure(state=\"disabled\")\n\n self.source_node_text.configure(state=\"normal\")\n self.source_node_text.delete(0.0, \"end\")\n self.source_node_text.insert(0.0, self.run[\"reference_nodes\"])\n self.source_node_text.configure(state=\"disabled\")\n\n self.parameter_set_text.configure(state=\"normal\")\n self.parameter_set_text.delete(0.0, \"end\")\n self.parameter_set_text.insert(\"0.0\", self.run[\"param_set\"])\n self.parameter_set_text.configure(state=\"disabled\")\n\n self.score_frame.update_state(app_state=self.state, run_state=self.run)\n self.err_frame.update_state(app_state=self.state, run_state=self.run)\n self.ref_frame.update_state(app_state=self.state, run_state=self.run)\n self.general_frame.update_state(app_state=self.state, run_state=self.run)\n self.misc_frame.update_state(app_state=self.state, run_state=self.run)\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView.get_results","title":"get_results() ","text":"Returns the score evaluations. 
Source code in evaluator/frontend/components/tab_view.py def get_results(self) -> EvalData:\n \"\"\"Returns the score evaluations.\"\"\"\n score_eval = self.score_frame.get_results()\n error_eval = self.err_frame.get_results()\n reference_eval = self.ref_frame.get_results()\n general_eval = self.general_frame.get_results()\n misc_eval = self.misc_frame.get_results()\n eval_data = create_full_eval(\n score_eval=score_eval,\n error_eval=error_eval,\n reference_eval=reference_eval,\n general_eval=general_eval,\n misc_eval=misc_eval,\n )\n return eval_data\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_evaluate_tab","title":"_create_evaluate_tab() ","text":"Creates the evaluate tab view. Source code in evaluator/frontend/components/tab_view.py def _create_evaluate_tab(self) -> None:\n \"\"\"Creates the evaluate tab view.\"\"\"\n self.evaluate_frame = self.tab(\"Evaluate\")\n self.evaluate_frame.grid_columnconfigure((0, 1, 2), weight=1)\n self.evaluate_frame.grid_rowconfigure((0, 1), weight=1)\n\n self.score_frame = ScoreFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.score_frame.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.err_frame = ErrorFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.err_frame.grid(\n row=0,\n column=1,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.ref_frame = ReferenceFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.ref_frame.grid(\n row=0,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.general_frame = GeneralFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.general_frame.grid(\n row=1,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.misc_frame = MiscFrame(\n master=self.evaluate_frame, app_state=self.state, run_state=self.run\n )\n self.misc_frame.grid(\n row=1,\n column=1,\n columnspan=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n\n self.submit_button = ctk.CTkButton(\n master=self.evaluate_frame, text=\"Submit\", command=self.on_submit\n )\n self.submit_button.grid(\n row=6,\n column=2,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"se\",\n )\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_compare_json_tab","title":"_create_compare_json_tab() ","text":"Creates the compare JSON tab view. 
Source code in evaluator/frontend/components/tab_view.py def _create_compare_json_tab(self) -> None:\n \"\"\"Creates the compare JSON tab view.\"\"\"\n self.compare_frame = self.tab(\"Compare JSON\")\n self.compare_frame.grid_columnconfigure(0, weight=1)\n self.compare_frame.grid_columnconfigure(1, weight=1)\n self.compare_frame.grid_rowconfigure(0, weight=0)\n self.compare_frame.grid_rowconfigure(1, weight=1)\n\n self.left_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Human Curated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.left_label.grid(\n row=0, column=0, padx=self.state[\"padding\"], pady=0, sticky=\"w\"\n )\n\n self.left_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.left_json_text.grid(\n row=1,\n column=0,\n padx=(self.state[\"padding\"], self.state[\"padding\"] // 2),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.left_json_text.configure(state=\"disabled\")\n\n self.right_label = ctk.CTkLabel(\n master=self.compare_frame,\n text=\"Generated Domain\",\n font=(self.state[\"font\"], 18, \"bold\"),\n )\n self.right_label.grid(\n row=0,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=0,\n sticky=\"w\",\n )\n\n self.right_json_text = ctk.CTkTextbox(\n master=self.compare_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.right_json_text.grid(\n row=1,\n column=1,\n padx=(self.state[\"padding\"] // 2, self.state[\"padding\"]),\n pady=(0, self.state[\"padding\"] // 2),\n sticky=\"nsew\",\n )\n self.right_json_text.configure(state=\"disabled\")\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_source_node_tab","title":"_create_source_node_tab() ","text":"Creates the source node tab. Source code in evaluator/frontend/components/tab_view.py def _create_source_node_tab(self) -> None:\n \"\"\"Creates the source node tab.\"\"\"\n self.source_node_frame = self.tab(\"Source Nodes\")\n self.source_node_frame.grid_columnconfigure(0, weight=1)\n self.source_node_frame.grid_rowconfigure(0, weight=1)\n\n self.source_node_text = ctk.CTkTextbox(\n master=self.source_node_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.source_node_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.source_node_text.configure(state=\"disabled\")\n "},{"location":"tab-view/#evaluator.frontend.components.tab_view.TabView._create_parameter_set_tab","title":"_create_parameter_set_tab() ","text":"Creates the parameter set tab. Source code in evaluator/frontend/components/tab_view.py def _create_parameter_set_tab(self) -> None:\n \"\"\"Creates the parameter set tab.\"\"\"\n self.parameter_set_frame = self.tab(\"Parameter Set\")\n self.parameter_set_frame.grid_columnconfigure(0, weight=1)\n self.parameter_set_frame.grid_rowconfigure(0, weight=1)\n\n self.parameter_set_text = ctk.CTkTextbox(\n master=self.parameter_set_frame, wrap=\"none\", font=(self.state[\"font\"], 18)\n )\n self.parameter_set_text.grid(\n row=0,\n column=0,\n padx=self.state[\"padding\"],\n pady=self.state[\"padding\"],\n sticky=\"nsew\",\n )\n self.parameter_set_text.configure(state=\"disabled\")\n "},{"location":"unit-testing/","title":"Automated Testing","text":"The test_bco_rag.py script contains a suite of tests designed to evaluate the functionality of the BcoRag tool using the pytest framework and the open source LLM evaluation framework DeepEval. 
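The sections that follow list the test cases, the metrics, and the commands for running the suite. For orientation, here is a minimal, hypothetical sketch of what one of these domain tests might look like when written against DeepEval's pytest integration; the prompt, generated output, and retrieval context are placeholder strings, and the real test_bco_rag.py implementation may build them differently (for example, from an actual BcoRag run).

```python
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase


def test_usability():
    # Placeholder values; the real test would derive these from a BcoRag generation run.
    test_case = LLMTestCase(
        input="Generate the usability domain for the attached paper.",
        actual_output='{"usability_domain": ["..."]}',
        retrieval_context=["...retrieved chunk 1...", "...retrieved chunk 2..."],
    )
    # Both metrics are scored by an evaluation LLM when the test runs
    # (the docs below note that gpt-4o is used in the backend).
    assert_test(
        test_case,
        [AnswerRelevancyMetric(threshold=0.7), FaithfulnessMetric(threshold=0.7)],
    )
```

Running deepeval test run test_bco_rag.py::test_usability would then execute just this test, as described under Running The Tests below.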
"},{"location":"unit-testing/#test-cases","title":"Test Cases","text":"There is one test case for each domain: test_usability test_io test_description test_execution test_parametric test_error "},{"location":"unit-testing/#test-metrics","title":"Test Metrics","text":"The test suite evaluates two different metrics: Answer Relevancy: The answer relevancy metric is used to evaluate how relevant the finalized generated output (in our case, the generated domain) is to the original input prompt. It attempts to evaluate relevancy (does the generated content directly relate to the question at hand), appropriateness (is the content appropriate given the context of the input) and focus (does the content stay on topic). The answer relevancy metric measures the quality of your RAG pipeline's generator by evaluating how relevant the actual_output of your LLM application is compared to the provided input. Faithfulness: The faithfulness metric assesses how accurate and truthful the finalized generated output (in our case, the generated domain) is concerning the source material (the retrieved content). It attempts to ensure that the content is relevant, factual, and does not contradict the information gathered from the retrieval step. The faithfulness metric measures the quality of your RAG pipeline's generator by evaluating whether the actual_output factually aligns with the contents of your retrieval_context . "},{"location":"unit-testing/#running-the-tests","title":"Running The Tests","text":"It is not recommended to run all the tests at once. The test suite uses gpt-4o in the backend to evaluate the above metrics. To run one test at a time: deepeval test run test_bco_rag.py::test_{domain} To run all the tests at once: deepeval test run test_bco_rag.py "},{"location":"view-page/","title":"View Page","text":""},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage","title":"ViewPage ","text":" Bases: CTkFrame Class for the view/evaluate page. 
Source code in evaluator/frontend/components/view_page.py class ViewPage(ctk.CTkFrame):\n \"\"\"Class for the view/evaluate page.\"\"\"\n\n def __init__(\n self,\n master: ctk.CTk,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs\n ):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n\n self.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=0)\n self.grid_rowconfigure(0, weight=1)\n\n self.sidebar = SideBar(\n master=self,\n app_state=self.state,\n run_state=self.run,\n navigate=self.navigate,\n on_save=on_save,\n on_exit=on_exit,\n )\n\n self.tab_view = TabView(master=self, app_state=self.state, run_state=self.run, on_submit=self.on_submit)\n self.tab_view.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n\n def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Updates the state.\"\"\"\n self.run = run_state\n self.state = app_state\n self.sidebar.update_state(self.run)\n self.tab_view.update_state(app_state=app_state, run_state=self.run)\n\n def on_submit(self) -> None:\n \"\"\"Submits the user evaluation.\"\"\"\n self.run[\"eval_data\"] = self.tab_view.get_results()\n updated_app_state = submit_eval_state(self.state, self.run)\n self.update_state(app_state=updated_app_state, run_state=self.run)\n "},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage.__init__","title":"__init__(master, app_state, run_state, navigate, on_save, on_exit, **kwargs) ","text":"Constructor. Source code in evaluator/frontend/components/view_page.py def __init__(\n self,\n master: ctk.CTk,\n app_state: AppState,\n run_state: RunState,\n navigate: Callable[[Literal[-1, 1], int, AppState], None],\n on_save: Callable[[AppState], None],\n on_exit: Callable[[], NoReturn],\n **kwargs\n):\n \"\"\"Constructor.\"\"\"\n super().__init__(master, **kwargs)\n\n self.state = app_state\n self.run = run_state\n self.navigate = navigate\n\n self.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n self.grid_columnconfigure(0, weight=1)\n self.grid_columnconfigure(1, weight=0)\n self.grid_rowconfigure(0, weight=1)\n\n self.sidebar = SideBar(\n master=self,\n app_state=self.state,\n run_state=self.run,\n navigate=self.navigate,\n on_save=on_save,\n on_exit=on_exit,\n )\n\n self.tab_view = TabView(master=self, app_state=self.state, run_state=self.run, on_submit=self.on_submit)\n self.tab_view.grid(row=0, column=0, padx=0, pady=0, sticky=\"nsew\")\n "},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage.update_state","title":"update_state(app_state, run_state) ","text":"Updates the state. Source code in evaluator/frontend/components/view_page.py def update_state(self, app_state: AppState, run_state: RunState) -> None:\n \"\"\"Updates the state.\"\"\"\n self.run = run_state\n self.state = app_state\n self.sidebar.update_state(self.run)\n self.tab_view.update_state(app_state=app_state, run_state=self.run)\n "},{"location":"view-page/#evaluator.frontend.components.view_page.ViewPage.on_submit","title":"on_submit() ","text":"Submits the user evaluation. 
Source code in evaluator/frontend/components/view_page.py def on_submit(self) -> None:\n \"\"\"Submits the user evaluation.\"\"\"\n self.run[\"eval_data\"] = self.tab_view.get_results()\n updated_app_state = submit_eval_state(self.state, self.run)\n self.update_state(app_state=updated_app_state, run_state=self.run)\n "}]}
\ No newline at end of file
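The ViewPage constructor shown above expects three callbacks from the surrounding application. The sketch below illustrates what those callables look like; the type aliases and function bodies are illustrative placeholders, not the evaluator's real handlers.

```python
from typing import Literal, NoReturn

# Stand-ins so the sketch is self-contained; the real code uses the
# evaluator's own AppState and RunState types.
AppState = dict
RunState = dict


def navigate(direction: Literal[-1, 1], run_index: int, app_state: AppState) -> None:
    """Move to the previous (-1) or next (+1) run and re-render the view page."""
    print(f"navigate: direction={direction}, run_index={run_index}")


def on_save(app_state: AppState) -> None:
    """Persist the current evaluation state."""
    print("state saved")


def on_exit() -> NoReturn:
    """Tear down the UI and stop the process."""
    raise SystemExit(0)


# A parent component would pass these to the page roughly as:
# ViewPage(master=root, app_state=app_state, run_state=run_state,
#          navigate=navigate, on_save=on_save, on_exit=on_exit)
```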
diff --git a/sidebar/index.html b/sidebar/index.html
index 17a8940..b1ad151 100644
--- a/sidebar/index.html
+++ b/sidebar/index.html
@@ -217,7 +217,7 @@
- Setup
+ Setup and Quickstart
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
index da8256a..ee69b1e 100644
Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ
diff --git a/state/index.html b/state/index.html
index 8574d0e..9a3a6f3 100644
--- a/state/index.html
+++ b/state/index.html
@@ -215,7 +215,7 @@
- Setup
+ Setup and Quickstart
diff --git a/tab-view/index.html b/tab-view/index.html
index 21fb599..9ff3014 100644
--- a/tab-view/index.html
+++ b/tab-view/index.html
@@ -217,7 +217,7 @@
- Setup
+ Setup and Quickstart
diff --git a/unit-testing/index.html b/unit-testing/index.html
index 8e8d75f..bd46a93 100644
--- a/unit-testing/index.html
+++ b/unit-testing/index.html
@@ -217,7 +217,7 @@
- Setup
+ Setup and Quickstart
diff --git a/view-page/index.html b/view-page/index.html
index 42403ee..f621a02 100644
--- a/view-page/index.html
+++ b/view-page/index.html
@@ -217,7 +217,7 @@
- Setup
+ Setup and Quickstart
|