[RFC][wip] make room for batch eval
This moves the existing eval library to "test_suite_eval" and starts an equivalent
library for batch runs. It also makes the interface a little clearer.

Essentially, the differences are:
- each metric runs on a _list_ of inputs, not just one
- each input can be paired with a reference; this is possible in the "test suite"
  setup, but it is clunkier there (see the sketch after this list).
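
A minimal sketch of the kind of batch interface described above, purely for
illustration (EvalResult, BatchMetric, exact_match, and run_batch_eval are
assumed names for this example, not the library's actual API):

from dataclasses import dataclass
from typing import Callable, Optional, Sequence


@dataclass
class EvalResult:
    value: str
    reference: Optional[str]
    score: float


# A batch metric scores a whole list of inputs in one call; each input may be
# paired with an optional reference.
BatchMetric = Callable[
    [Sequence[str], Sequence[Optional[str]]], Sequence[float]
]


def exact_match(
    values: Sequence[str], references: Sequence[Optional[str]]
) -> list[float]:
    # 1.0 if a value equals its paired reference, else 0.0.
    return [
        1.0 if ref is not None and val == ref else 0.0
        for val, ref in zip(values, references)
    ]


def run_batch_eval(
    values: Sequence[str],
    metric: BatchMetric,
    references: Optional[Sequence[Optional[str]]] = None,
) -> list[EvalResult]:
    refs = list(references) if references is not None else [None] * len(values)
    scores = metric(values, refs)
    return [
        EvalResult(value=v, reference=r, score=s)
        for v, r, s in zip(values, refs, scores)
    ]

Compared with the per-example "test suite" setup, the metric receives the whole
list in one call, and references travel alongside the inputs instead of being
wired in separately.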
jonathanlastmileai committed Jan 23, 2024
1 parent 4f024fa commit c8155a8
Showing 71 changed files with 4,014 additions and 1,372 deletions.
4 changes: 2 additions & 2 deletions .vscode/settings.json
@@ -25,9 +25,9 @@
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true,
"editor.rulers": [150]
"editor.rulers": [79]
},
"black-formatter.args": ["--line-length=150"],
"black-formatter.args": ["--line-length=79"],
// example: "--disable=C0114,C0115,C0116"
"pylint.args": []
}
12 changes: 9 additions & 3 deletions cookbooks/Basic-Prompt-Routing/assistant_app.py
@@ -32,8 +32,12 @@ async def assistant_response(prompt):

# Streamlit Setup
st.title("AI Teaching Assistant")
st.markdown("Ask a math, physics, or general question. Based on your question, an AI math prof, physics prof, or general assistant will respond.")
st.markdown("**This is a simple demo of prompt routing - based on your question, an LLM decides which AI teacher responds.**")
st.markdown(
"Ask a math, physics, or general question. Based on your question, an AI math prof, physics prof, or general assistant will respond."
)
st.markdown(
"**This is a simple demo of prompt routing - based on your question, an LLM decides which AI teacher responds.**"
)

# Chat setup
if "messages" not in st.session_state:
@@ -54,4 +58,6 @@ async def assistant_response(prompt):
with st.chat_message("assistant"):
st.markdown(response)

st.session_state.messages.append({"role": "assistant", "content": response})
st.session_state.messages.append(
{"role": "assistant", "content": response}
)
11 changes: 9 additions & 2 deletions cookbooks/Basic-Prompt-Routing/create_config.py
@@ -1,10 +1,17 @@
from aiconfig import AIConfigRuntime, Prompt

aiconfig = AIConfigRuntime.create("assistant_config", "teaching assistant config")
aiconfig = AIConfigRuntime.create(
"assistant_config", "teaching assistant config"
)

# Set GPT-4 as default model from Teaching Assistant prompts
model_name = "gpt-4"
model_settings = {"top_k": 40, "top_p": 1, "model": "gpt-4", "temperature": 0.0}
model_settings = {
"top_k": 40,
"top_p": 1,
"model": "gpt-4",
"temperature": 0.0,
}
aiconfig.add_model(model_name, model_settings)


23 changes: 18 additions & 5 deletions cookbooks/Cli-Mate/cli-mate.py
@@ -53,13 +53,20 @@ async def query(aiconfig_path: str, question: str) -> list[ExecuteResult]:
return result


async def get_mod_result(aiconfig_path: str, source_code: str, question: str) -> list[ExecuteResult]:
async def get_mod_result(
aiconfig_path: str, source_code: str, question: str
) -> list[ExecuteResult]:
question_about_code = f"QUERY ABOUT SOURCE CODE:\n{question}\nSOURCE CODE:\n```{source_code}\n```"

return await query(aiconfig_path, question_about_code)


async def mod_code(aiconfig_path: str, source_code_file: str, question: str, update_file: bool = False):
async def mod_code(
aiconfig_path: str,
source_code_file: str,
question: str,
update_file: bool = False,
):
# read source code from file
with open(source_code_file, "r", encoding="utf8") as file:
source_code = file.read()
@@ -93,7 +100,9 @@ def signal_handler(_: int, __: FrameType | None):
i = 0
while True:
try:
user_input = await event_loop.run_in_executor(None, session.prompt, "Query: [ctrl-D to exit] ")
user_input = await event_loop.run_in_executor(
None, session.prompt, "Query: [ctrl-D to exit] "
)
except KeyboardInterrupt:
continue
except EOFError:
@@ -113,7 +122,9 @@ def signal_handler(_: int, __: FrameType | None):
prompt = user_input

# Dynamically generate the prompt name and prompt object
new_prompt_name = f"prompt{len(runtime.prompts)+1}" # Prompt{number of prompts}
new_prompt_name = (
f"prompt{len(runtime.prompts)+1}" # Prompt{number of prompts}
)
new_prompt = Prompt(name=new_prompt_name, input=prompt)

# Add the new prompt and run the model
@@ -144,7 +155,9 @@ async def main():
subparsers = parser.add_subparsers(dest="command")

loop_parser = subparsers.add_parser("loop")
loop_parser.add_argument("-scf", "--source-code-file", help="Specify a source code file.")
loop_parser.add_argument(
"-scf", "--source-code-file", help="Specify a source code file."
)

args = parser.parse_args()

31 changes: 22 additions & 9 deletions cookbooks/Gradio/hf_model_parsers.py
@@ -1,22 +1,27 @@
from aiconfig_extension_hugging_face import (
HuggingFaceAutomaticSpeechRecognitionTransformer,
HuggingFaceImage2TextTransformer,
HuggingFaceTextSummarizationTransformer,
HuggingFaceText2ImageDiffusor,
HuggingFaceText2SpeechTransformer,
HuggingFaceTextGenerationTransformer,
HuggingFaceTextSummarizationTransformer,
HuggingFaceTextTranslationTransformer,
)

from aiconfig_extension_hugging_face.remote_inference_client.text_generation import HuggingFaceTextGenerationParser
from aiconfig_extension_hugging_face.remote_inference_client.text_generation import (
HuggingFaceTextGenerationParser,
)

from aiconfig import AIConfigRuntime


def register_model_parsers() -> None:
"""Register model parsers for HuggingFace models."""
automatic_speech_recognition = HuggingFaceAutomaticSpeechRecognitionTransformer()
AIConfigRuntime.register_model_parser(automatic_speech_recognition, automatic_speech_recognition.id())
automatic_speech_recognition = (
HuggingFaceAutomaticSpeechRecognitionTransformer()
)
AIConfigRuntime.register_model_parser(
automatic_speech_recognition, automatic_speech_recognition.id()
)

image_to_text = HuggingFaceImage2TextTransformer()
AIConfigRuntime.register_model_parser(image_to_text, image_to_text.id())
@@ -28,12 +33,20 @@ def register_model_parsers() -> None:
AIConfigRuntime.register_model_parser(text_to_speech, text_to_speech.id())

text_generation = HuggingFaceTextGenerationTransformer()
AIConfigRuntime.register_model_parser(text_generation, text_generation.id())
AIConfigRuntime.register_model_parser(
text_generation, text_generation.id()
)
text_summarization = HuggingFaceTextSummarizationTransformer()
AIConfigRuntime.register_model_parser(text_summarization, text_summarization.id())
AIConfigRuntime.register_model_parser(
text_summarization, text_summarization.id()
)
text_translation = HuggingFaceTextTranslationTransformer()
AIConfigRuntime.register_model_parser(text_translation, text_translation.id())
AIConfigRuntime.register_model_parser(
text_translation, text_translation.id()
)

# Register remote inference client for text generation
text_generation_remote = HuggingFaceTextGenerationParser()
AIConfigRuntime.register_model_parser(text_generation_remote, text_generation_remote.id())
AIConfigRuntime.register_model_parser(
text_generation_remote, text_generation_remote.id()
)
47 changes: 37 additions & 10 deletions cookbooks/HuggingFace/hf.py
@@ -3,7 +3,10 @@

# HuggingFace API imports
from huggingface_hub import InferenceClient
from huggingface_hub.inference._text_generation import TextGenerationResponse, TextGenerationStreamResponse
from huggingface_hub.inference._text_generation import (
TextGenerationResponse,
TextGenerationStreamResponse,
)

# ModelParser Utils
# Type hint imports
@@ -104,7 +107,9 @@ def construct_stream_output(
return output


def construct_regular_output(response, response_includes_details: bool) -> Output:
def construct_regular_output(
response, response_includes_details: bool
) -> Output:
metadata = {}
data = response
if response_includes_details:
@@ -155,7 +160,9 @@ def __init__(self, model_id: str = None, use_api_token=True):
if use_api_token:
# You are allowed to use Hugging Face for a bit before you get
# rate limited, in which case you will receive a clear error
token = get_api_key_from_environment("HUGGING_FACE_API_TOKEN", required=False).unwrap()
token = get_api_key_from_environment(
"HUGGING_FACE_API_TOKEN", required=False
).unwrap()

self.client = InferenceClient(model_id, token=token)

@@ -165,7 +172,9 @@ def id(self) -> str:
"""
return "HuggingFaceTextParser"

def serialize(self, prompt_name: str, data: Any, ai_config: "AIConfigRuntime", parameters: Optional[Dict] = None, **kwargs) -> List[Prompt]:
def serialize(
self,
prompt_name: str,
data: Any,
ai_config: "AIConfigRuntime",
parameters: Optional[Dict] = None,
**kwargs
) -> List[Prompt]:
"""
Defines how a prompt and model inference settings get serialized in the .aiconfig.
@@ -188,7 +202,9 @@ def serialize(self, prompt_name: str, data: Any, ai_config: "AIConfigRuntime", p
prompt = Prompt(
name=prompt_name,
input=prompt_input,
metadata=PromptMetadata(model=model_metadata, parameters=parameters, **kwargs),
metadata=PromptMetadata(
model=model_metadata, parameters=parameters, **kwargs
),
)
return [prompt]

@@ -209,7 +225,9 @@ async def deserialize(
Returns:
dict: Model-specific completion parameters.
"""
resolved_prompt = resolve_prompt(prompt, params if params is not None else {}, aiconfig)
resolved_prompt = resolve_prompt(
prompt, params if params is not None else {}, aiconfig
)

# Build Completion data
model_settings = self.get_model_settings(prompt, aiconfig)
@@ -220,7 +238,9 @@

return completion_data

async def run_inference(self, prompt: Prompt, aiconfig, options, parameters) -> List[Output]:
async def run_inference(
self, prompt: Prompt, aiconfig, options, parameters
) -> List[Output]:
"""
Invoked to run a prompt in the .aiconfig. This method should perform
the actual model inference based on the provided prompt and inference settings.
@@ -232,10 +252,15 @@ async def run_inference(self, prompt: Prompt, aiconfig, options, parameters) ->
Returns:
InferenceResponse: The response from the model.
"""
completion_data = await self.deserialize(prompt, aiconfig, options, parameters)
completion_data = await self.deserialize(
prompt, aiconfig, options, parameters
)

# if stream enabled in runtime options and config, then stream. Otherwise don't stream.
stream = (options.stream if options else False) and (not "stream" in completion_data or completion_data.get("stream") != False)
stream = (options.stream if options else False) and (
not "stream" in completion_data
or completion_data.get("stream") != False
)

response = self.client.text_generation(**completion_data)
response_is_detailed = completion_data.get("details", False)
@@ -248,7 +273,9 @@ async def run_inference(self, prompt: Prompt, aiconfig, options, parameters) ->
outputs.append(output)
else:
# Handles stream callback
output = construct_stream_output(response, response_is_detailed, options)
output = construct_stream_output(
response, response_is_detailed, options
)
outputs.append(output)

prompt.outputs = outputs
43 changes: 34 additions & 9 deletions cookbooks/HuggingFace/python/hf.py
@@ -3,7 +3,10 @@

# HuggingFace API imports
from huggingface_hub import InferenceClient
from huggingface_hub.inference._text_generation import TextGenerationResponse, TextGenerationStreamResponse
from huggingface_hub.inference._text_generation import (
TextGenerationResponse,
TextGenerationStreamResponse,
)

# ModelParser Utils
# Type hint imports
@@ -104,7 +107,9 @@ def construct_stream_output(
return output


def construct_regular_output(response, response_includes_details: bool) -> Output:
def construct_regular_output(
response, response_includes_details: bool
) -> Output:
metadata = {}
data = response
if response_includes_details:
@@ -155,7 +160,9 @@ def __init__(self, model_id: str = None, use_api_token=True):
if use_api_token:
# You are allowed to use Hugging Face for a bit before you get
# rate limited, in which case you will receive a clear error
token = get_api_key_from_environment("HUGGING_FACE_API_TOKEN", required=False).unwrap()
token = get_api_key_from_environment(
"HUGGING_FACE_API_TOKEN", required=False
).unwrap()

self.client = InferenceClient(model_id, token=token)

@@ -165,7 +172,9 @@ def id(self) -> str:
"""
return "HuggingFaceTextParser"

def serialize(self, prompt_name: str, data: Any, ai_config: "AIConfigRuntime", parameters: Optional[Dict] = None, **kwargs) -> List[Prompt]:
def serialize(
self,
prompt_name: str,
data: Any,
ai_config: "AIConfigRuntime",
parameters: Optional[Dict] = None,
**kwargs
) -> List[Prompt]:
"""
Defines how a prompt and model inference settings get serialized in the .aiconfig.
@@ -188,7 +202,9 @@ def serialize(self, prompt_name: str, data: Any, ai_config: "AIConfigRuntime", p
prompt = Prompt(
name=prompt_name,
input=prompt_input,
metadata=PromptMetadata(model=model_metadata, parameters=parameters, **kwargs),
metadata=PromptMetadata(
model=model_metadata, parameters=parameters, **kwargs
),
)
return [prompt]

@@ -220,7 +236,9 @@ async def deserialize(

return completion_data

async def run_inference(self, prompt: Prompt, aiconfig, options, parameters) -> List[Output]:
async def run_inference(
self, prompt: Prompt, aiconfig, options, parameters
) -> List[Output]:
"""
Invoked to run a prompt in the .aiconfig. This method should perform
the actual model inference based on the provided prompt and inference settings.
@@ -232,10 +250,15 @@ async def run_inference(self, prompt: Prompt, aiconfig, options, parameters) ->
Returns:
InferenceResponse: The response from the model.
"""
completion_data = await self.deserialize(prompt, aiconfig, options, parameters)
completion_data = await self.deserialize(
prompt, aiconfig, options, parameters
)

# if stream enabled in runtime options and config, then stream. Otherwise don't stream.
stream = (options.stream if options else False) and (not "stream" in completion_data or completion_data.get("stream") != False)
stream = (options.stream if options else False) and (
not "stream" in completion_data
or completion_data.get("stream") != False
)

response = self.client.text_generation(**completion_data)
response_is_detailed = completion_data.get("details", False)
@@ -248,7 +271,9 @@ async def run_inference(self, prompt: Prompt, aiconfig, options, parameters) ->
outputs.append(output)
else:
# Handles stream callback
output = construct_stream_output(response, response_is_detailed, options)
output = construct_stream_output(
response, response_is_detailed, options
)
outputs.append(output)

prompt.outputs = outputs
4 changes: 3 additions & 1 deletion cookbooks/Wizard-GPT/wizard-gpt.py
@@ -20,7 +20,9 @@ async def main():
break

# Dynamically generate the prompt name and prompt object
new_prompt_name = f"prompt{len(config.prompts)+1}" # Prompt{number of prompts}
new_prompt_name = (
f"prompt{len(config.prompts)+1}" # Prompt{number of prompts}
)
new_prompt = Prompt(name=new_prompt_name, input=user_input)

# Add the new prompt and run the model