diff --git a/docs/getting_started/quickstart/quickstart_llm.ipynb b/docs/getting_started/quickstart/quickstart_llm.ipynb index face6616a9..af63bc1293 100644 --- a/docs/getting_started/quickstart/quickstart_llm.ipynb +++ b/docs/getting_started/quickstart/quickstart_llm.ipynb @@ -269,7 +269,7 @@ " \"According to the IPCC report, what are key risks in the Europe?\",\n", " \"Is sea level rise avoidable? When will it stop?\",\n", "]\n", - "giskard_dataset = giskard.Dataset(pd.DataFrame({\"question\": examples}))\n", + "giskard_dataset = giskard.Dataset(pd.DataFrame({\"question\": examples}), target=None)\n", "\n", "print(giskard_model.predict(giskard_dataset).prediction)" ] @@ -320,8 +320,7 @@ "Scan completed: 2 issues found. (Took 0:03:11.962467)\n", "LLM-assisted detectors have used the following resources:\n", "OpenAI GPT-4 calls for evaluation: 22 (10608 prompt tokens and 1526 sampled tokens)\n", - "OpenAI API costs for evaluation amount to $0.41 (standard pricing).\n", - "\n" + "OpenAI API costs for evaluation amount to $0.41 (standard pricing).\n" ] } ], diff --git a/docs/integrations/wandb/wandb-llm-example.ipynb b/docs/integrations/wandb/wandb-llm-example.ipynb index 8f67a77ee5..27c5418b11 100644 --- a/docs/integrations/wandb/wandb-llm-example.ipynb +++ b/docs/integrations/wandb/wandb-llm-example.ipynb @@ -57,6 +57,7 @@ "cell_type": "code", "source": [ "import wandb\n", + "\n", "wandb.login(key=\"key to retrieve from https://wandb.ai/authorize\")" ], "metadata": { @@ -184,7 +185,7 @@ "outputs": [], "source": [ "models = {\"gpt-3.5-turbo\": {\"langchain\": None, \"giskard\": None, \"scan_report\": None, \"test_suite\": None},\n", - " \"gpt-4\": {\"langchain\": None, \"giskard\": None, \"scan_report\": None, \"test_suite\": None},}" + " \"gpt-4\": {\"langchain\": None, \"giskard\": None, \"scan_report\": None, \"test_suite\": None}, }" ], "metadata": { "id": "83I0M300lI6V" @@ -207,7 +208,6 @@ "from langchain.chains import SequentialChain\n", "\n", "for model in 
models.keys():\n", - "\n", " # langchain model powered by ChatGPT\n", " llm = ChatOpenAI(temperature=0.2, model=model)\n", "\n", @@ -216,8 +216,9 @@ " product_chain = LLMChain(llm=llm, prompt=product_prompt_template, output_key=\"description\")\n", "\n", " # Concatenation of both chains\n", - " models[model][\"langchain\"] = SequentialChain(chains=[keywords_chain, product_chain], input_variables=[\"product_name\"],\n", - " output_variables=[\"description\"])" + " models[model][\"langchain\"] = SequentialChain(chains=[keywords_chain, product_chain],\n", + " input_variables=[\"product_name\"],\n", + " output_variables=[\"description\"])" ], "metadata": { "id": "MBxfN87aN2Gc" @@ -247,10 +248,12 @@ "import giskard\n", "\n", "for model in models.keys():\n", - " models[model][\"giskard\"] = giskard.Model(models[model][\"langchain\"], name=\"Product keywords and description generator\", model_type=\"text_generation\",\n", - " description=\"Generate product description based on a product's name and the associated keywords. \"\n", - " \"Description should be using emojis and being SEO compliant.\",\n", - " feature_names=['product_name'])" + " models[model][\"giskard\"] = giskard.Model(models[model][\"langchain\"],\n", + " name=\"Product keywords and description generator\",\n", + " model_type=\"text_generation\",\n", + " description=\"Generate product description based on a product's name and the associated keywords. 
\"\n", + " \"Description should be using emojis and being SEO compliant.\",\n", + " feature_names=['product_name'])" ], "metadata": { "id": "FTGiW_RROFfD" @@ -271,6 +274,7 @@ "cell_type": "code", "source": [ "import pandas as pd\n", + "\n", "pd.set_option(\"display.max_colwidth\", 999)\n", "\n", "dataset = giskard.Dataset(pd.DataFrame({\n", @@ -278,7 +282,7 @@ " \"Automatic Plant Watering System\",\n", " \"Miniature Exercise Equipment\"],\n", "\n", - "}), name=\"Test dataset\",\n", + "}), name=\"Test dataset\", target=None,\n", " column_types={\"product_name\": \"text\"})" ], "metadata": { @@ -300,14 +304,15 @@ "cell_type": "code", "source": [ "import wandb\n", + "\n", "run = wandb.init(project=os.environ[\"WANDB_PROJECT\"], name=\"examples\")\n", "predictions = models[\"gpt-3.5-turbo\"][\"giskard\"].predict(dataset).prediction\n", "for k, v in dataset.df.product_name.to_dict().items():\n", - " os.environ[\"WANDB_NAME\"] = \"examples_\"+str(k)\n", - " print(\"Example #\", k+1)\n", - " print(\"product_name (input):\", v)\n", - " print(\"product_description (output):\", predictions[k])\n", - " print(\"--------------------------------------------------------------------\")\n", + " os.environ[\"WANDB_NAME\"] = \"examples_\" + str(k)\n", + " print(\"Example #\", k + 1)\n", + " print(\"product_name (input):\", v)\n", + " print(\"product_description (output):\", predictions[k])\n", + " print(\"--------------------------------------------------------------------\")\n", "run.finish()" ], "metadata": { @@ -333,22 +338,22 @@ "cell_type": "code", "source": [ "for model in models.keys():\n", - "\t# Initiate a new run with the foundational model name inside the W&B project\n", + " # Initiate a new run with the foundational model name inside the W&B project\n", " run = wandb.init(project=os.environ[\"WANDB_PROJECT\"], name=model)\n", "\n", " # Scan report\n", - "\t# 1) Generate\n", + " # 1) Generate\n", " models[model]['scan_report'] = giskard.scan(models[model]['giskard'], 
dataset, raise_exceptions=True)\n", " # 2) Log into W&B\n", " models[model]['scan_report'].to_wandb(run)\n", "\n", " # Test suite\n", - "\t# 1) Generate\n", + " # 1) Generate\n", " models[model]['test_suite'] = models[model]['scan_report'].generate_test_suite()\n", " # 2) Log into W&B\n", " models[model]['test_suite'].run().to_wandb(run)\n", "\n", - "\t# End W&B run\n", + " # End W&B run\n", " run.finish()" ], "metadata": { diff --git a/docs/reference/notebooks/LLM_Description_Product.ipynb b/docs/reference/notebooks/LLM_Description_Product.ipynb index 6d2ea62750..66ddf15e94 100644 --- a/docs/reference/notebooks/LLM_Description_Product.ipynb +++ b/docs/reference/notebooks/LLM_Description_Product.ipynb @@ -275,7 +275,7 @@ " \"Miniature Exercise Equipment\"\n", "]\n", "\n", - "giskard_dataset = Dataset(pd.DataFrame({TEXT_COLUMN_NAME: corpus}))" + "giskard_dataset = Dataset(pd.DataFrame({TEXT_COLUMN_NAME: corpus}), target=None)" ], "metadata": { "id": "FTGiW_RROFfD" diff --git a/docs/reference/notebooks/LLM_Newspaper_Comment_Generation.ipynb b/docs/reference/notebooks/LLM_Newspaper_Comment_Generation.ipynb index 27ac310821..bf86dcfff2 100644 --- a/docs/reference/notebooks/LLM_Newspaper_Comment_Generation.ipynb +++ b/docs/reference/notebooks/LLM_Newspaper_Comment_Generation.ipynb @@ -220,7 +220,7 @@ "execution_count": 5, "outputs": [], "source": [ - "giskard_dataset = Dataset(df_filtered)" + "giskard_dataset = Dataset(df_filtered, target=None)" ], "metadata": { "collapsed": false, diff --git a/docs/reference/notebooks/LLM_QA_Google.ipynb b/docs/reference/notebooks/LLM_QA_Google.ipynb index b9e144e9f4..e8630ac978 100644 --- a/docs/reference/notebooks/LLM_QA_Google.ipynb +++ b/docs/reference/notebooks/LLM_QA_Google.ipynb @@ -115,20 +115,20 @@ "execution_count": 4, "outputs": [], "source": [ - "import os\n", "import json\n", + "import os\n", "from pathlib import Path\n", "\n", "import openai\n", "import pandas as pd\n", "from langchain import OpenAI\n", - "from 
qdrant_client import QdrantClient\n", + "from langchain.chains import load_chain, RetrievalQA\n", "from langchain.chains.base import Chain\n", - "from langchain.vectorstores import Qdrant\n", - "from langchain.prompts import PromptTemplate\n", "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.chains import load_chain, RetrievalQA\n", + "from langchain.prompts import PromptTemplate\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.vectorstores import Qdrant\n", + "from qdrant_client import QdrantClient\n", "\n", "from giskard import Model, Dataset, scan, GiskardClient" ], @@ -276,7 +276,7 @@ "outputs": [], "source": [ "raw_data = pd.DataFrame({TEXT_COLUMN_NAME: questions[:5]})\n", - "giskard_dataset = Dataset(raw_data)" + "giskard_dataset = Dataset(raw_data, target=None)" ], "metadata": { "collapsed": false @@ -312,8 +312,8 @@ " db = Qdrant.from_documents(docs, OpenAIEmbeddings(), host=\"localhost\",\n", " collection_name=\"google answers\", force_recreate=True)\n", " return db\n", - " \n", - " \n", + "\n", + "\n", "# Create the chain.\n", "llm = OpenAI(model_name=LLM_NAME, temperature=0)\n", "prompt_template = PromptTemplate(template=PROMPT_TEMPLATE, input_variables=[\"context\", \"question\"])\n", @@ -358,17 +358,17 @@ "\n", " def save_model(self, path: str):\n", " out_dest = Path(path)\n", - " \n", + "\n", " # Save the chain object\n", " self.model.save(out_dest.joinpath(\"model.json\"))\n", "\n", " # Save the Qdrant connection details\n", " db = self.model.retriever.vectorstore\n", - " \n", + "\n", " qdrant_meta = {\n", " \"collection_name\": db.collection_name,\n", " }\n", - " \n", + "\n", " with out_dest.joinpath(\"qdrant.json\").open(\"w\") as f:\n", " json.dump(qdrant_meta, f)\n", "\n", @@ -379,12 +379,12 @@ " # Load the FAISS-based retriever\n", " with src.joinpath(\"qdrant.json\").open(\"r\") as f:\n", " qdrant_meta = json.load(f)\n", - " \n", + "\n", " client = QdrantClient(\n", " 
\"localhost\",\n", " api_key=None,\n", " )\n", - " \n", + "\n", " db = Qdrant(\n", " client,\n", " collection_name=qdrant_meta[\"collection_name\"],\n", @@ -398,10 +398,12 @@ "\n", "# Wrap the QA chain.\n", "giskard_model = QdrantRAGModel(\n", - " model=google_qa_chain, # A prediction function that encapsulates all the data pre-processing steps and that could be executed with the dataset used by the scan.\n", + " model=google_qa_chain,\n", + " # A prediction function that encapsulates all the data pre-processing steps and that could be executed with the dataset used by the scan.\n", " model_type=\"text_generation\", # Either regression, classification or text_generation.\n", " name=\"The LLM, which knows different facts\", # Optional.\n", - " description=\"This model knows different facts about movies, history, news, etc. It provides short single-sentence summary answer only. This model politely refuse if it does not know an answer.\", # Is used to generate prompts during the scan.\n", + " description=\"This model knows different facts about movies, history, news, etc. It provides short single-sentence summary answer only. This model politely refuse if it does not know an answer.\",\n", + " # Is used to generate prompts during the scan.\n", " feature_names=[TEXT_COLUMN_NAME] # Default: all columns of your dataset.\n", ")" ], diff --git a/docs/reference/notebooks/LLM_QA_IPCC.ipynb b/docs/reference/notebooks/LLM_QA_IPCC.ipynb index fdeffdf76c..df718ac9c6 100644 --- a/docs/reference/notebooks/LLM_QA_IPCC.ipynb +++ b/docs/reference/notebooks/LLM_QA_IPCC.ipynb @@ -324,7 +324,7 @@ " \"According to the IPCC report, what are key risks in the Europe?\",\n", " \"Is sea level rise avoidable? 
When will it stop?\"\n", " ]\n", "-}))" +"}), target=None)" ] }, { diff --git a/docs/reference/notebooks/LLM_QA_Winter_Olympics.ipynb index 315a3b9b11..89cf36a67c 100644 --- a/docs/reference/notebooks/LLM_QA_Winter_Olympics.ipynb +++ b/docs/reference/notebooks/LLM_QA_Winter_Olympics.ipynb @@ -348,7 +348,7 @@ "]\n", "\n", "raw_data = pd.DataFrame(data={TEXT_COLUMN_NAME: corpus})\n", - "giskard_dataset = Dataset(raw_data)\n", + "giskard_dataset = Dataset(raw_data, target=None)\n", "\n", "\n", "# Wrap the model.\n", diff --git a/giskard/core/core.py b/giskard/core/core.py index f98889646a..1582dd0b10 100644 --- a/giskard/core/core.py +++ b/giskard/core/core.py @@ -24,6 +24,37 @@ class Kwargs: pass +_T = TypeVar("_T") + + +# Sentinel class used until PEP 0661 is accepted +class NotGiven: + """ + A sentinel singleton class used to distinguish omitted keyword arguments + from those passed in with the value None (which may have different behavior). + + For example: + + ```py + def get(timeout: Union[int, NotGiven, None] = NotGiven()) -> Response: ... + + get(timeout=1) # 1s timeout + get(timeout=None) # No timeout + get() # Default timeout behavior, which may not be statically known at the method definition. 
+ ``` + """ + + def __bool__(self) -> Literal[False]: + return False + + def __repr__(self) -> str: + return "NOT_GIVEN" + + +NotGivenOr = Union[_T, NotGiven] +NOT_GIVEN = NotGiven() + + def _get_plugin_method_full_name(func): from giskard.ml_worker.testing.registry.registry import plugins_root diff --git a/giskard/core/dataset_validation.py b/giskard/core/dataset_validation.py index 611e8676c7..3a3193d2ca 100644 --- a/giskard/core/dataset_validation.py +++ b/giskard/core/dataset_validation.py @@ -1,6 +1,5 @@ -from typing import Hashable - import pandas as pd +from typing import Hashable from giskard.client.python_utils import warning from giskard.core.core import SupportedColumnTypes @@ -8,8 +7,14 @@ from giskard.datasets.base import Dataset +def validate_dataset(ds: Dataset): + validate_dtypes(ds) + validate_target_exists(ds) + validate_optional_target(ds) + + def validate_optional_target(ds: Dataset): - if ds.target is None: + if not ds.is_target_given: warning( "You did not provide the optional argument 'target'. " "'target' is the column name in df corresponding to the actual target variable (ground truth)." 
diff --git a/giskard/core/model_validation.py b/giskard/core/model_validation.py index d848bf07de..92a78dbc59 100644 --- a/giskard/core/model_validation.py +++ b/giskard/core/model_validation.py @@ -1,9 +1,9 @@ import tempfile -from typing import Any, Callable, Iterable, List, Optional, Union import numpy as np import pandas as pd import yaml +from typing import List, Iterable, Union, Callable, Any, Optional from giskard.client.python_utils import warning from giskard.core.core import ModelMeta, ModelType, SupportedModelTypes @@ -14,14 +14,11 @@ from ..utils import fullname from ..utils.analytics_collector import analytics, get_dataset_properties, get_model_properties -from .dataset_validation import validate_optional_target @configured_validate_arguments def validate_model(model: BaseModel, validate_ds: Optional[Dataset] = None, print_validation_message: bool = True): try: - if model.meta.model_type != SupportedModelTypes.TEXT_GENERATION and validate_ds is not None: - validate_optional_target(validate_ds) _do_validate_model(model, validate_ds) except (ValueError, TypeError) as err: _track_validation_error(err, model, validate_ds) diff --git a/giskard/datasets/base/__init__.py b/giskard/datasets/base/__init__.py index b55a5bb6b3..08fca1feaf 100644 --- a/giskard/datasets/base/__init__.py +++ b/giskard/datasets/base/__init__.py @@ -5,7 +5,6 @@ import uuid from functools import cached_property from pathlib import Path -from typing import Dict, Hashable, List, Optional, Union import numpy as np import pandas @@ -13,13 +12,14 @@ import yaml from mlflow import MlflowClient from pandas.api.types import is_list_like, is_numeric_dtype +from typing import Dict, Hashable, List, Optional, Union from xxhash import xxh3_128_hexdigest from zstandard import ZstdDecompressor from giskard.client.giskard_client import GiskardClient from giskard.client.io_utils import compress, save_df from giskard.client.python_utils import warning -from giskard.core.core import DatasetMeta, 
SupportedColumnTypes +from giskard.core.core import DatasetMeta, SupportedColumnTypes, NOT_GIVEN, NotGivenOr from giskard.core.errors import GiskardImportError from giskard.core.validation import configured_validate_arguments from giskard.ml_worker.testing.registry.slicing_function import SlicingFunction, SlicingFunctionType @@ -144,7 +144,7 @@ class Dataset(ColumnMetadataMixin): """ name: Optional[str] - target: Optional[str] + _target: NotGivenOr[Optional[str]] column_types: Dict[str, str] df: pd.DataFrame id: uuid.UUID @@ -156,7 +156,7 @@ def __init__( self, df: pd.DataFrame, name: Optional[str] = None, - target: Optional[Hashable] = None, + target: NotGivenOr[Optional[Hashable]] = NOT_GIVEN, cat_columns: Optional[List[str]] = None, column_types: Optional[Dict[Hashable, str]] = None, id: Optional[uuid.UUID] = None, @@ -169,7 +169,7 @@ def __init__( Args: df (pd.DataFrame): The input dataset as a pandas DataFrame. name (Optional[str]): The name of the dataset. - target (Optional[str]): The column name in df corresponding to the actual target variable (ground truth). + target (Optional[str]): The column name in df corresponding to the actual target variable (ground truth). The target needs to be explicitly set to `None` if the dataset doesn't have any target variable. cat_columns (Optional[List[str]]): A list of column names that are categorical. column_types (Optional[Dict[str, str]]): A dictionary mapping column names to their types. id (Optional[uuid.UUID]): A UUID that uniquely identifies this dataset. 
@@ -186,13 +186,12 @@ def __init__( self.name = name self.df = pd.DataFrame(df) - self.target = target + self._target = target if validation: - from giskard.core.dataset_validation import validate_dtypes, validate_target_exists + from giskard.core.dataset_validation import validate_dataset - validate_dtypes(self) - validate_target_exists(self) + validate_dataset(self) self.column_dtypes = self.extract_column_dtypes(self.df) @@ -229,7 +228,13 @@ def __init__( logger.info("Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.") - self.data_processor = DataProcessor() + @property + def is_target_given(self) -> bool: + return self._target is not NOT_GIVEN + + @property + def target(self) -> Optional[str]: + return self._target or None def add_slicing_function(self, slicing_function: SlicingFunction): """ diff --git a/tests/integrations/test_mlflow.py b/tests/integrations/test_mlflow.py index eb9bfc3dfb..08ac8897de 100644 --- a/tests/integrations/test_mlflow.py +++ b/tests/integrations/test_mlflow.py @@ -79,7 +79,7 @@ def test_errors(dataset_name, model_name, request): # dataset type error dataset_copy = dataset.copy() dataset_copy.df = [[0.6, 0.4]] - dataset_copy.target = [1] + dataset_copy._target = [1] with pytest.raises(Exception) as e: _evaluate(dataset_copy, model, evaluator_config) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 9426c1a6b1..01ec0fc599 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,19 +1,17 @@ -import pandas as pd -import numpy as np -from pydantic import ValidationError -import pytest import uuid +import numpy as np +import pandas as pd +import pytest import requests_mock +from pydantic import ValidationError -from giskard.datasets.base import Dataset -from giskard.core.dataset_validation import validate_optional_target from giskard.client.dtos import DatasetMetaInfo - +from giskard.core.dataset_validation import validate_optional_target +from giskard.datasets.base import 
Dataset from tests import utils from tests.communications.test_dto_serialization import is_required, get_fields, get_name - # FIXME: conflict on `name` between Giskard Hub (@NotBlank) and Python client (optional in DatasetMeta and DatasetMetaInfo) MANDATORY_FIELDS = [ "id", @@ -57,8 +55,7 @@ def test_factory(): assert isinstance(my_dataset, Dataset) -def test_valid_df_column_types(): - # Option 0: none of column_types, cat_columns, infer_column_types = True are provided +def test_validate_optional_target(): with pytest.warns( UserWarning, match=r"You did not provide the optional argument 'target'\. 'target' is the column name " @@ -66,6 +63,20 @@ def test_valid_df_column_types(): ): my_dataset = Dataset(valid_df) validate_optional_target(my_dataset) + + with pytest.warns(None) as record: + my_dataset = Dataset(valid_df, target=None) + validate_optional_target(my_dataset) + + my_dataset = Dataset(valid_df, target="text_column") + validate_optional_target(my_dataset) + + assert len(record) == 0 + + +def test_valid_df_column_types(): + # Option 0: none of column_types, cat_columns, infer_column_types = True are provided + my_dataset = Dataset(valid_df) assert my_dataset.column_types == { "categorical_column": "category", "text_column": "text", @@ -158,7 +169,6 @@ def test_numeric_column_names(): def test_infer_column_types(): - # if df_size >= 100 ==> category_threshold = floor(log10(df_size)) assert Dataset(pd.DataFrame({"f": [1, 2] * 50})).column_types["f"] == "category" assert Dataset(pd.DataFrame({"f": ["a", "b"] * 50})).column_types["f"] == "category" @@ -228,8 +238,9 @@ def test_dataset_meta_info(): mandatory_field_names = [] optional_field_names = [] for name, field in get_fields(klass).items(): - mandatory_field_names.append(get_name(name, field)) if is_required(field) else \ - optional_field_names.append(get_name(name, field)) + mandatory_field_names.append(get_name(name, field)) if is_required(field) else optional_field_names.append( + get_name(name, field) 
+ ) assert set(mandatory_field_names) == set(MANDATORY_FIELDS) assert set(optional_field_names) == set(OPTIONAL_FIELDS) @@ -242,7 +253,9 @@ def test_fetch_dataset_meta(request): with utils.MockedClient(mock_all=False) as (client, mr): meta_info = utils.mock_dataset_meta_info(dataset, project_key) meta_info.pop(op) - mr.register_uri(method=requests_mock.GET, url=utils.get_url_for_dataset(dataset, project_key), json=meta_info) + mr.register_uri( + method=requests_mock.GET, url=utils.get_url_for_dataset(dataset, project_key), json=meta_info + ) # Should not raise client.load_dataset_meta(project_key, uuid=str(dataset.id)) @@ -251,7 +264,9 @@ def test_fetch_dataset_meta(request): with utils.MockedClient(mock_all=False) as (client, mr): meta_info = utils.mock_dataset_meta_info(dataset, project_key) meta_info.pop(op) - mr.register_uri(method=requests_mock.GET, url=utils.get_url_for_dataset(dataset, project_key), json=meta_info) + mr.register_uri( + method=requests_mock.GET, url=utils.get_url_for_dataset(dataset, project_key), json=meta_info + ) # Should raise due to missing of values with pytest.raises(ValidationError):