From 43a917c4ec7ae200222e2192d67a9779422d0172 Mon Sep 17 00:00:00 2001 From: Joseph Catrambone Date: Wed, 30 Oct 2024 17:16:59 -0700 Subject: [PATCH 1/2] Update validator, fixing a handful of imports that were incompatible with new guardrails. Had to pull out the word-by-word fixes in favor of more general exceptions. Updated doc and fixed descriptions. --- README.md | 10 +++---- pyproject.toml | 9 +++--- tests/test_validator.py | 19 ++++++------- validator/__init__.py | 4 +-- validator/main.py | 60 +++++++++++++++++++++++++++++++-------- validator/post-install.py | 6 ++-- 6 files changed, 71 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index d9fcf34..e0ebdc6 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ This validator can be used to ensure fairness of model output across various dem ## Requirements * Dependencies: - - guardrails-ai>=0.4.0 - - dbias>=0.1.0 + - guardrails-ai>=0.5.0 + - transformers>=4.40 * Dev Dependencies: - pytest @@ -25,7 +25,7 @@ This validator can be used to ensure fairness of model output across various dem - ruff * Foundation model access keys: - - Dependent on the use case (rephrase if unclear) + - None (basic HuggingFace hub access) # Installation @@ -46,8 +46,8 @@ from guardrails.hub import BiasCheck from guardrails import Guard # Setup Guard -guard = Guard.use( - BiasCheck() +guard = Guard().use( + BiasCheck(threshold=0.9, on_fail="exception") ) guard.validate("The movie was great!") # Validator passes diff --git a/pyproject.toml b/pyproject.toml index 7722402..d284282 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] -name = "validator-template" -version = "0.0.0" -description = "Template repo for Guardrails Hub validators." +name = "bias_check" +version = "0.2.0" +description = "Wrapping a bias check model into a validator." authors = [ {name = "Guardrails AI", email = "contact@guardrailsai.com"} ] @@ -10,7 +10,8 @@ readme = "README.md" requires-python = ">= 3.8.1" dependencies = [ "guardrails-ai>=0.4.0", - "Dbias" + "transformers>=4.40.2", + "tensorflow>=2.18.0" ] [project.optional-dependencies] diff --git a/tests/test_validator.py b/tests/test_validator.py index df965fb..92e6115 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1,25 +1,22 @@ -# to run these, run -# make tests -from guardrails import Guard -import pytest -from validator import BiasCheck from guardrails.validator_base import FailResult, PassResult +from validator import BiasCheck + # We use 'exception' as the validator's fail action, # so we expect failures to always raise an Exception # Learn more about corrective actions here: # https://www.guardrailsai.com/docs/concepts/output/#%EF%B8%8F-specifying-corrective-actions -def test_success_case(self): - validator = BiasCheck(debias_strength=0.5) +def test_success_case(): + validator = BiasCheck() input_text = "The sun rises in the morning." result = validator.validate(input_text, {}) assert isinstance(result, PassResult) -def test_failure_case(self): - validator = BiasCheck(debias_strength=0.5) +def test_failure_case(): + validator = BiasCheck(threshold=0.5) input_text = "The sun only rises for Humanists." result = validator.validate(input_text, {}) assert isinstance(result, FailResult) - assert result.error_message == "The original response contains potential biases that are now addressed." - assert result.fix_value == "The sun rises for everyone." 
\ No newline at end of file + assert result.error_message == "The original response contains potentially biased messages:\nThe sun only rises for Humanists.\n (Message scores: 0.6546186208724976)" + assert result.fix_value == "" \ No newline at end of file diff --git a/validator/__init__.py b/validator/__init__.py index 72a2623..7a0a40d 100644 --- a/validator/__init__.py +++ b/validator/__init__.py @@ -1,3 +1,3 @@ -from .main import ValidatorTemplate +from .main import BiasCheck -__all__ = ["ValidatorTemplate"] +__all__ = ["BiasCheck"] diff --git a/validator/main.py b/validator/main.py index a758059..ab14baf 100644 --- a/validator/main.py +++ b/validator/main.py @@ -8,8 +8,7 @@ register_validator, ) -import Dbias -from Dbias import text_debiasing +from transformers import pipeline @register_validator(name="guardrails/bias_check", data_type="string") class BiasCheck(Validator): @@ -24,24 +23,61 @@ class BiasCheck(Validator): | Programmatic fix | The debiased text if bias is detected | Args: - debias_strength (float): The strength of the bias to apply, ranging from 0 to 1. - on_fail (Callable): The policy to enact when a validator fails. If `str`, must be one of `reask`, `fix`, `filter`, `refrain`, `noop`, `exception` or `fix_reask`. Otherwise, must be a function that is called when the validator fails. + threshold (float): Higher is more likely to allow bias. Lower is more sensitive and more likely to flag biased messages. + on_fail (Callable): The policy to enact when a validator fails. If `str`, must be one of `filter`, `noop`, or `exception`. Otherwise, must be a function that is called when the validator fails. """ # noqa def __init__( self, - debias_strength: float = 0.5, + threshold: float = 0.9, on_fail: Optional[Callable] = None, ): - super().__init__(on_fail=on_fail, debias_strength=debias_strength) - self.debias_strength = debias_strength + super().__init__(on_fail=on_fail) + valid_on_fail_operations = {"filter", "noop", "exception"} + if isinstance(on_fail, str) and on_fail not in valid_on_fail_operations: + raise Exception( + f"on_fail value ({on_fail}) not in list of allowable operations: {valid_on_fail_operations}" + ) + self.threshold = threshold + self.model = pipeline( + 'text-classification', + model="d4data/bias-detection-model", + ) + # There are some spurious loading complaints with TFDistilBert models. + # See https://discuss.huggingface.co/t/message-some-layers-from-the-model-were-not-used/1972/7 - def validate(self, value: Any, metadata: Dict = {}) -> ValidationResult: + def validate(self, value: Any, metadata: Dict = None) -> ValidationResult: """Validates that the text is free from biases related to age, gender, sex, ethnicity, religion, etc.""" - debiased_value = Dbias.text_debiasing.debias_text(value, strength=self.debias_strength) - if value != debiased_value: + if isinstance(value, str): + value = [value,] # Ensure we're always passing lists of strings into the classifier. + + classified_examples = self.model(value) + passing_outputs = list() + passing_scores = list() + failing_outputs = list() + failing_scores = list() + for text, prediction in zip(value, classified_examples): + if prediction['label'] == 'Biased': + score = prediction['score'] + elif prediction['label'] == 'Non-biased': + score = -prediction['score'] # Note the negation! 
+ else: + raise Exception(f"Got unexpected prediction label: {prediction['label']}") + if score > self.threshold: + failing_outputs.append(text) + failing_scores.append(score) + else: + passing_outputs.append(text) + passing_scores.append(score) + + if failing_outputs: + failure_message = "The original response contains potentially biased messages:\n" + failure_message += "\n - ".join(failing_outputs) + message_scores = [str(s) for s in failing_scores] + failure_message += "\n (Message scores: {})".format(", ".join(message_scores)) + # Do we need to call the on_fail_method here? return FailResult( - error_message="The original response contains potential biases that are now addressed.", - fix_value=debiased_value, + error_message=failure_message, + fix_value=" ".join(passing_outputs), ) return PassResult() diff --git a/validator/post-install.py b/validator/post-install.py index 536a250..b6ae976 100644 --- a/validator/post-install.py +++ b/validator/post-install.py @@ -1,4 +1,4 @@ print("post-install starting...") -print("This is where you would do things like download nltk tokenizers or login to the HuggingFace hub...") -print("post-install complete!") -# If you don't have anything to add here you should delete this file. \ No newline at end of file +from transformers import pipeline +_ = pipeline("text-classification", "d4data/bias-detection-model") +print("post-install complete!") \ No newline at end of file From 05e89b67e39f3ee25d2c70bea00cef26fd628cfc Mon Sep 17 00:00:00 2001 From: Joseph Catrambone Date: Thu, 31 Oct 2024 10:59:34 -0700 Subject: [PATCH 2/2] Fix linting complaints. --- validator/main.py | 2 +- validator/post-install.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/validator/main.py b/validator/main.py index ab14baf..7253fc6 100644 --- a/validator/main.py +++ b/validator/main.py @@ -46,7 +46,7 @@ def __init__( # There are some spurious loading complaints with TFDistilBert models. # See https://discuss.huggingface.co/t/message-some-layers-from-the-model-were-not-used/1972/7 - def validate(self, value: Any, metadata: Dict = None) -> ValidationResult: + def validate(self, value: Any, metadata: Optional[Dict] = None) -> ValidationResult: """Validates that the text is free from biases related to age, gender, sex, ethnicity, religion, etc.""" if isinstance(value, str): value = [value,] # Ensure we're always passing lists of strings into the classifier. diff --git a/validator/post-install.py b/validator/post-install.py index b6ae976..f4879ff 100644 --- a/validator/post-install.py +++ b/validator/post-install.py @@ -1,4 +1,4 @@ -print("post-install starting...") from transformers import pipeline +print("post-install starting...") _ = pipeline("text-classification", "d4data/bias-detection-model") print("post-install complete!") \ No newline at end of file
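
A minimal sketch of how the new scoring in validate() works, kept outside the patch for reference. The model checkpoint and the 'Biased'/'Non-biased' labels are taken from the diff above; the example sentence and the ~0.65 score come from the updated unit test:

    from transformers import pipeline

    # Same checkpoint the validator loads in __init__ and warms in post-install.py.
    classifier = pipeline("text-classification", model="d4data/bias-detection-model")

    prediction = classifier(["The sun only rises for Humanists."])[0]

    # A 'Biased' prediction keeps its positive confidence; a 'Non-biased' one is
    # negated, so a single threshold separates the two label/score combinations.
    if prediction["label"] == "Biased":
        score = prediction["score"]
    else:
        score = -prediction["score"]

    print(score > 0.5)  # True in the unit test, which pins the score at ~0.6546.

The negation is the key design choice: a non-biased prediction always lands below the threshold no matter how confident it is, so only confident 'Biased' predictions fail validation.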
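
And a hedged sketch of the stricter on_fail handling when the validator is wired into a Guard, mirroring the README example in the diff. Imports assume a local checkout of this repo; a Hub install would use "from guardrails.hub import BiasCheck" instead, as the README shows. The failing sentence comes from the unit tests:

    from guardrails import Guard
    from validator import BiasCheck

    # on_fail must now be one of "filter", "noop", or "exception";
    # any other string raises at construction time.
    guard = Guard().use(BiasCheck(threshold=0.5, on_fail="exception"))

    guard.validate("The sun rises in the morning.")  # Passes.

    try:
        guard.validate("The sun only rises for Humanists.")
    except Exception as err:
        # The message lists the flagged sentences and their scores, e.g.
        # "The original response contains potentially biased messages: ..."
        print(err)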