Merge pull request #422 from guardrails-ai/karan/sensitive-language
Add ToxicLanguage validator
zsimjee authored Dec 7, 2023
2 parents bf8286b + 3500799 commit 5de4cb6
Showing 8 changed files with 1,153 additions and 61 deletions.
206 changes: 206 additions & 0 deletions docs/examples/toxic_language.ipynb
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check whether an LLM-generated response contains toxic language\n",
"\n",
"### Using the `ToxicLanguage` validator\n",
"\n",
"This is a simple walkthrough of the `ToxicLanguage` validator. This validator checks whether an LLM-generated response contains toxic language. It uses the pre-trained multi-label model from HuggingFace -`unitary/unbiased-toxic-roberta` to check whether the generated text is toxic. It supports both full-text-level and sentence-level validation.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Import the guardrails package\n",
"# and the ToxicLanguage validator\n",
"import guardrails as gd\n",
"from guardrails.validators import ToxicLanguage\n",
"from rich import print"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Create a Guard object with this validator\n",
"# Here, we'll use the default validation method of \"sentence\"\n",
"# and the default threshold of 0.25\n",
"\n",
"guard = gd.Guard.from_string(\n",
" validators=[ToxicLanguage(on_fail=\"fix\")],\n",
" description=\"testmeout\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"What a lovely day it has been to roam around the city.\n",
"I love it here!\n",
"</pre>\n"
],
"text/plain": [
"\n",
"What a lovely day it has been to roam around the city.\n",
"I love it here!\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the raw response\n",
"raw_response = \"\"\"\n",
"What a lovely day it has been to roam around the city. I love it here!\n",
"I hate how pathetic and needy you can be sometimes.\n",
"\"\"\"\n",
"output = guard.parse(\n",
" llm_output=raw_response,\n",
")\n",
"\n",
"# Print the output\n",
"print(output)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As you can see here, the sentence `I hate how pathetic and needy you can be sometimes.` has been detected as toxic, and hence removed from the validated output.\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"I am a Generative AI model that is trained on a large corpus of text.\n",
"This is a very powerful tool for generating new text, but it can also be used to generate text that is offensive or\n",
"hateful.\n",
"</pre>\n"
],
"text/plain": [
"\n",
"I am a Generative AI model that is trained on a large corpus of text.\n",
"This is a very powerful tool for generating new text, but it can also be used to generate text that is offensive or\n",
"hateful.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the raw response\n",
"raw_response = \"\"\"\n",
"I am a Generative AI model that is trained on a large corpus of text.\n",
"I am shocked by how disgusting and vile you are.\n",
"This is a very powerful tool for generating new text, but it can also be used to generate text that is offensive or hateful.\n",
"\"\"\"\n",
"output = guard.parse(\n",
" llm_output=raw_response,\n",
")\n",
"\n",
"# Print the output\n",
"print(output)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Similarly, here the sentence `I am shocked by how disgusting and vile you are.` has been detected as toxic, and hence removed from the validated output.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Test with validation method 'full'\n",
"full_guard = gd.Guard.from_string(\n",
" validators=[ToxicLanguage(validation_method=\"full\", on_fail=\"fix\")],\n",
" description=\"testmeout\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Parse the raw response\n",
"raw_response = \"Stop being such a dumb piece of shit. Why can't you comprehend this?\"\n",
"output = full_guard.parse(\n",
" llm_output=raw_response,\n",
")\n",
"\n",
"# Print the output\n",
"print(output)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we're doing validation on the entire text, and toxic language was detected here - hence, the nothing is returned here.\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "lang",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
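For reference, the toxicity check behind `ToxicLanguage` can be sketched directly with the HuggingFace model named in the notebook. This is a minimal sketch, assuming the validator wraps transformers' text-classification pipeline; the exact pipeline arguments, return handling, and scoring logic inside the validator may differ, and `is_toxic` is a hypothetical helper used only for illustration:

from transformers import pipeline

# Multi-label toxicity model referenced in the notebook above.
toxicity_model = pipeline(
    "text-classification",
    model="unitary/unbiased-toxic-roberta",
    function_to_apply="sigmoid",  # independent probability per toxicity label
    top_k=None,                   # return scores for every label
)

def is_toxic(text: str, threshold: float = 0.25) -> bool:
    """Flag the text if any toxicity label crosses the threshold (0.25 is the notebook's default)."""
    results = toxicity_model(text)
    # Depending on the transformers version, a single string may come back as a
    # flat list of {"label", "score"} dicts or nested inside an outer list.
    if results and isinstance(results[0], list):
        results = results[0]
    return any(pred["score"] > threshold for pred in results)

print(is_toxic("What a lovely day it has been to roam around the city."))   # expected: False
print(is_toxic("I hate how pathetic and needy you can be sometimes."))      # expected: True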
3 changes: 3 additions & 0 deletions guardrails/validators/__init__.py
@@ -37,6 +37,7 @@
from guardrails.validators.similar_to_document import SimilarToDocument
from guardrails.validators.similar_to_list import SimilarToList
from guardrails.validators.sql_column_presence import SqlColumnPresence
from guardrails.validators.toxic_language import ToxicLanguage, pipeline
from guardrails.validators.two_words import TwoWords
from guardrails.validators.upper_case import UpperCase
from guardrails.validators.valid_choices import ValidChoices
@@ -76,11 +77,13 @@
"PIIFilter",
"SimilarToList",
"DetectSecrets",
"ToxicLanguage",
"CompetitorCheck",
# Validator helpers
"detect_secrets",
"AnalyzerEngine",
"AnonymizerEngine",
"pipeline",
# Base classes
"Validator",
"register_validator",
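With `ToxicLanguage` now exported from `guardrails.validators`, the validator can be configured away from the notebook's defaults. A minimal sketch, assuming the parameter names referenced in the notebook (`threshold`, `validation_method`, `on_fail`); the threshold value and description string here are arbitrary examples:

import guardrails as gd
from guardrails.validators import ToxicLanguage

# Stricter, sentence-level guard: lowering the threshold below the 0.25
# default flags milder toxicity; on_fail="fix" drops the failing sentences.
strict_guard = gd.Guard.from_string(
    validators=[
        ToxicLanguage(threshold=0.1, validation_method="sentence", on_fail="fix")
    ],
    description="toxicity check",
)

output = strict_guard.parse(llm_output="Some LLM response to screen.")
print(output)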
