Skip to content

Commit

Permalink
Fix for guardrails - previous version returned constant false positives
Browse files Browse the repository at this point in the history
  • Loading branch information
w4ffl35 committed Jan 28, 2024
1 parent bd6bc08 commit 5ff85a3
Show file tree
Hide file tree
Showing 6 changed files with 432 additions and 234 deletions.
88 changes: 31 additions & 57 deletions src/airunner/aihandler/casual_lm_transfformer_base_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from llama_index import VectorStoreIndex, SimpleDirectoryReader

from airunner.aihandler.tokenizer_handler import TokenizerHandler
from airunner.enums import SignalCode, SelfReflectionCategory
from airunner.enums import SignalCode


class CasualLMTransformerBaseHandler(TokenizerHandler):
Expand Down Expand Up @@ -125,78 +125,47 @@ def load_query_engine_from_disk(self):
streaming=True
)

def prepare_messages(self):
"""
This is currently crafted for mistralai/Mistral-7B-Instruct-v0.1
(specifically the guardrails, self-reflection, and prompt template).
It will need to be updated or overridden for other models.
:return:
"""
optional_self_reflection = {}
optional_self_reflection[SelfReflectionCategory.ILLEGAL] = "illegal: Illegal activity."
optional_self_reflection[SelfReflectionCategory.HATE_VIOLENCE_HARASSMENT] = "hate violence harassment: Generation of hateful, harassing, or violent content: content that expresses, incites, or promotes hate based on identity, content that intends to harass, threaten, or bully an individual, content that promotes or glorifies violence or celebrates the suffering or humiliation of others."
optional_self_reflection[SelfReflectionCategory.MALWARE] = "malware: Generation of malware: content that attempts to generate code that is designed to disrupt, damage, or gain unauthorized access to a computer system."
optional_self_reflection[SelfReflectionCategory.PHYSICAL_HARM] = "physical harm: activity that has high risk of physical harm, including: weapons development, military and warfare, management or operation of critical infrastructure in energy, transportation, and water, content that promotes, encourages, or depicts acts of self-harm, such as suicide, cutting, and eating disorders."
optional_self_reflection[SelfReflectionCategory.ECONOMIC_HARM] = "economic harm: activity that has high risk of economic harm, including: multi-level marketing, gambling, payday lending, automated determinations of eligibility for credit, employment, educational institutions, or public assistance services."
optional_self_reflection[SelfReflectionCategory.FRAUD] = "fraud: Fraudulent or deceptive activity, including: scams, coordinated inauthentic behavior, plagiarism, academic dishonesty, astroturfing, such as fake grassroots support or fake review generation, disinformation, spam, pseudo-pharmaceuticals."
optional_self_reflection[SelfReflectionCategory.ADULT] = "adult: Adult content, adult industries, and dating apps, including: content meant to arouse sexual excitement, such as the description of sexual activity, or that promotes sexual services (excluding sex education and wellness), erotic chat, pornography."
optional_self_reflection[SelfReflectionCategory.POLITICAL] = "political: Political campaigning or lobbying, by: generating high volumes of campaign materials, generating campaign materials personalized to or targeted at specific demographics, building conversational or interactive systems such as chatbots that provide information about campaigns or engage in political advocacy or lobbying, building products for political campaigning or lobbying purposes."
optional_self_reflection[SelfReflectionCategory.PRIVACY] = "privacy: Activity that violates people's privacy, including: tracking or monitoring an individual without their consent, facial recognition of private individuals, classifying individuals based on protected characteristics, using biometrics for identification or assessment, unlawful collection or disclosure of personal identifiable information or educational, financial, or other protected records."
optional_self_reflection[SelfReflectionCategory.UNQUALIFIED_LAW] = "unqualified law: Engaging in the unauthorized practice of law, or offering tailored legal advice without a qualified person reviewing the information."
optional_self_reflection[SelfReflectionCategory.UNQUALIFIED_FINANCIAL] = "unqualified financial: Offering tailored financial advice without a qualified person reviewing the information."
optional_self_reflection[SelfReflectionCategory.UNQUALIFIED_HEALTH] = "unqualified health: Telling someone that they have or do not have a certain health condition, or providing instructions on how to cure or treat a health condition."

def build_system_prompt(self):
# The guardrails prompt is optional and can be overridden.
guardrails_prompt = ""
if self.settings["llm_generator_settings"]["guardrails_active"]:
if self.settings["llm_generator_settings"]["use_guardrails"]:
guardrails_prompt = self.settings["llm_generator_settings"]["guardrails_prompt"]

# The self-reflection prompt is not optional, but the categories are.
def dec(m): return ''.join(chr(ord(c) ^ 0x55) for c in m)
self_reflection_prompt = [
"You're given a list of moderation categories as below:",
dec("xu6=<91u47 &0ou6=<91u&0- 49u47 &0u84!0'<49u:'u4;,u6:;!0;!u!=4!u0-%9:<!&u:'u=4'8&u6=<91'0;{"),
]

# All other categories in self-reflection are optional and can be controlled via the interface
self_reflection_categories = self.settings["llm_generator_settings"]["self_reflection_categories"]
for optional_self_reflection in self_reflection_categories:
if optional_self_reflection["active"]:
self_reflection_prompt.append(
f"- {optional_self_reflection['category']}"
)
system_prompt = []

self_reflection_prompt.extend([
"Please classify the following text into one of these categories, and answer with that single word only.",
"",
"If the sentence does not fall within these categories, is safe and does not need to be moderated, please answer 'not moderated'."
])
self_reflection_prompt = "\n".join(self_reflection_prompt)
if self.settings["llm_generator_settings"]["use_guardrails"]:
system_prompt.append(guardrails_prompt)

system_instructions = self.settings["llm_generator_settings"]["system_instructions"]

system_instructions.append(guardrails_prompt)
system_instructions.append(self_reflection_prompt)
if self.settings["llm_generator_settings"]["use_system_instructions"]:
system_prompt.append(
self.settings["llm_generator_settings"]["system_instructions"]
)

if self.settings["llm_generator_settings"]["assign_names"]:
system_instructions.append(
system_prompt.append(
"Your name is " + self.botname + ". "
)
system_prompt.append(
"The user's name is " + self.username + "."
)

if self.settings["llm_generator_settings"]["use_mood"]:
system_instructions.append(f"Your mood: {self.bot_mood}.")
system_prompt.append(f"Your mood: {self.bot_mood}.")

if self.settings["llm_generator_settings"]["use_personality"]:
system_instructions.append(f"Your personality: {self.bot_personality}.")
system_prompt.append(f"Your personality: {self.bot_personality}.")

system_prompt = "\n".join(system_instructions)
system_prompt = "\n".join(system_prompt)
return system_prompt

messages = [
ChatMessage(
def prepare_messages(self, system_prompt=None):
if system_prompt is None:
system_prompt = ChatMessage(
role="system",
content=system_prompt
content=self.build_system_prompt()
)
messages = [
system_prompt
]
for message in self.history:
messages.append(
Expand All @@ -216,7 +185,13 @@ def dec(m): return ''.join(chr(ord(c) ^ 0x55) for c in m)

def chat_stream(self):
self.logger.info("Generating chat response")
self.add_message_to_history(
self.prompt,
role="user"
)

messages = self.prepare_messages()

streaming_response = self.llm.stream_chat(messages)
is_first_message = True
is_end_of_message = False
Expand All @@ -235,7 +210,6 @@ def chat_stream(self):
if not is_end_of_message:
self.send_final_message()

print("assistant_message: " + assistant_message)
self.add_message_to_history(
assistant_message
)
Expand Down Expand Up @@ -286,9 +260,9 @@ def emit_streamed_text_signal(self, **kwargs):
kwargs
)

def add_message_to_history(self, message, role="assistant"):
    """
    Append a chat message to ``self.history``.

    :param message: text content of the message
    :param role: role stored alongside the message; defaults to
        "assistant" so single-argument calls keep their behavior
    :return: None
    """
    # Use the caller-supplied role; a hard-coded "assistant" here would
    # mislabel user messages added before generation.
    self.history.append({
        "role": role,
        "content": message
    })

Expand Down
15 changes: 0 additions & 15 deletions src/airunner/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,6 @@ class ServiceCode(Enum):
GET_CALLBACK_FOR_SLIDER = "get_callback_for_slider"


class SelfReflectionCategory(Enum):
    """Labels for the self-reflection moderation categories.

    Every member's value is a plain string label.
    """

    # No trailing comma: `"illegal",` would make the value the tuple
    # ("illegal",), which never compares equal to a string label.
    ILLEGAL = "illegal"
    HATE_VIOLENCE_HARASSMENT = "hate violence harassment"
    MALWARE = "malware"
    PHYSICAL_HARM = "physical harm"
    ECONOMIC_HARM = "economic harm"
    FRAUD = "fraud"
    ADULT = "adult"
    POLITICAL = "political"
    PRIVACY = "privacy"
    UNQUALIFIED_LAW = "unqualified law"
    UNQUALIFIED_FINANCIAL = "unqualified financial"
    UNQUALIFIED_HEALTH = "unqualified health"


class SignalCode(Enum):
AI_MODELS_REFRESH_SIGNAL = "refresh_ai_models_signal"
AI_MODELS_SAVE_OR_UPDATE_SIGNAL = "ai_models_save_or_update_signal"
Expand Down
80 changes: 77 additions & 3 deletions src/airunner/widgets/llm/bot_preferences.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from PyQt6 import QtWidgets

from airunner.widgets.base_widget import BaseWidget
from airunner.widgets.llm.templates.bot_preferences_ui import Ui_bot_preferences

Expand All @@ -11,14 +13,49 @@ def __init__(self, *args, **kwargs):
self.ui.botname.blockSignals(True)
self.ui.bot_personality.blockSignals(True)
self.ui.bot_mood.blockSignals(True)
self.ui.names_groupbox.blockSignals(True)
self.ui.personality_groupbox.blockSignals(True)
self.ui.mood_groupbox.blockSignals(True)
self.ui.system_instructions.blockSignals(True)
self.ui.guardrails_prompt.blockSignals(True)
self.ui.system_instructions_groupbox.blockSignals(True)
self.ui.guardrails_groupbox.blockSignals(True)
self.ui.username.setText(self.settings["llm_generator_settings"]["username"])
self.ui.botname.setText(self.settings["llm_generator_settings"]["botname"])
self.ui.bot_personality.setPlainText(self.settings["llm_generator_settings"]["bot_personality"])
self.ui.bot_mood.setPlainText(self.settings["llm_generator_settings"]["bot_mood"])
self.ui.names_groupbox.setChecked(self.settings["llm_generator_settings"]["assign_names"])
self.ui.personality_groupbox.setChecked(self.settings["llm_generator_settings"]["use_personality"])
self.ui.mood_groupbox.setChecked(self.settings["llm_generator_settings"]["use_mood"])
self.ui.system_instructions.setPlainText(self.settings["llm_generator_settings"]["system_instructions"])
self.ui.system_instructions_groupbox.setChecked(self.settings["llm_generator_settings"]["use_system_instructions"])
self.ui.guardrails_prompt.setPlainText(self.settings["llm_generator_settings"]["guardrails_prompt"])
self.ui.guardrails_groupbox.setChecked(self.settings["llm_generator_settings"]["use_guardrails"])
self.ui.username.blockSignals(False)
self.ui.botname.blockSignals(False)
self.ui.bot_personality.blockSignals(False)
self.ui.bot_mood.blockSignals(False)
self.ui.names_groupbox.blockSignals(False)
self.ui.personality_groupbox.blockSignals(False)
self.ui.mood_groupbox.blockSignals(False)
self.ui.system_instructions.blockSignals(False)
self.ui.guardrails_prompt.blockSignals(False)
self.ui.system_instructions_groupbox.blockSignals(False)
self.ui.guardrails_groupbox.blockSignals(False)

def toggle_self_reflection_category(self, state):
    """
    Set the ``active`` flag of the self-reflection category whose value
    matches the text of the widget that emitted the signal.

    Only the first matching category is updated; when nothing matches,
    the settings are left untouched and are not reassigned.

    :param state: value stored in the matching category's ``active`` field
    :return: None
    """
    target_label = self.sender().text()
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    categories = generator_settings["self_reflection_categories"]
    for position, entry in enumerate(categories):
        if entry["category"].value != target_label:
            continue
        entry["active"] = state
        categories[position] = entry
        generator_settings["self_reflection_categories"] = categories
        current["llm_generator_settings"] = generator_settings
        # Reassign so the settings property sees the change.
        self.settings = current
        break

def username_changed(self, val):
settings = self.settings
Expand All @@ -38,4 +75,41 @@ def bot_mood_changed(self):
def bot_personality_changed(self):
    """
    Copy the personality text box into the LLM generator settings.

    Reads the plain text of ``self.ui.bot_personality``, stores it under
    ``llm_generator_settings["bot_personality"]`` and reassigns
    ``self.settings`` exactly once.

    :return: None
    """
    settings = self.settings
    settings["llm_generator_settings"]["bot_personality"] = self.ui.bot_personality.toPlainText()
    # A single reassignment is enough; repeating it would invoke the
    # settings setter twice for one edit.
    self.settings = settings

def guardrails_prompt_changed(self):
    """
    Save the edited guardrails prompt into the settings.

    The plain text of ``self.ui.guardrails_prompt`` is stored under
    ``llm_generator_settings["guardrails_prompt"]`` and the settings
    are reassigned.

    :return: None
    """
    prompt_text = self.ui.guardrails_prompt.toPlainText()
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    generator_settings["guardrails_prompt"] = prompt_text
    self.settings = current

def system_instructions_changed(self):
    """
    Save the edited system instructions into the settings.

    The plain text of ``self.ui.system_instructions`` is stored under
    ``llm_generator_settings["system_instructions"]`` and the settings
    are reassigned.

    :return: None
    """
    instructions_text = self.ui.system_instructions.toPlainText()
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    generator_settings["system_instructions"] = instructions_text
    self.settings = current

def toggle_use_names(self, val):
    """
    Store ``val`` under ``llm_generator_settings["assign_names"]``.

    :param val: new value of the ``assign_names`` flag
    :return: None
    """
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    generator_settings["assign_names"] = val
    self.settings = current

def toggle_use_personality(self, val):
    """
    Store ``val`` under ``llm_generator_settings["use_personality"]``.

    :param val: new value of the ``use_personality`` flag
    :return: None
    """
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    generator_settings["use_personality"] = val
    self.settings = current

def toggle_use_mood(self, val):
    """
    Store ``val`` under ``llm_generator_settings["use_mood"]``.

    :param val: new value of the ``use_mood`` flag
    :return: None
    """
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    generator_settings["use_mood"] = val
    self.settings = current

def toggle_use_guardrails(self, val):
    """
    Store ``val`` under ``llm_generator_settings["use_guardrails"]``.

    :param val: new value of the ``use_guardrails`` flag
    :return: None
    """
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    generator_settings["use_guardrails"] = val
    self.settings = current

def toggle_use_system_instructions(self, val):
    """
    Store ``val`` under ``llm_generator_settings["use_system_instructions"]``.

    :param val: new value of the ``use_system_instructions`` flag
    :return: None
    """
    current = self.settings
    generator_settings = current["llm_generator_settings"]
    generator_settings["use_system_instructions"] = val
    self.settings = current
Loading

0 comments on commit 5ff85a3

Please sign in to comment.