Skip to content

Commit

Permalink
themes: accept json string, enable jailbreak/refusal similarities as …
Browse files Browse the repository at this point in the history
…standalone functions (#85)

* themes groups as json string, similarities as standalone functions, reference embeddings as global variables

---------

Co-authored-by: Jamie Broomall <[email protected]>
  • Loading branch information
FelipeAdachi and jamie256 authored Jul 10, 2023
1 parent ac6ad37 commit 7906380
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 33 deletions.
25 changes: 25 additions & 0 deletions langkit/tests/test_themes.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,28 @@ def test_theme(interactions):
else:
assert jail_median > 0.11
assert refusal_median > 0.11


@pytest.mark.load
def test_themes_with_json_string():
    """Check that themes given as a JSON string with only refusals yield no jailbreak UDF.

    The theme groups are passed to ``themes.init`` as a serialized JSON
    string; the resulting profile of a prompt column must not contain any
    UDF metric key starting with ``jailbreak_similarity``.
    """
    import json

    import whylogs as why
    from whylogs.experimental.core.udf_schema import udf_schema, _reset_udfs
    from langkit import themes

    refusal_sentences = [
        "I'm very sorry, but I cannot provide information or assistance on . If you have any other topic or question you'd like me to help with, please feel free to ask.",
        "Sorry, but I can't assist with that",
        "As an AI language model, my purpose is to provide helpful and informative responses to legitimate questions and queries, while also adhering to ethical and legal guidelines. It is not appropriate or responsible to provide advice or instructions on how to engage in criminal activities.",
        "I'm sorry, I cannot comply with your request as it goes against my programming to engage in negative or harmful behavior. My purpose is to assist and provide helpful responses.",
    ]
    serialized_themes = json.dumps({"refusals": refusal_sentences})

    # Without resetting the UDF registry first, jailbreak_similarity would
    # still show up in the profile as an empty metric.
    _reset_udfs()
    themes.init(theme_json=serialized_themes)

    profile_view = why.log({"prompt": "hello"}, schema=udf_schema()).view()
    udf_metric = profile_view.get_column("prompt").get_metric("udf")
    metric_names = udf_metric.to_summary_dict().keys()

    assert not any(
        name.startswith("jailbreak_similarity") for name in metric_names
    )
87 changes: 54 additions & 33 deletions langkit/themes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,44 +15,56 @@

# Sentence-embedding model used by the similarity functions; they raise
# ValueError while this is still None.
_transformer_model = None
# Mapping of theme name (e.g. "jailbreaks", "refusals") to example sentences;
# assigned by init() from a theme file or a JSON string.
_theme_groups = None

# Package configuration; supplies the default transformer name and theme file path.
lang_config = LangKitConfig()

# Reference embeddings for each theme group, filled in by register_theme_udfs().
_jailbreak_embeddings = None
_refusal_embeddings = None


def register_theme_udfs():
    """Encode the reference sentences of each theme and register the similarity UDFs.

    The sentences listed under the ``"jailbreaks"`` and ``"refusals"`` keys of
    the module-level theme groups are encoded once with the module-level
    transformer and stored in ``_jailbreak_embeddings`` and
    ``_refusal_embeddings``, so the standalone ``jailbreak_similarity`` and
    ``refusal_similarity`` functions can use them directly.

    A similarity function is registered as a String-column metric UDF only
    when its theme key is present in the theme groups; otherwise an info
    message is logged and no UDF is registered for that theme.
    """
    global _jailbreak_embeddings
    global _refusal_embeddings

    # load_themes() can return None; treat that the same as "no themes"
    # instead of failing on attribute access.
    theme_groups = _theme_groups if _theme_groups is not None else {}

    # Each group is encoded exactly once; the standalone similarity functions
    # read these module-level lists, so no per-closure copies are needed.
    _jailbreak_embeddings = [
        _transformer_model.encode(s, convert_to_tensor=True)
        for s in theme_groups.get("jailbreaks", [])
    ]
    _refusal_embeddings = [
        _transformer_model.encode(s, convert_to_tensor=True)
        for s in theme_groups.get("refusals", [])
    ]

    if "jailbreaks" in theme_groups:
        register_metric_udf(col_type=String)(jailbreak_similarity)
    else:
        diagnostic_logger.info("No jailbreaks found in theme groups file")

    if "refusals" in theme_groups:
        register_metric_udf(col_type=String)(refusal_similarity)
    else:
        diagnostic_logger.info("No refusals found in theme groups file")


def jailbreak_similarity(text: str) -> Optional[float]:
    """Return the highest similarity between ``text`` and the jailbreak references.

    Args:
        text: The string to compare against the stored jailbreak embeddings.

    Returns:
        The maximum similarity over all entries of ``_jailbreak_embeddings``,
        or ``None`` when there are no reference embeddings.

    Raises:
        ValueError: If the module-level transformer has not been initialized.
    """
    if _transformer_model is None:
        raise ValueError("Must initialize a transformer before calling encode!")

    encoded_text = _transformer_model.encode(text, convert_to_tensor=True)
    scores = [
        get_embeddings_similarity(encoded_text, reference)
        for reference in _jailbreak_embeddings
    ]
    if not scores:
        return None
    return max(scores)


def refusal_similarity(text: str) -> Optional[float]:
    """Return the highest similarity between ``text`` and the refusal references.

    Args:
        text: The string to compare against the stored refusal embeddings.

    Returns:
        The maximum similarity over all entries of ``_refusal_embeddings``,
        or ``None`` when there are no reference embeddings.

    Raises:
        ValueError: If the module-level transformer has not been initialized.
    """
    if _transformer_model is None:
        raise ValueError("Must initialize a transformer before calling encode!")

    encoded_text = _transformer_model.encode(text, convert_to_tensor=True)
    scores = [
        get_embeddings_similarity(encoded_text, reference)
        for reference in _refusal_embeddings
    ]
    if not scores:
        return None
    return max(scores)


def load_themes(json_path: str):
Expand All @@ -71,13 +83,22 @@ def load_themes(json_path: str):
return None


def init(transformer_name: Optional[str] = None, theme_file_path: Optional[str] = None):
def init(
transformer_name: Optional[str] = None,
theme_file_path: Optional[str] = None,
theme_json: Optional[str] = None,
):
global _transformer_model
global _theme_groups
if transformer_name is None:
transformer_name = lang_config.transformer_name
if theme_file_path is not None and theme_json is not None:
raise ValueError("Cannot specify both theme_file_path and theme_json")
if theme_file_path is None:
_theme_groups = load_themes(lang_config.theme_file_path)
if theme_json:
_theme_groups = json.loads(theme_json)
else:
_theme_groups = load_themes(lang_config.theme_file_path)
else:
_theme_groups = load_themes(theme_file_path)

Expand Down

0 comments on commit 7906380

Please sign in to comment.