From 4477d01fba797176a42101b89943efb23989e54f Mon Sep 17 00:00:00 2001
From: Rabah Abdul Khalek
Date: Wed, 20 Dec 2023 13:12:41 +0100
Subject: [PATCH] updated readme, requirements, validation

---
Reviewer note: the blob ids on the `index` lines below were not regenerated
after the review edits (README typo fix, validate.py docstrings and cleanup).

 README.md        |  4 ++++
 requirements.txt |  6 ++++++
 validate.py      | 39 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 validate.py

diff --git a/README.md b/README.md
index 114bbd5..06c3e42 100644
--- a/README.md
+++ b/README.md
@@ -6,3 +6,7 @@ The `prompt_injections.csv` file is a concatenation of prompts from the followin
 - https://github.com/agencyenterprise/PromptInject
 
 with their respective licenses in the `licenses` directory.
+
+## Setup and validation
+
+The pip `requirements.txt` is only needed to run `validate.py`, a minimal validation script to ensure that the prompt injection data is generated in the correct format.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..09844fe
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+numpy==1.26.2
+pandas==2.1.4
+python-dateutil==2.8.2
+pytz==2023.3.post1
+six==1.16.0
+tzdata==2023.3
diff --git a/validate.py b/validate.py
new file mode 100644
index 0000000..925a522
--- /dev/null
+++ b/validate.py
@@ -0,0 +1,39 @@
+"""Minimal validation of the prompt injection data files."""
+import ast
+
+import pandas as pd
+
+INJECTION_DATA_PATH = "prompt_injections.csv"
+GISKARD_META_PATH = "giskard_meta_data.csv"
+
+
+def _check_matching_dfs_len(df1, df2):
+    """Raise ValueError if df1 and df2 do not have the same number of rows."""
+    if len(df1) != len(df2):
+        raise ValueError(
+            f"{__name__}: {INJECTION_DATA_PATH} and {GISKARD_META_PATH} should "
+            "have the same length and should be a one-to-one mapping of each other."
+        )
+
+
+def _check_meta_df_requirements(df):
+    """Validate the expected_strings column of df and parse it in place.
+
+    Raises ValueError if the column is missing or contains NaN values. Each
+    entry is then parsed with ast.literal_eval, which raises on entries that
+    are not valid Python literals.
+    """
+    if "expected_strings" not in df.columns:
+        raise ValueError(f"{__name__}: expected_strings are needed for the evaluation.")
+
+    if df.expected_strings.isnull().values.any():
+        raise ValueError(f"{__name__}: expected_strings column cannot have any NaN values.")
+    df["expected_strings"] = df.expected_strings.apply(ast.literal_eval)
+
+
+if __name__ == "__main__":
+    prompt_injections_df = pd.read_csv(INJECTION_DATA_PATH, index_col=["index"])
+    meta_df = pd.read_csv(GISKARD_META_PATH, index_col=["index"])
+    _check_matching_dfs_len(meta_df, prompt_injections_df)
+    _check_meta_df_requirements(meta_df)
+    print("Validation passed successfully!")