From 6a84c667f067feb16c89710a37c9c8a994a8c366 Mon Sep 17 00:00:00 2001 From: Peter Harrington Date: Mon, 18 Sep 2023 10:52:27 -0700 Subject: [PATCH 1/2] all process output now in str format --- evals/eval_tools.py | 2 +- evals/new_code_eval.yaml | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/evals/eval_tools.py b/evals/eval_tools.py index 45475ea3f8..97c2b50199 100644 --- a/evals/eval_tools.py +++ b/evals/eval_tools.py @@ -96,7 +96,7 @@ def check_executable_satisfies_function(eval_d: dict) -> bool: output_satisfies: "tf = lambda a : len(a) == 10" """ process = run_executable(eval_d=eval_d) - process_output = process.communicate()[0].strip() + process_output = str(process.communicate()[0].strip(), "utf-8") exec(eval_d["output_satisfies"]) checking_function_ref = locals().get("tf") diff --git a/evals/new_code_eval.yaml b/evals/new_code_eval.yaml index bd8450d0c0..cb9cf0e9b6 100644 --- a/evals/new_code_eval.yaml +++ b/evals/new_code_eval.yaml @@ -1,12 +1,23 @@ evaluations: - - name: password_gen - project_root: "projects/password_gen_eval" - code_prompt: "Create a password generator CLI tool in Python that generates strong, random passwords based on user-specified criteria, such as length and character types (letters, numbers, symbols). The password generator should be a python program named passwordgenerator.py with two arguments: length, and character types. The character types argument can be one or more of the the following: l for lowercase, u for uppercase, d for digits, and s for symbols." + - name: currency_converter + project_root: "projects/currency_converter" + code_prompt: "Build a currency converter CLI tool in Python using an API for exchange rates. The currency converter should be a python program named currency.py with three required arguments: base currency symbol, target currency symbol and base currency amount. 
The currency converter will convert the amount in base currency amount to the target currency. The output of the program should only be the amount of target currency. For example the following command: `python currency.py USD CNY 1` should return a number like 7.5." expected_results: - type: check_executable_exits_normally - executable_name: "python passwordgenerator.py" - executable_arguments: "10 d" + executable_name: "python currency.py" + executable_arguments: "USD CAD 10" - type: check_executable_satisfies_function - executable_name: "python passwordgenerator.py" - executable_arguments: "10 d" - output_satisfies: "tf = lambda a : len(a) == 10" + executable_name: "python currency.py" + executable_arguments: "USD CAD 10" + output_satisfies: "tf = lambda a : a.replace('.', '').isnumeric()" +# - name: password_gen +# project_root: "projects/password_gen_eval" +# code_prompt: "Create a password generator CLI tool in Python that generates strong, random passwords based on user-specified criteria, such as length and character types (letters, numbers, symbols). The password generator should be a python program named passwordgenerator.py with two arguments: length, and character types. The character types argument can be one or more of the the following: l for lowercase, u for uppercase, d for digits, and s for symbols." 
+# expected_results: +# - type: check_executable_exits_normally +# executable_name: "python passwordgenerator.py" +# executable_arguments: "10 d" +# - type: check_executable_satisfies_function +# executable_name: "python passwordgenerator.py" +# executable_arguments: "10 d" +# output_satisfies: "tf = lambda a : len(a) == 10" From 38dd734a5bac321c752672eedc11cc0737b102a0 Mon Sep 17 00:00:00 2001 From: Peter Harrington Date: Mon, 18 Sep 2023 12:11:13 -0700 Subject: [PATCH 2/2] two projects working --- evals/EVAL_NEW_CODE_RESULTS.md | 17 +++++++++++++++++ evals/evals_new_code.py | 2 +- evals/new_code_eval.yaml | 22 +++++++++++----------- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/evals/EVAL_NEW_CODE_RESULTS.md b/evals/EVAL_NEW_CODE_RESULTS.md index a44c9f71e7..3d60a0da41 100644 --- a/evals/EVAL_NEW_CODE_RESULTS.md +++ b/evals/EVAL_NEW_CODE_RESULTS.md @@ -12,3 +12,20 @@ |:---------------------------|:-------------|:------------------------------------|:-------| | projects/password_gen_eval | password_gen | check_executable_exits_normally | ✅ | | projects/password_gen_eval | password_gen | check_executable_satisfies_function | ✅ | +## 2023-09-18 + +### Existing Code Evaluation Summary: + +| Project | Evaluation | All Tests Pass | +|:----------------------------|:-------------------|:-----------------| +| projects/currency_converter | currency_converter | ✅ | +| projects/password_gen_eval | password_gen | ✅ | + +### Detailed Test Results: + +| Project | Evaluation | Test | Pass | +|:----------------------------|:-------------------|:------------------------------------|:-------| +| projects/currency_converter | currency_converter | check_executable_exits_normally | ✅ | +| projects/currency_converter | currency_converter | check_executable_satisfies_function | ✅ | +| projects/password_gen_eval | password_gen | check_executable_exits_normally | ✅ | +| projects/password_gen_eval | password_gen | check_executable_satisfies_function | ✅ | diff --git 
a/evals/evals_new_code.py b/evals/evals_new_code.py index 97383156d6..572d7755da 100644 --- a/evals/evals_new_code.py +++ b/evals/evals_new_code.py @@ -58,7 +58,7 @@ def single_evaluate(eval_ob: dict) -> list[bool]: process.wait() # we want to wait until it finishes. print("running tests on the newly generated code") - # TODO: test the code we should have an executable name + # test the code with the executable name in the config file evaluation_results = [] for test_case in eval_ob["expected_results"]: print(f"checking: {test_case['type']}") diff --git a/evals/new_code_eval.yaml b/evals/new_code_eval.yaml index cb9cf0e9b6..57f723910d 100644 --- a/evals/new_code_eval.yaml +++ b/evals/new_code_eval.yaml @@ -10,14 +10,14 @@ evaluations: executable_name: "python currency.py" executable_arguments: "USD CAD 10" output_satisfies: "tf = lambda a : a.replace('.', '').isnumeric()" -# - name: password_gen -# project_root: "projects/password_gen_eval" -# code_prompt: "Create a password generator CLI tool in Python that generates strong, random passwords based on user-specified criteria, such as length and character types (letters, numbers, symbols). The password generator should be a python program named passwordgenerator.py with two arguments: length, and character types. The character types argument can be one or more of the the following: l for lowercase, u for uppercase, d for digits, and s for symbols." 
-# expected_results: -# - type: check_executable_exits_normally -# executable_name: "python passwordgenerator.py" -# executable_arguments: "10 d" -# - type: check_executable_satisfies_function -# executable_name: "python passwordgenerator.py" -# executable_arguments: "10 d" -# output_satisfies: "tf = lambda a : len(a) == 10" + - name: password_gen + project_root: "projects/password_gen_eval" + code_prompt: "Create a password generator CLI tool in Python that generates strong, random passwords based on user-specified criteria, such as length and character types (letters, numbers, symbols). The password generator should be a python program named passwordgenerator.py with two arguments: length, and character types. The character types argument can be one or more of the following: l for lowercase, u for uppercase, d for digits, and s for symbols." + expected_results: + - type: check_executable_exits_normally + executable_name: "python passwordgenerator.py" + executable_arguments: "10 d" + - type: check_executable_satisfies_function + executable_name: "python passwordgenerator.py" + executable_arguments: "10 d" + output_satisfies: "tf = lambda a : len(a) == 10"