Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into fix/useless_file
Browse files Browse the repository at this point in the history
  • Loading branch information
YukinoWan committed Nov 21, 2023
2 parents 1ea2bf3 + de33589 commit 8b4c2a9
Show file tree
Hide file tree
Showing 106 changed files with 576 additions and 14,644 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
name: Lint

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: "3.8"
      - name: Run linters
        run: |
          pipx install pre-commit
          pre-commit run --files llm_judge/*.py
30 changes: 0 additions & 30 deletions .github/workflows/python-package.yml

This file was deleted.

183 changes: 178 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,183 @@
# Python
__pycache__
*.pyc
*.egg-info
dist
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

# End of https://www.toptal.com/developers/gitignore/api/python

# folder
data
Expand Down
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-yaml
      - id: check-toml
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.4
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
      - id: ruff-format
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
33 changes: 16 additions & 17 deletions llm_judge/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import re
import time
from typing import Optional
import sys
import openai
from dotenv import load_dotenv

Expand All @@ -29,15 +28,15 @@

# Categories that need reference answers
NEED_REF_CATS = ["math", "reasoning", "coding"]
#NEED_REF_CATS = []
# NEED_REF_CATS = []

# Extract scores from judgments
# Raw strings keep the regex escapes (\[, \d, \., \s) out of Python's
# string-escape processing; the non-raw form triggers an invalid-escape
# warning (SyntaxWarning since Python 3.12) while compiling to the same regex.
# Primary form for a pair of scores: "[[a, b]]" (integers or decimals).
two_score_pattern = re.compile(r"\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]")
# Same pair of scores wrapped in single brackets: "[a, b]".
two_score_pattern_backup = re.compile(r"\[(\d+\.?\d*),\s?(\d+\.?\d*)\]")
# Single score: "[[a]]".
one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
#one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")
# one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")
# Alternative single-score forms: "[[rating:N]]" and "[[rating: N]]"
# (integer N only). Raw strings avoid invalid-escape warnings for \[ and \d;
# the compiled patterns are unchanged.
one_score_pattern_another_format = re.compile(r"\[\[rating:(\d+)\]\]")
one_score_pattern_another_format2 = re.compile(r"\[\[rating: (\d+)\]\]")

# Sampling temperature configs for
temperature_config = {
Expand Down Expand Up @@ -112,8 +111,8 @@ def load_model_answers(answer_dir: str):
for filename in filenames:
model_name = os.path.basename(filename)[:-6]
answer = {}
#print(filename)

# print(filename)
with open(filename, "r") as fin:
# distinguished process for lora predictions
# if "lora" not in filename:
Expand Down Expand Up @@ -153,7 +152,7 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):
model = judge.model_name
if ref_answer is not None:
kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
#kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
# kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]

if multi_turn:
user_prompt = judge.prompt_template["prompt_template"].format(
Expand Down Expand Up @@ -188,9 +187,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):
# if not match:
# match = re.search(one_score_pattern_backup, judgment)
if not match:
match = re.search(one_score_pattern_another_format,judgment)
match = re.search(one_score_pattern_another_format, judgment)
if not match:
match = re.search(one_score_pattern_another_format2,judgment)
match = re.search(one_score_pattern_another_format2, judgment)
if match:
rating = ast.literal_eval(match.groups()[0])
else:
Expand Down Expand Up @@ -241,7 +240,7 @@ def play_a_match_single(match: MatchPair, output_file: str):
if output_file:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "a") as fout:
fout.write(json.dumps(result,ensure_ascii=False) + "\n")
fout.write(json.dumps(result, ensure_ascii=False) + "\n")

return result

Expand All @@ -251,9 +250,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
model = judge.model_name
if ref_answer is not None:
kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
print("参考回答1:",ref_answer["choices"][0]["turns"][0])
print("参考回答1:", ref_answer["choices"][0]["turns"][0])
kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
print("参考回答2:",ref_answer["choices"][0]["turns"][1])
print("参考回答2:", ref_answer["choices"][0]["turns"][1])

if multi_turn:
system_prompt = judge.prompt_template["system_prompt"]
Expand Down Expand Up @@ -362,7 +361,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):
"turn": turn,
"tstamp": time.time(),
}

print(
f"question: {question_id}, turn: {turn}, model_1: {model_1}, model_2: {model_2}, "
f"g1_winner: {g1_winner}, g2_winner: {g2_winner}, "
Expand Down Expand Up @@ -410,7 +409,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):
if output_file:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "a") as fout:
fout.write(json.dumps(result,ensure_ascii=False) + "\n")
fout.write(json.dumps(result, ensure_ascii=False) + "\n")

return result

Expand Down Expand Up @@ -577,7 +576,7 @@ def get_pairwise_judge_explanation(gamekey, judgment_dict):
return (
f"**Game 1**. **A**: {model_1}, **B**: {model_2}\n\n"
f"**Judgment**: {g1_judgment}"
+ f"\n\n`--------------------------`\n\n"
+ "\n\n`--------------------------`\n\n"
+ f"**Game 2**. **A**: {model_2}, **B**: {model_1}\n\n"
f"**Judgment**: {g2_judgment}"
)
Expand Down Expand Up @@ -619,8 +618,8 @@ def check_data(questions, model_answers, ref_answers, models, judges):
for q in questions:
if q["category"] not in NEED_REF_CATS:
continue
#print(q["question_id"])
#print(ref_answers[jg.model_name])
# print(q["question_id"])
# print(ref_answers[jg.model_name])
assert (
int(q["question_id"]) in ref_answers[jg.model_name]
), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
Expand Down
Loading

0 comments on commit 8b4c2a9

Please sign in to comment.