Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions experiments/benchmark_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def build_tiered_benchmark(n_formulas_per_tier,
max_mutants_per_formula=15,
near_eng_threshold=0.2,
far_eng_threshold=0.45,
min_similarity_threshold=1e-4,
misconception_weights=None,
seed=42,
max_candidates=10000,
Expand All @@ -205,6 +206,7 @@ def build_tiered_benchmark(n_formulas_per_tier,
max_mutants_per_formula: Maximum mutants per formula
near_eng_threshold: Max distance for closest mutant (paraphrases)
far_eng_threshold: Min distance for closest mutant (very different)
min_similarity_threshold: Min distance for closest mutant (reject if below, default 1e-4)
misconception_weights: Dict mapping misconception names to weights (0-1).
Default is uniform distribution. Higher weight = more samples.
Set to None to disable distribution control.
Expand All @@ -231,6 +233,7 @@ def build_tiered_benchmark(n_formulas_per_tier,
print(f" Target per tier: {n_formulas_per_tier} formulas")
print(f" Near-English: closest mutant distance < {near_eng_threshold}")
print(f" Far-English: closest mutant distance > {far_eng_threshold}")
print(f" Min similarity: closest mutant distance >= {min_similarity_threshold}")
print(f" Misconception distribution: {'uniform' if misconception_weights == DEFAULT_MISCONCEPTION_WEIGHTS else 'custom'}")
print(f" Max candidates to try: {max_candidates}\n")

Expand Down Expand Up @@ -435,6 +438,12 @@ def has_enough_for_distribution(pool):
max_distance = max(distances)
avg_distance = np.mean(distances)

# Skip templates where the closest mutant is too similar (nearly identical English)
if min_distance < min_similarity_threshold:
if verbose:
print(f" Skipping template '{template_formula}' due to too-similar mutant (distance {min_distance:.6f})")
continue

record = {
'ltl_formula': template_formula,
'english_translation': candidate_english,
Expand Down Expand Up @@ -524,8 +533,9 @@ def has_enough_for_distribution(pool):
max_distance = max(distances)
avg_distance = np.mean(distances)

# Skip formulas where the closest mutant has distance 0 (identical English)
if min_distance == 0:
# Skip formulas where the closest mutant is too similar (nearly identical English)
if min_distance < min_similarity_threshold:
print(f" Skipping formula '{candidate_formula}' due to too-similar mutant (distance {min_distance:.6f})")
continue

# Determine tier based on closest mutant distance
Expand Down Expand Up @@ -671,6 +681,7 @@ def main():
parser.add_argument("--n-per-tier", type=int, default=100, help="Target formulas per tier")
parser.add_argument("--near-eng-threshold", type=float, default=0.2, help="Near-English distance threshold")
parser.add_argument("--far-eng-threshold", type=float, default=0.45, help="Far-English distance threshold")
parser.add_argument("--min-similarity-threshold", type=float, default=1e-4, help="Minimum similarity threshold (reject if below)")
parser.add_argument("--seed", type=int, default=42, help="Random seed")
parser.add_argument("--max-candidates", type=int, default=5000, help="Max candidates to try")
parser.add_argument("--output-prefix", default="ltl_benchmark", help="Output file prefix")
Expand All @@ -682,6 +693,7 @@ def main():
n_formulas_per_tier=args.n_per_tier,
near_eng_threshold=args.near_eng_threshold,
far_eng_threshold=args.far_eng_threshold,
min_similarity_threshold=args.min_similarity_threshold,
seed=args.seed,
max_candidates=args.max_candidates,
verbose=not args.quiet,
Expand Down
101 changes: 101 additions & 0 deletions semantic_benchmark_far_eng.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions semantic_benchmark_near_eng.csv

Large diffs are not rendered by default.

204 changes: 191 additions & 13 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@

from ltlnode import parse_ltl_string, SUPPORTED_SYNTAXES
from codebook import getAllApplicableMisconceptions
import os
import json
import sys
from feedbackgenerator import FeedbackGenerator
from logger import Logger
import ast
import exerciseprocessor
import os
import json
import sys
import csv
from difflib import SequenceMatcher
from markupsafe import Markup, escape
from feedbackgenerator import FeedbackGenerator
from logger import Logger
import ast
import exerciseprocessor
import exercisebuilder
import random
import spotutils
Expand All @@ -30,7 +33,7 @@
get_course_students,
get_exercises_for_course,
)
from modelroutes import modelroutes
from modelroutes import modelroutes



Expand Down Expand Up @@ -59,11 +62,102 @@ def enforce_https_in_production():
secure_url = request.url.replace("http://", "https://", 1)
return redirect(secure_url, code=301)

answer_logger = Logger()


def getUserName():
return current_user.username
answer_logger = Logger()
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

BENCHMARK_FILES = {
"near": os.path.join(PROJECT_ROOT, "semantic_benchmark_near_eng.csv"),
"far": os.path.join(PROJECT_ROOT, "semantic_benchmark_far_eng.csv"),
}


def _load_benchmark_rows():
    """Read the near/far benchmark CSV files into memory.

    Returns a dict mapping each dataset label ("near"/"far") to a list of
    row dicts with numeric fields coerced to float/int; a missing file
    maps to an empty list so the app can still start without the data.
    """
    def _num(raw, cast):
        # Empty or missing CSV cells fall back to 0 before casting.
        return cast(raw or 0)

    def _row(idx, raw, label):
        # Normalize one csv.DictReader row into the shape the rating UI uses.
        return {
            "row_index": idx,
            "ltl_formula": raw.get("ltl_formula", ""),
            "english_translation": raw.get("english_translation", ""),
            "closest_mutant_formula": raw.get("closest_mutant_formula", ""),
            "closest_mutant_english": raw.get("closest_mutant_english", ""),
            "closest_mutant_misconception": raw.get("closest_mutant_misconception", ""),
            "closest_distance": _num(raw.get("closest_distance", 0), float),
            "max_distance": _num(raw.get("max_distance", 0), float),
            "avg_distance": _num(raw.get("avg_distance", 0), float),
            # num_mutants may be serialized as "15.0"; go through float first.
            "num_mutants": int(_num(raw.get("num_mutants", 0), float)),
            "dataset": label
        }

    data = {}
    for label, path in BENCHMARK_FILES.items():
        if not os.path.exists(path):
            data[label] = []
            continue

        # newline='' is required by the csv module for correct quoting/newlines.
        with open(path, newline='') as csvfile:
            data[label] = [
                _row(idx, raw, label)
                for idx, raw in enumerate(csv.DictReader(csvfile))
            ]
    return data


BENCHMARK_ROWS = _load_benchmark_rows()


def _highlight_differences(text_a, text_b):
    """Return a (markup_a, markup_b) pair: both texts rebuilt with the
    word runs that differ wrapped in <mark> tags, from a token-level diff."""
    words_a = text_a.split()
    words_b = text_b.split()
    ops = SequenceMatcher(None, words_a, words_b).get_opcodes()

    def render(words, left_side):
        rendered = []
        for tag, a1, a2, b1, b2 in ops:
            # Each opcode carries index ranges for both sides; pick ours.
            chunk = words[a1:a2] if left_side else words[b1:b2]
            if not chunk:
                continue
            # Escape user-derived tokens before embedding them in HTML.
            text = " ".join(escape(word) for word in chunk)
            rendered.append(text if tag == 'equal' else f"<mark>{text}</mark>")
        return Markup(" ".join(rendered))

    return render(words_a, True), render(words_b, False)


def _next_index_for_dataset(user_id, dataset):
    """Return the first row index in `dataset` the user has not rated yet,
    or None once every row has been rated."""
    rows = BENCHMARK_ROWS.get(dataset, [])
    rated = answer_logger.getRatedSentencePairIndices(user_id, dataset)
    unrated = (idx for idx in range(len(rows)) if idx not in rated)
    return next(unrated, None)


def _pick_next_pair(user_id):
    """Choose the next (dataset, index, row) triple for the user to rate.

    When both datasets still have unrated rows, the dataset is picked at
    random to interleave them; returns (None, None, None) when done.
    """
    candidates = {}
    for label in ("near", "far"):
        nxt = _next_index_for_dataset(user_id, label)
        if nxt is not None:
            candidates[label] = nxt

    if not candidates:
        return None, None, None

    if len(candidates) == 2:
        dataset = random.choice(["near", "far"])
    else:
        # Exactly one dataset still has unrated rows.
        (dataset,) = candidates
    index = candidates[dataset]
    return dataset, index, BENCHMARK_ROWS[dataset][index]


def getUserName():
    # Username of the currently logged-in user (flask-login current_user proxy).
    return current_user.username

def getUserId():
    # Database id of the currently logged-in user.
    return current_user.id
Expand Down Expand Up @@ -199,6 +293,90 @@ def ltl_to_english():
except Exception as e:
return jsonify({"error": "Failed to translate formula.", "details": str(e)}), 400


@app.route('/ltltoeng/rating', methods=['GET', 'POST'])
@login_required
def ltltoeng_rating():
    """Sentence-pair rating task for the LTL-to-English benchmark.

    GET renders the next unrated base/mutant English pair with word-level
    diff highlighting and progress info; POST records one rating and
    redirects back here (post/redirect/get), so refresh can't double-log.
    """
    # NOTE(review): named user_id but holds the *username* (getUserName) —
    # confirm the logger keys ratings on username, not the numeric id.
    user_id = getUserName()
    rated_near = answer_logger.getRatedSentencePairIndices(user_id, "near")
    rated_far = answer_logger.getRatedSentencePairIndices(user_id, "far")

    # Progress is counted across both datasets combined.
    total_count = len(BENCHMARK_ROWS.get("near", [])) + len(BENCHMARK_ROWS.get("far", []))
    done_count = len(rated_near) + len(rated_far)
    progress = {"done": done_count, "total": total_count}

    if request.method == 'POST':
        dataset = request.form.get('dataset', '')
        row_index = request.form.get('row_index', '')
        likert_value = request.form.get('likert')
        # Checkbox posts 'on' when ticked and is absent otherwise.
        awkward_flag = request.form.get('awkward_flag') == 'on'
        awkward_notes = request.form.get('awkward_notes', '').strip()
        # Accept the several truthy spellings different form widgets send.
        unsure_flag = request.form.get('unsure') in ['1', 'on', 'true', 'True']

        try:
            row_index = int(row_index)
        except ValueError:
            return jsonify({"error": "Invalid rating payload"}), 400

        # "Unsure" submissions carry no Likert score; otherwise require 1-7.
        likert_rating = None
        if not unsure_flag:
            try:
                likert_rating = int(likert_value)
            except (TypeError, ValueError):
                return jsonify({"error": "Likert rating missing or invalid"}), 400
            if likert_rating < 1 or likert_rating > 7:
                return jsonify({"error": "Likert rating must be between 1 and 7"}), 400

        # Validate the (dataset, row_index) pair against the loaded benchmark.
        rows = BENCHMARK_ROWS.get(dataset)
        if rows is None or row_index < 0 or row_index >= len(rows):
            return jsonify({"error": "Unknown benchmark item"}), 400

        pair = rows[row_index]

        # Log the full context (formulas, distances) alongside the rating so
        # the CSV rows don't need to be re-joined during analysis.
        answer_logger.logSentencePairRating({
            "user_id": user_id,
            "dataset": dataset,
            "row_index": row_index,
            "base_english": pair["english_translation"],
            "mutant_english": pair["closest_mutant_english"],
            "base_ltl": pair["ltl_formula"],
            "mutant_ltl": pair["closest_mutant_formula"],
            "misconception": pair["closest_mutant_misconception"],
            "likert_rating": likert_rating,
            "awkward_flag": awkward_flag,
            "awkward_notes": awkward_notes,
            "unsure": unsure_flag,
            "closest_distance": pair["closest_distance"],
            "max_distance": pair["max_distance"],
            "avg_distance": pair["avg_distance"],
            "num_mutants": pair["num_mutants"],
        })

        # Redirect-after-POST: the follow-up GET serves the next pair.
        return redirect('/ltltoeng/rating')

    # GET: pick the next unrated pair (random dataset when both remain).
    dataset, row_index, pair = _pick_next_pair(user_id)
    if pair is None:
        # Everything rated — render the completion state.
        return render_template(
            'ltltoeng_rating.html',
            uid=user_id,
            done=True,
            progress=progress
        )

    # Word-level diff markup so the rater can spot the mutation at a glance.
    highlighted_base, highlighted_mutant = _highlight_differences(pair["english_translation"], pair["closest_mutant_english"])

    return render_template(
        'ltltoeng_rating.html',
        uid=user_id,
        done=False,
        pair=pair,
        dataset=dataset,
        row_index=row_index,
        highlighted_base=highlighted_base,
        highlighted_mutant=highlighted_mutant,
        progress=progress
    )

return jsonify({"formula": formula, "english": english})


Expand Down
Loading