Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions experiments/benchmark_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def build_tiered_benchmark(n_formulas_per_tier,
max_mutants_per_formula=15,
near_eng_threshold=0.2,
far_eng_threshold=0.45,
min_similarity_threshold=1e-4,
misconception_weights=None,
seed=42,
max_candidates=10000,
Expand All @@ -205,6 +206,7 @@ def build_tiered_benchmark(n_formulas_per_tier,
max_mutants_per_formula: Maximum mutants per formula
near_eng_threshold: Max distance for closest mutant (paraphrases)
far_eng_threshold: Min distance for closest mutant (very different)
min_similarity_threshold: Min distance for closest mutant (reject if below, default 1e-4)
misconception_weights: Dict mapping misconception names to weights (0-1).
Default is uniform distribution. Higher weight = more samples.
Set to None to disable distribution control.
Expand All @@ -231,6 +233,7 @@ def build_tiered_benchmark(n_formulas_per_tier,
print(f" Target per tier: {n_formulas_per_tier} formulas")
print(f" Near-English: closest mutant distance < {near_eng_threshold}")
print(f" Far-English: closest mutant distance > {far_eng_threshold}")
print(f" Min similarity: closest mutant distance >= {min_similarity_threshold}")
print(f" Misconception distribution: {'uniform' if misconception_weights == DEFAULT_MISCONCEPTION_WEIGHTS else 'custom'}")
print(f" Max candidates to try: {max_candidates}\n")

Expand Down Expand Up @@ -435,6 +438,12 @@ def has_enough_for_distribution(pool):
max_distance = max(distances)
avg_distance = np.mean(distances)

# Skip templates where the closest mutant is too similar (nearly identical English)
if min_distance < min_similarity_threshold:
if verbose:
print(f" Skipping template '{template_formula}' due to too-similar mutant (distance {min_distance:.6f})")
continue

record = {
'ltl_formula': template_formula,
'english_translation': candidate_english,
Expand Down Expand Up @@ -524,8 +533,9 @@ def has_enough_for_distribution(pool):
max_distance = max(distances)
avg_distance = np.mean(distances)

# Skip formulas where the closest mutant has distance 0 (identical English)
if min_distance == 0:
# Skip formulas where the closest mutant is too similar (nearly identical English)
if min_distance < min_similarity_threshold:
print(f" Skipping formula '{candidate_formula}' due to too-similar mutant (distance {min_distance:.6f})")
continue

# Determine tier based on closest mutant distance
Expand Down Expand Up @@ -671,6 +681,7 @@ def main():
parser.add_argument("--n-per-tier", type=int, default=100, help="Target formulas per tier")
parser.add_argument("--near-eng-threshold", type=float, default=0.2, help="Near-English distance threshold")
parser.add_argument("--far-eng-threshold", type=float, default=0.45, help="Far-English distance threshold")
parser.add_argument("--min-similarity-threshold", type=float, default=1e-4, help="Minimum similarity threshold (reject if below)")
parser.add_argument("--seed", type=int, default=42, help="Random seed")
parser.add_argument("--max-candidates", type=int, default=5000, help="Max candidates to try")
parser.add_argument("--output-prefix", default="ltl_benchmark", help="Output file prefix")
Expand All @@ -682,6 +693,7 @@ def main():
n_formulas_per_tier=args.n_per_tier,
near_eng_threshold=args.near_eng_threshold,
far_eng_threshold=args.far_eng_threshold,
min_similarity_threshold=args.min_similarity_threshold,
seed=args.seed,
max_candidates=args.max_candidates,
verbose=not args.quiet,
Expand Down
101 changes: 101 additions & 0 deletions semantic_benchmark_far_eng.csv

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions semantic_benchmark_near_eng.csv

Large diffs are not rendered by default.

204 changes: 191 additions & 13 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@

from ltlnode import parse_ltl_string, SUPPORTED_SYNTAXES
from codebook import getAllApplicableMisconceptions
import os
import json
import sys
from feedbackgenerator import FeedbackGenerator
from logger import Logger
import ast
import exerciseprocessor
import os
import json
import sys
import csv
from difflib import SequenceMatcher
from markupsafe import Markup, escape
from feedbackgenerator import FeedbackGenerator
from logger import Logger
import ast
import exerciseprocessor
import exercisebuilder
import random
import spotutils
Expand All @@ -30,7 +33,7 @@
get_course_students,
get_exercises_for_course,
)
from modelroutes import modelroutes
from modelroutes import modelroutes



Expand Down Expand Up @@ -59,11 +62,102 @@ def enforce_https_in_production():
secure_url = request.url.replace("http://", "https://", 1)
return redirect(secure_url, code=301)

answer_logger = Logger()


def getUserName():
return current_user.username
answer_logger = Logger()
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

BENCHMARK_FILES = {
"near": os.path.join(PROJECT_ROOT, "semantic_benchmark_near_eng.csv"),
"far": os.path.join(PROJECT_ROOT, "semantic_benchmark_far_eng.csv"),
}


def _load_benchmark_rows():
    """Read the near/far benchmark CSV files into memory.

    Returns a dict mapping each dataset label ("near"/"far") to a list of
    row dicts with numeric fields coerced to float/int; a missing file
    maps to an empty list so the app can still start without the data.
    """
    def _num(raw, cast):
        # Empty or missing CSV cells fall back to 0 before casting.
        return cast(raw or 0)

    def _row(idx, raw, label):
        # Normalize one csv.DictReader row into the shape the rating UI uses.
        return {
            "row_index": idx,
            "ltl_formula": raw.get("ltl_formula", ""),
            "english_translation": raw.get("english_translation", ""),
            "closest_mutant_formula": raw.get("closest_mutant_formula", ""),
            "closest_mutant_english": raw.get("closest_mutant_english", ""),
            "closest_mutant_misconception": raw.get("closest_mutant_misconception", ""),
            "closest_distance": _num(raw.get("closest_distance", 0), float),
            "max_distance": _num(raw.get("max_distance", 0), float),
            "avg_distance": _num(raw.get("avg_distance", 0), float),
            # num_mutants may be serialized as "15.0"; go through float first.
            "num_mutants": int(_num(raw.get("num_mutants", 0), float)),
            "dataset": label
        }

    data = {}
    for label, path in BENCHMARK_FILES.items():
        if not os.path.exists(path):
            data[label] = []
            continue

        # newline='' is required by the csv module for correct quoting/newlines.
        with open(path, newline='') as csvfile:
            data[label] = [
                _row(idx, raw, label)
                for idx, raw in enumerate(csv.DictReader(csvfile))
            ]
    return data


BENCHMARK_ROWS = _load_benchmark_rows()


def _highlight_differences(text_a, text_b):
    """Return a (markup_a, markup_b) pair: both texts rebuilt with the
    word runs that differ wrapped in <mark> tags, from a token-level diff."""
    words_a = text_a.split()
    words_b = text_b.split()
    ops = SequenceMatcher(None, words_a, words_b).get_opcodes()

    def render(words, left_side):
        rendered = []
        for tag, a1, a2, b1, b2 in ops:
            # Each opcode carries index ranges for both sides; pick ours.
            chunk = words[a1:a2] if left_side else words[b1:b2]
            if not chunk:
                continue
            # Escape user-derived tokens before embedding them in HTML.
            text = " ".join(escape(word) for word in chunk)
            rendered.append(text if tag == 'equal' else f"<mark>{text}</mark>")
        return Markup(" ".join(rendered))

    return render(words_a, True), render(words_b, False)


def _next_index_for_dataset(user_id, dataset):
    """Return the first row index in `dataset` the user has not rated yet,
    or None once every row has been rated."""
    rows = BENCHMARK_ROWS.get(dataset, [])
    rated = answer_logger.getRatedSentencePairIndices(user_id, dataset)
    unrated = (idx for idx in range(len(rows)) if idx not in rated)
    return next(unrated, None)


def _pick_next_pair(user_id):
    """Choose the next (dataset, index, row) triple for the user to rate.

    When both datasets still have unrated rows, the dataset is picked at
    random to interleave them; returns (None, None, None) when done.
    """
    candidates = {}
    for label in ("near", "far"):
        nxt = _next_index_for_dataset(user_id, label)
        if nxt is not None:
            candidates[label] = nxt

    if not candidates:
        return None, None, None

    if len(candidates) == 2:
        dataset = random.choice(["near", "far"])
    else:
        # Exactly one dataset still has unrated rows.
        (dataset,) = candidates
    index = candidates[dataset]
    return dataset, index, BENCHMARK_ROWS[dataset][index]


def getUserName():
    # Username of the currently logged-in user (flask-login current_user proxy).
    return current_user.username

def getUserId():
    # Database id of the currently logged-in user.
    return current_user.id
Expand Down Expand Up @@ -199,6 +293,90 @@ def ltl_to_english():
except Exception as e:
return jsonify({"error": "Failed to translate formula.", "details": str(e)}), 400


@app.route('/ltltoeng/rating', methods=['GET', 'POST'])
@login_required
def ltltoeng_rating():
    """Sentence-pair rating task for the LTL-to-English benchmark.

    GET renders the next unrated base/mutant English pair with word-level
    diff highlighting and progress info; POST records one rating and
    redirects back here (post/redirect/get), so refresh can't double-log.
    """
    # NOTE(review): named user_id but holds the *username* (getUserName) —
    # confirm the logger keys ratings on username, not the numeric id.
    user_id = getUserName()
    rated_near = answer_logger.getRatedSentencePairIndices(user_id, "near")
    rated_far = answer_logger.getRatedSentencePairIndices(user_id, "far")

    # Progress is counted across both datasets combined.
    total_count = len(BENCHMARK_ROWS.get("near", [])) + len(BENCHMARK_ROWS.get("far", []))
    done_count = len(rated_near) + len(rated_far)
    progress = {"done": done_count, "total": total_count}

    if request.method == 'POST':
        dataset = request.form.get('dataset', '')
        row_index = request.form.get('row_index', '')
        likert_value = request.form.get('likert')
        # Checkbox posts 'on' when ticked and is absent otherwise.
        awkward_flag = request.form.get('awkward_flag') == 'on'
        awkward_notes = request.form.get('awkward_notes', '').strip()
        # Accept the several truthy spellings different form widgets send.
        unsure_flag = request.form.get('unsure') in ['1', 'on', 'true', 'True']

        try:
            row_index = int(row_index)
        except ValueError:
            return jsonify({"error": "Invalid rating payload"}), 400

        # "Unsure" submissions carry no Likert score; otherwise require 1-7.
        likert_rating = None
        if not unsure_flag:
            try:
                likert_rating = int(likert_value)
            except (TypeError, ValueError):
                return jsonify({"error": "Likert rating missing or invalid"}), 400
            if likert_rating < 1 or likert_rating > 7:
                return jsonify({"error": "Likert rating must be between 1 and 7"}), 400

        # Validate the (dataset, row_index) pair against the loaded benchmark.
        rows = BENCHMARK_ROWS.get(dataset)
        if rows is None or row_index < 0 or row_index >= len(rows):
            return jsonify({"error": "Unknown benchmark item"}), 400

        pair = rows[row_index]

        # Log the full context (formulas, distances) alongside the rating so
        # the CSV rows don't need to be re-joined during analysis.
        answer_logger.logSentencePairRating({
            "user_id": user_id,
            "dataset": dataset,
            "row_index": row_index,
            "base_english": pair["english_translation"],
            "mutant_english": pair["closest_mutant_english"],
            "base_ltl": pair["ltl_formula"],
            "mutant_ltl": pair["closest_mutant_formula"],
            "misconception": pair["closest_mutant_misconception"],
            "likert_rating": likert_rating,
            "awkward_flag": awkward_flag,
            "awkward_notes": awkward_notes,
            "unsure": unsure_flag,
            "closest_distance": pair["closest_distance"],
            "max_distance": pair["max_distance"],
            "avg_distance": pair["avg_distance"],
            "num_mutants": pair["num_mutants"],
        })

        # Redirect-after-POST: the follow-up GET serves the next pair.
        return redirect('/ltltoeng/rating')

    # GET: pick the next unrated pair (random dataset when both remain).
    dataset, row_index, pair = _pick_next_pair(user_id)
    if pair is None:
        # Everything rated — render the completion state.
        return render_template(
            'ltltoeng_rating.html',
            uid=user_id,
            done=True,
            progress=progress
        )

    # Word-level diff markup so the rater can spot the mutation at a glance.
    highlighted_base, highlighted_mutant = _highlight_differences(pair["english_translation"], pair["closest_mutant_english"])

    return render_template(
        'ltltoeng_rating.html',
        uid=user_id,
        done=False,
        pair=pair,
        dataset=dataset,
        row_index=row_index,
        highlighted_base=highlighted_base,
        highlighted_mutant=highlighted_mutant,
        progress=progress
    )

return jsonify({"formula": formula, "english": english})


Expand Down
Loading