diff --git a/experiments/benchmark_builder.py b/experiments/benchmark_builder.py index 48d2482..bb92146 100644 --- a/experiments/benchmark_builder.py +++ b/experiments/benchmark_builder.py @@ -185,6 +185,7 @@ def build_tiered_benchmark(n_formulas_per_tier, max_mutants_per_formula=15, near_eng_threshold=0.2, far_eng_threshold=0.45, + min_similarity_threshold=1e-4, misconception_weights=None, seed=42, max_candidates=10000, @@ -205,6 +206,7 @@ def build_tiered_benchmark(n_formulas_per_tier, max_mutants_per_formula: Maximum mutants per formula near_eng_threshold: Max distance for closest mutant (paraphrases) far_eng_threshold: Min distance for closest mutant (very different) + min_similarity_threshold: Min distance for closest mutant (reject if below, default 1e-4) misconception_weights: Dict mapping misconception names to weights (0-1). Default is uniform distribution. Higher weight = more samples. Set to None to disable distribution control. @@ -231,6 +233,7 @@ def build_tiered_benchmark(n_formulas_per_tier, print(f" Target per tier: {n_formulas_per_tier} formulas") print(f" Near-English: closest mutant distance < {near_eng_threshold}") print(f" Far-English: closest mutant distance > {far_eng_threshold}") + print(f" Min similarity: closest mutant distance >= {min_similarity_threshold}") print(f" Misconception distribution: {'uniform' if misconception_weights == DEFAULT_MISCONCEPTION_WEIGHTS else 'custom'}") print(f" Max candidates to try: {max_candidates}\n") @@ -435,6 +438,12 @@ def has_enough_for_distribution(pool): max_distance = max(distances) avg_distance = np.mean(distances) + # Skip templates where the closest mutant is too similar (nearly identical English) + if min_distance < min_similarity_threshold: + if verbose: + print(f" Skipping template '{template_formula}' due to too-similar mutant (distance {min_distance:.6f})") + continue + record = { 'ltl_formula': template_formula, 'english_translation': candidate_english, @@ -524,8 +533,9 @@ def has_enough_for_distribution(pool): max_distance = max(distances) avg_distance = np.mean(distances) - # Skip formulas where the closest mutant has distance 0 (identical English) - if min_distance == 0: + # Skip formulas where the closest mutant is too similar (nearly identical English) + if min_distance < min_similarity_threshold: + print(f" Skipping formula '{candidate_formula}' due to too-similar mutant (distance {min_distance:.6f})") continue # Determine tier based on closest mutant distance @@ -671,6 +681,7 @@ def main(): parser.add_argument("--n-per-tier", type=int, default=100, help="Target formulas per tier") parser.add_argument("--near-eng-threshold", type=float, default=0.2, help="Near-English distance threshold") parser.add_argument("--far-eng-threshold", type=float, default=0.45, help="Far-English distance threshold") + parser.add_argument("--min-similarity-threshold", type=float, default=1e-4, help="Minimum similarity threshold (reject if below)") parser.add_argument("--seed", type=int, default=42, help="Random seed") parser.add_argument("--max-candidates", type=int, default=5000, help="Max candidates to try") parser.add_argument("--output-prefix", default="ltl_benchmark", help="Output file prefix") @@ -682,6 +693,7 @@ def main(): n_formulas_per_tier=args.n_per_tier, near_eng_threshold=args.near_eng_threshold, far_eng_threshold=args.far_eng_threshold, + min_similarity_threshold=args.min_similarity_threshold, seed=args.seed, max_candidates=args.max_candidates, verbose=not args.quiet, diff --git a/semantic_benchmark_far_eng.csv b/semantic_benchmark_far_eng.csv new file mode 100644 index 0000000..c6092f8 --- /dev/null +++ b/semantic_benchmark_far_eng.csv @@ -0,0 +1,101 @@ +ltl_formula,english_translation,closest_mutant_formula,closest_mutant_english,closest_mutant_misconception,closest_distance,max_distance,avg_distance,num_mutants +p3 | Gp1,Either 'p3' or 'p1' must always hold,(p3 | (F p1)),"Either 'p3' or at some point, 'p1' will hold",BadStateQuantification,0.45043107867240906,0.5518139600753784,0.4953062931696574,3 +(p0 U ((! p0) | (! p3))),'p0' until not both 'p0' and 'p3',((p0 U ((! p0) | (! p3))) | (G p0)),Either 'p0' until not both 'p0' and 'p3' or 'p0' must always hold,WeakU,0.45751529932022095,0.781352698802948,0.5453808307647705,4 +!p3 | Xp0,"Either 'p3' is false or in the next step, 'p0'",((! p3) | (F p0)),"Either 'p3' is false or at some point, 'p0' will hold",OtherImplicit,0.46314603090286255,0.46314603090286255,0.46314603090286255,1 +(p2 U ((! p2) | p2)),'p2' until either 'p2' is false or 'p2',(p2 U p2),'p2' until 'p2',ExclusiveU,0.46673595905303955,0.46673595905303955,0.46673595905303955,1 +X(p3 & FGp2),"In the next step, both 'p3' and eventually, 'p2' will always be true",(X (p3 & (F p2))),"In the next step, both 'p3' and at some point, 'p2' will hold",ImplicitG,0.46724972128868103,0.5417690873146057,0.487897090613842,4 +X((p1 & p2) | (!p1 & !p2)),"In the next step, either both 'p1' and 'p2' or neither 'p1' nor 'p2'",(X (((p1 & p2) | (! p1)) & (! p2))),"In the next step, both either both 'p1' and 'p2' or 'p1' is false and 'p2' is false",Precedence,0.49869200587272644,0.5975241661071777,0.5481080859899521,2 +F(p2 & p3),"Eventually, both 'p2' and 'p3' will be true simultaneously",(p2 & p3),Both 'p2' and 'p3',ImplicitF,0.6440837979316711,0.7360519766807556,0.6747398575146993,3 +p1 | G!p0,Either 'p1' or 'p0' will never occur,(p1 | (F (! p0))),"Either 'p1' or eventually, not 'p0'",BadStateQuantification,0.4521879255771637,0.7287090420722961,0.6086266140143076,3 +p3 U p2,'p3' until 'p2',((p3 U p2) | (G p3)),Either 'p3' until 'p2' or 'p3' must always hold,WeakU,0.4602723717689514,0.616101861000061,0.5230183998743693,3 +((G p2) U ((G p2) -> (F p2))),"'p2' must always hold until if 'p2' must always hold, then at some point, 'p2' will hold",(G p2),'p2' must always hold,OtherImplicit,0.46611276268959045,0.6317513585090637,0.5489320605993271,2 +XG(p0 & p3),"In the next step, at all times, both 'p0' and 'p3' hold",(X (p0 & p3)),"In the next step, both 'p0' and 'p3'",ImplicitG,0.49857398867607117,0.5757542252540588,0.5293595294157664,3 +(p3 U (p3 -> p3)),"'p3' until if 'p3' holds, then 'p3' holds",(p3 U p3),'p3' until 'p3',ExclusiveU,0.5209833383560181,0.5209833383560181,0.5209833383560181,1 +F(p0 & p3),"Eventually, both 'p0' and 'p3' will be true simultaneously",(p0 & p3),Both 'p0' and 'p3',ImplicitF,0.6623632311820984,0.7488532662391663,0.6911932428677877,3 +p2 | Gp3,Either 'p2' or 'p3' must always hold,(p2 | (F p3)),"Either 'p2' or at some point, 'p3' will hold",BadStateQuantification,0.4526665508747101,0.741706132888794,0.588632196187973,3 +p1 U p0,'p1' until 'p0',((p1 U p0) | (G p1)),Either 'p1' until 'p0' or 'p1' must always hold,WeakU,0.46299681067466736,0.6431487202644348,0.5307719111442566,3 +!p0 & Xp3,"Both 'p0' is false and in the next step, 'p3'",((! p0) & (F p3)),"Both 'p0' is false and at some point, 'p3' will hold",OtherImplicit,0.4761523902416229,0.4761523902416229,0.4761523902416229,1 +XG(p2 & p3),"In the next step, at all times, both 'p2' and 'p3' hold",(X (p2 & p3)),"In the next step, both 'p2' and 'p3'",ImplicitG,0.5027826428413391,0.5630331635475159,0.533862849076589,3 +(p1 U (p1 -> p1)),"'p1' until if 'p1' holds, then 'p1' holds",(p1 U p1),'p1' until 'p1',ExclusiveU,0.5512495636940002,0.8403921127319336,0.6958208382129669,2 +FGp2,"Eventually, 'p2' will always be true",(G p2),'p2' must always hold,ImplicitF,0.6659942865371704,0.7746404409408569,0.6934536993503571,4 +p1 | (p1 U p2),Either 'p1' or 'p1' until 'p2',(p1 | (p1 U (F p2))),"Either 'p1' or 'p1' until at some point, 'p2' will hold",BadStateQuantification,0.4548565745353699,0.7249919772148132,0.5574564039707184,3 +!p3 | Xp3,"Either 'p3' is false or in the next step, 'p3'",((! p3) | (F p3)),"Either 'p3' is false or at some point, 'p3' will hold",OtherImplicit,0.47656193375587463,0.47656193375587463,0.47656193375587463,1 +p1 U p2,'p1' until 'p2',((p1 U p2) | (G p1)),Either 'p1' until 'p2' or 'p1' must always hold,WeakU,0.4799743890762329,0.7211152911186218,0.5797340075174967,3 +G(p1 | p2),Always have either 'p1' or 'p2',(p1 | p2),At least one of 'p1' or 'p2',ImplicitG,0.5046390295028687,0.6220009326934814,0.5437596638997396,3 +(p0 U (p0 -> p0)),"'p0' until if 'p0' holds, then 'p0' holds",(p0 U p0),'p0' until 'p0',ExclusiveU,0.5660205483436584,0.5660205483436584,0.5660205483436584,1 +F(p0 & p2),"Eventually, both 'p0' and 'p2' will be true simultaneously",(p0 & p2),Both 'p0' and 'p2',ImplicitF,0.6757736802101135,0.7607364654541016,0.7040946086247762,3 +!p2 | p3 | GFp1,Either 'p2' is false or 'p3' or 'p1' will happen infinitely often,(((! p2) | p3) | (F (F p1))),"Either 'p2' is false or 'p3' or eventually, 'p1'",BadStateQuantification,0.45498692989349365,0.5934218168258667,0.542361855506897,4 +!p1 | p3,Either 'p1' is false or 'p3',(! p1),'p1' is false,OtherImplicit,0.4804563522338867,0.4804563522338867,0.4804563522338867,1 +p0 U p1,'p0' until 'p1',((p0 U p1) | (G p0)),Either 'p0' until 'p1' or 'p0' must always hold,WeakU,0.49506258964538574,0.7117910385131836,0.5696965654691061,3 +XG(p1 & p2),"In the next step, at all times, both 'p1' and 'p2' hold",(X (p1 & p2)),"In the next step, both 'p1' and 'p2'",ImplicitG,0.5117736458778381,0.5819584727287292,0.5519059896469116,3 +F(p0 & p1),"Eventually, both 'p0' and 'p1' will be true simultaneously",(p0 & p1),Both 'p0' and 'p1',ImplicitF,0.6860395073890686,0.7655901908874512,0.7125564018885294,3 +p3 | G!p3,Either 'p3' or 'p3' will never occur,(p3 | (F (! p3))),"Either 'p3' or eventually, not 'p3'",BadStateQuantification,0.4570067822933197,0.6578147411346436,0.571741650501887,3 +X(p0 | !p1 | !p2 | p3),"In the next step, either either 'p0' or 'p1' is false or 'p2' is false or 'p3'",(F (((p0 | (! p1)) | (! p2)) | p3)),"Eventually, either either 'p0' or 'p1' is false or 'p2' is false or 'p3'",OtherImplicit,0.5055903196334839,0.5055903196334839,0.5055903196334839,1 +G(p1 & p3),"At all times, both 'p1' and 'p3' hold",(p1 & p3),Both 'p1' and 'p3',ImplicitG,0.6503925323486328,0.7537016868591309,0.6848289171854655,3 +p3 & G!p2,Both 'p3' and 'p2' will never occur,(p3 & (F (! p2))),"Both 'p3' and eventually, not 'p2'",BadStateQuantification,0.46759116649627686,0.5926995873451233,0.5327227910359701,3 +((G p3) U (p3 & (G ((! p3) -> p2)))),'p3' must always hold until both 'p3' and it is always the case that 'p2' unless 'p3',(G p3),'p3' must always hold,OtherImplicit,0.5538555383682251,0.5538555383682251,0.5538555383682251,1 +FGp1,"Eventually, 'p1' will always be true",(F p1),"At some point, 'p1' will hold",ImplicitG,0.6717284917831421,0.8013848066329956,0.7046237289905548,4 +!p3 | GFp1,Either 'p3' is false or 'p1' will happen infinitely often,((! p3) | (F (F p1))),"Either 'p3' is false or eventually, 'p1'",BadStateQuantification,0.47198814153671265,0.632379949092865,0.5645895153284073,4 +p1 & p2,Both 'p1' and 'p2',p1,'p1',OtherImplicit,0.5831860303878784,0.5831860303878784,0.5831860303878784,1 +FGp3,"Eventually, 'p3' will always be true",(F p3),"At some point, 'p3' will hold",ImplicitG,0.6736251711845398,0.7966910004615784,0.7063409090042114,4 +p1 & F!p2,"Both 'p1' and eventually, not 'p2'",(p1 & (G (! p2))),Both 'p1' and 'p2' will never occur,BadStateQuantification,0.47616830468177795,0.685285747051239,0.6001089910666147,3 +X(p0 | !p1),"In the next step, either 'p0' or 'p1' is false",(F (p0 | (! p1))),"Eventually, either 'p0' or 'p1' is false",OtherImplicit,0.595894992351532,0.595894992351532,0.595894992351532,1 +G(p1 & p2),"At all times, both 'p1' and 'p2' hold",(p1 & p2),Both 'p1' and 'p2',ImplicitG,0.6804438829421997,0.7751515507698059,0.7120131055514017,3 +F!p3,"Eventually, not 'p3'",(G (! p3)),'p3' will never occur,BadStateQuantification,0.49604660272598267,0.6771801710128784,0.6168023149172465,3 +p0 & p3,Both 'p0' and 'p3',p3,'p3',OtherImplicit,0.5999793410301208,0.5999793410301208,0.5999793410301208,1 +G!p3,'p3' will never occur,(F (! p3)),"Eventually, not 'p3'",BadStateQuantification,0.49604660272598267,0.6052417159080505,0.5688433448473612,3 +FGp0,"Eventually, 'p0' will always be true",(F p0),"At some point, 'p0' will hold",ImplicitG,0.7481658458709717,0.8434385061264038,0.7759364545345306,4 +p2 & p3,Both 'p2' and 'p3',p2,'p2',OtherImplicit,0.6012242436408997,0.6012242436408997,0.6012242436408997,1 +p0 U p2,'p0' until 'p2',((F p0) U p2),"At some point, 'p0' will hold until 'p2'",BadStateQuantification,0.49832701683044434,0.746719241142273,0.5830078721046448,3 +p0 | p1,At least one of 'p0' or 'p1',p1,'p1',OtherImplicit,0.6131643652915955,0.6131643652915955,0.6131643652915955,1 +Fp3,"At some point, 'p3' will hold",(G p3),'p3' must always hold,BadStateQuantification,0.498857319355011,0.6736803650856018,0.6154060165087382,3 +p1 | p3 | Xp1,"Either at least one of 'p1' or 'p3' or in the next step, 'p1'",((p1 | p3) | (F p1)),"Either at least one of 'p1' or 'p3' or at some point, 'p1' will hold",OtherImplicit,0.6208100914955139,0.6208100914955139,0.6208100914955139,1 +Gp3,'p3' must always hold,(F p3),"At some point, 'p3' will hold",BadStateQuantification,0.498857319355011,0.5874304175376892,0.5579060514767965,3 +p0 | p3,At least one of 'p0' or 'p3',p3,'p3',OtherImplicit,0.6339837312698364,0.6339837312698364,0.6339837312698364,1 +Fp2,"At some point, 'p2' will hold",(G p2),'p2' must always hold,BadStateQuantification,0.5048030018806458,0.6781956553459167,0.6203981041908264,3 +!p2,'p2' is false,p2,'p2',OtherImplicit,0.6409874558448792,0.6409874558448792,0.6409874558448792,1 +Gp2,'p2' must always hold,(F p2),"At some point, 'p2' will hold",BadStateQuantification,0.5048030018806458,0.5831425189971924,0.5570293466250101,3 +!p3,'p3' is false,p3,'p3',OtherImplicit,0.6421687602996826,0.6421687602996826,0.6421687602996826,1 +G!p2,'p2' will never occur,(F (! p2)),"Eventually, not 'p2'",BadStateQuantification,0.5071642994880676,0.6483583450317383,0.601293663183848,3 +p0 | p2 | Xp2,"Either at least one of 'p0' or 'p2' or in the next step, 'p2'",((p0 | p2) | (F p2)),"Either at least one of 'p0' or 'p2' or at some point, 'p2' will hold",OtherImplicit,0.6449719667434692,0.6449719667434692,0.6449719667434692,1 +F!p2,"Eventually, not 'p2'",(G (! p2)),'p2' will never occur,BadStateQuantification,0.5071642994880676,0.6714813709259033,0.6167090137799581,3 +p3 | X!p3,"Either 'p3' or in the next step, 'p3' is false",(p3 | (F (! p3))),"Either 'p3' or eventually, not 'p3'",OtherImplicit,0.6540914177894592,0.6540914177894592,0.6540914177894592,1 +F!p0,"Eventually, not 'p0'",(G (! p0)),'p0' will never occur,BadStateQuantification,0.5171661376953125,0.6988669633865356,0.6383000214894613,3 +!p1,'p1' is false,p1,'p1',OtherImplicit,0.6591484546661377,0.6591484546661377,0.6591484546661377,1 +G!p0,'p0' will never occur,(F (! p0)),"Eventually, not 'p0'",BadStateQuantification,0.5171661376953125,0.701252281665802,0.6398902336756388,3 +X(p0 | p2),"In the next step, at least one of 'p0' or 'p2'",(F (p0 | p2)),"Eventually, at least one of 'p0' or 'p2'",OtherImplicit,0.667004406452179,0.667004406452179,0.667004406452179,1 +Fp1,"At some point, 'p1' will hold",(G p1),'p1' must always hold,BadStateQuantification,0.5173208117485046,0.6866223216056824,0.6301884849866232,3 +Gp1,'p1' must always hold,(F p1),"At some point, 'p1' will hold",BadStateQuantification,0.5173208117485046,0.6117421984672546,0.580268402894338,3 +!p0,'p0' is false,p0,'p0',OtherImplicit,0.6832073926925659,0.6832073926925659,0.6832073926925659,1 +F!p1,"Eventually, not 'p1'",(G (! p1)),'p1' will never occur,BadStateQuantification,0.518254280090332,0.6831350326538086,0.6281747817993164,3 +p2 | X!p2,"Either 'p2' or in the next step, 'p2' is false",(p2 | (F (! p2))),"Either 'p2' or eventually, not 'p2'",OtherImplicit,0.6898571848869324,0.6898571848869324,0.6898571848869324,1 +G!p1,'p1' will never occur,(F (! p1)),"Eventually, not 'p1'",BadStateQuantification,0.518254280090332,0.6784943342208862,0.6250809828440348,3 +p2 & Xp3,"Both 'p2' and in the next step, 'p3'",(p2 & (F p3)),"Both 'p2' and at some point, 'p3' will hold",OtherImplicit,0.6917235851287842,0.6917235851287842,0.6917235851287842,1 +Fp0,"At some point, 'p0' will hold",(G p0),'p0' must always hold,BadStateQuantification,0.5208287835121155,0.7393346428871155,0.6664993564287821,3 +p1 | p2,At least one of 'p1' or 'p2',p2,'p2',OtherImplicit,0.7012022137641907,0.7012022137641907,0.7012022137641907,1 +Gp0,'p0' must always hold,(F p0),"At some point, 'p0' will hold",BadStateQuantification,0.5208287835121155,0.6559706330299377,0.6109233498573303,3 +p3 | Xp3,"Either 'p3' or in the next step, 'p3'",(p3 | (F p3)),"Either 'p3' or at some point, 'p3' will hold",OtherImplicit,0.7160053849220276,0.7160053849220276,0.7160053849220276,1 +F(p3 & Fp1),"Eventually, both 'p3' and at some point, 'p1' will hold will be true simultaneously",(G (p3 & (F p1))),"At all times, both 'p3' and at some point, 'p1' will hold",BadStateQuantification,0.5369850397109985,0.5416533350944519,0.5400972366333008,3 +p0 | Xp1,"Either 'p0' or in the next step, 'p1'",(p0 | (F p1)),"Either 'p0' or at some point, 'p1' will hold",OtherImplicit,0.716743528842926,0.716743528842926,0.716743528842926,1 +p0 | Xp2,"Either 'p0' or in the next step, 'p2'",(p0 | (F p2)),"Either 'p0' or at some point, 'p2' will hold",OtherImplicit,0.7250054478645325,0.7250054478645325,0.7250054478645325,1 +p2 & FGp3,"Both 'p2' and eventually, 'p3' will always be true",(p2 & (G (G p3))),"Both 'p2' and at all times, 'p3'",BadStateQuantification,0.5996398329734802,0.7897274494171143,0.6651566028594971,4 +!p0 | p3,Either 'p0' is false or 'p3',p3,'p3',OtherImplicit,0.7462073564529419,0.7462073564529419,0.7462073564529419,1 +p2 | FGp3,"Either 'p2' or eventually, 'p3' will always be true",(p2 | (G (G p3))),"Either 'p2' or at all times, 'p3'",BadStateQuantification,0.6310169696807861,0.7770109176635742,0.6712877005338669,4 +p0 | Xp0,"Either 'p0' or in the next step, 'p0'",(p0 | (F p0)),"Either 'p0' or at some point, 'p0' will hold",OtherImplicit,0.7491868138313293,0.7491868138313293,0.7491868138313293,1 +p0 | FGp2,"Either 'p0' or eventually, 'p2' will always be true",(p0 | (G (G p2))),"Either 'p0' or at all times, 'p2'",BadStateQuantification,0.6483875513076782,0.8493967056274414,0.7040949761867523,4 +X!p3,"In the next step, 'p3' is false",(F (! p3)),"Eventually, not 'p3'",OtherImplicit,0.7590113878250122,0.7590113878250122,0.7590113878250122,1 +GFp3,'p3' will happen infinitely often,(F (F p3)),"Eventually, 'p3'",BadStateQuantification,0.6577970385551453,0.738743245601654,0.7151447683572769,4 +p2 | Xp2,"Either 'p2' or in the next step, 'p2'",(p2 | (F p2)),"Either 'p2' or at some point, 'p2' will hold",OtherImplicit,0.7597582340240479,0.7597582340240479,0.7597582340240479,1 +GFp2,'p2' will happen infinitely often,(F (F p2)),"Eventually, 'p2'",BadStateQuantification,0.6619113683700562,0.7502215504646301,0.725648507475853,4 +p1 | Xp1,"Either 'p1' or in the next step, 'p1'",(p1 | (F p1)),"Either 'p1' or at some point, 'p1' will hold",OtherImplicit,0.7647441029548645,0.7647441029548645,0.7647441029548645,1 +GFp1,'p1' will happen infinitely often,(F (F p1)),"Eventually, 'p1'",BadStateQuantification,0.695120632648468,0.7743967175483704,0.7534150779247284,4 +p1 | !p2,Either 'p1' or 'p2' is false,p1,'p1',OtherImplicit,0.7699452042579651,0.7699452042579651,0.7699452042579651,1 +GFp0,'p0' will happen infinitely often,(F (F p0)),"Eventually, 'p0'",BadStateQuantification,0.6987221240997314,0.8137977123260498,0.775435209274292,4 +p1 | !p3,Either 'p1' or 'p3' is false,p1,'p1',OtherImplicit,0.7796639800071716,0.7796639800071716,0.7796639800071716,1 +X!p2,"In the next step, 'p2' is false",(F (! p2)),"Eventually, not 'p2'",OtherImplicit,0.7816904783248901,0.7816904783248901,0.7816904783248901,1 +!p0 & p3,Both 'p0' is false and 'p3',p3,'p3',OtherImplicit,0.7827446460723877,0.7827446460723877,0.7827446460723877,1 +X!p1,"In the next step, 'p1' is false",(F (! p1)),"Eventually, not 'p1'",OtherImplicit,0.7882987260818481,0.7882987260818481,0.7882987260818481,1 +X!p0,"In the next step, 'p0' is false",(F (! p0)),"Eventually, not 'p0'",OtherImplicit,0.8068245053291321,0.8068245053291321,0.8068245053291321,1 +Xp3,"In the next step, 'p3'",(F p3),"At some point, 'p3' will hold",OtherImplicit,0.8077911734580994,0.8077911734580994,0.8077911734580994,1 +Xp2,"In the next step, 'p2'",(F p2),"At some point, 'p2' will hold",OtherImplicit,0.8097606897354126,0.8097606897354126,0.8097606897354126,1 +p0 | !p2,Either 'p0' or 'p2' is false,p0,'p0',OtherImplicit,0.8130448460578918,0.8130448460578918,0.8130448460578918,1 diff --git a/semantic_benchmark_near_eng.csv b/semantic_benchmark_near_eng.csv new file mode 100644 index 0000000..e54a594 --- /dev/null +++ b/semantic_benchmark_near_eng.csv @@ -0,0 +1,101 @@ +ltl_formula,english_translation,closest_mutant_formula,closest_mutant_english,closest_mutant_misconception,closest_distance,max_distance,avg_distance,num_mutants +(p0 U p1) & (p1 U p0),Both 'p0' until 'p1' and 'p1' until 'p0',((p0 U p1) & (p0 U p1)),Both 'p0' until 'p1' and 'p0' until 'p1',BadStateQuantification,0.023236310109496117,0.387821763753891,0.27713817497715354,4 +(Fp2 & X(p1 | !p3)) | (G!p2 & X(!p1 & p3)),"Either both at some point, 'p2' will hold and in the next step, either 'p1' or 'p3' is false or both 'p2' will never occur and in the next step, both 'p1' is false and 'p3'",(((F p2) & (X (p1 | (! p3)))) | ((G (! p2)) & ((X (! p1)) & p3))),"Either both at some point, 'p2' will hold and in the next step, either 'p1' or 'p3' is false or both 'p2' will never occur and both in the next step, 'p1' is false and 'p3'",BadStateIndex,0.02979588322341442,0.3414227068424225,0.19458286557346582,6 +(X (((G p2) & p0) & ((X p2) | p1))),"In the next step, both 'p2' must always hold and 'p0' and either in the next step, 'p2' or 'p1'",(X ((((G p2) & p0) & (X p2)) | p1)),"In the next step, either both 'p2' must always hold and 'p0' and in the next step, 'p2' or 'p1'",Precedence,0.033475689589977264,0.4500802159309387,0.296018673107028,4 +(((F p3) -> p2) U ((G p2) & (G ((G p1) | p0)))),"If at some point, 'p3' will hold, then 'p2' holds until both 'p2' must always hold and always have either 'p1' must always hold or 'p0'",(((F p3) -> p2) U ((G p2) & ((G p1) | p0))),"If at some point, 'p3' will hold, then 'p2' holds until both 'p2' must always hold and either 'p1' must always hold or 'p0'",ImplicitG,0.05513975769281387,0.4693278670310974,0.20608597621321678,6 +((((F p2) -> p3) U (p2 -> p1)) & (G (! (((F p2) -> p3) & (p2 -> p1))))),"Both if at some point, 'p2' will hold, then 'p3' until if 'p2' holds, then 'p1' holds and it is never the case that both if at some point, 'p2' will hold, then 'p3' and if 'p2', then 'p1' holds",((((F p2) -> p3) U (p2 -> p1)) & (G (! ((p2 -> p3) & (p2 -> p1))))),"Both if at some point, 'p2' will hold, then 'p3' until if 'p2' holds, then 'p1' holds and it is never the case that both if 'p2', then 'p3' and if 'p2' holds, then 'p1' holds",ImplicitF,0.077869713306427,0.2907422184944153,0.17991345801523753,7 +(((p2 | p3) U ((! p0) -> p1)) & (G ((p2 | p3) -> (! ((! p0) -> p1))))),"Both at least one of 'p2' or 'p3' hold until 'p1' unless 'p0' and if at least one of 'p2' or 'p3', then not 'p1' unless 'p0' is always true",((((p2 | p3) U ((! p0) -> p1)) | (G (p2 | p3))) & (G ((p2 | p3) -> (! ((! p0) -> p1))))),"Both either at least one of 'p2' or 'p3' hold until 'p1' unless 'p0' or always have either 'p2' or 'p3' and if at least one of 'p2' or 'p3', then not 'p1' unless 'p0' is always true",WeakU,0.10272090882062912,0.42701253294944763,0.2642976976931095,6 +G(FGp3 U p3),"Eventually, 'p3' will always be true until 'p3' is always true",((F (G p3)) U p3),"Eventually, 'p3' will always be true until 'p3'",OtherImplicit,0.11623762547969818,0.4842790961265564,0.27479931712150574,4 +((! p3) U ((! (! p3)) & (p2 | p3))),'p3' is false until both 'p3' and at least one of 'p2' or 'p3',((! p3) U (p2 | p3)),'p3' is false until at least one of 'p2' or 'p3',ExclusiveU,0.12543898820877075,0.46455007791519165,0.24368559196591377,4 +(p0 U p3) | (p1 U p3),Either 'p0' until 'p3' or 'p1' until 'p3',((p0 U p3) | (p3 U p1)),Either 'p0' until 'p3' or 'p3' until 'p1',BadStateQuantification,0.02593476139008999,0.4019530117511749,0.2798178284429014,4 +(Fp0 U p1) U (p2 U p0),"At some point, 'p0' will hold until 'p1', and this continues until 'p2' until 'p0'",((((F p0) U p1) U p2) U p0),"At some point, 'p0' will hold until 'p1' until 'p2', and this continues until 'p0'",Precedence,0.03628356382250786,0.3988723158836365,0.27059474512934684,5 +X(G!p3 | X(!p0 & G!p2)),"In the next step, either 'p3' will never occur or in the next step, both 'p0' is false and 'p2' will never occur",(X ((G (! p3)) | ((X (! p0)) & (G (! p2))))),"In the next step, either 'p3' will never occur or both in the next step, 'p0' is false and 'p2' will never occur",BadStateIndex,0.055499814450740814,0.28529950976371765,0.1977731455117464,4 +(((F p2) U (p1 -> p0)) & (G ((F p2) -> (! (p1 -> p0))))),"Both at some point, 'p2' will hold until if 'p1', then 'p0' holds and at all times, if at some point, 'p2' will hold, then not if 'p1', then 'p0'",(((F p2) U (p1 -> p0)) & ((F p2) -> (! (p1 -> p0)))),"Both at some point, 'p2' will hold until if 'p1', then 'p0' holds and if at some point, 'p2' will hold, then not if 'p1', then 'p0'",ImplicitG,0.06012389808893204,0.34225183725357056,0.18971148026841028,7 +(p2 U ((p0 -> p3) & (F ((F p0) -> p3)))),"'p2' until both if 'p0' holds, then 'p3' holds and eventually, if at some point, 'p0' will hold, then 'p3' holds",(p2 U ((p0 -> p3) & ((F p0) -> p3))),"'p2' until both if 'p0' holds, then 'p3' holds and if at some point, 'p0' will hold, then 'p3' holds",ImplicitF,0.08821691572666168,0.3291768729686737,0.2209063172340393,4 +((((F p3) | p1) U (F p3)) & (G (((F p3) | p1) -> (! (F p3))))),"Both either at some point, 'p3' will hold or 'p1' hold until at some point, 'p3' will hold and at all times, if either at some point, 'p3' will hold or 'p1', then not at some point, 'p3' will hold",(((((F p3) | p1) U (F p3)) | (G ((F p3) | p1))) & (G (((F p3) | p1) -> (! (F p3))))),"Both either at some point, 'p3' will hold or 'p1' hold until at some point, 'p3' will hold or always have either at some point, 'p3' will hold or 'p1' and at all times, if either at some point, 'p3' will hold or 'p1', then not at some point, 'p3' will hold",WeakU,0.10320486128330231,0.2763770818710327,0.21865300834178925,3 +G(FGp3 U p2),"Eventually, 'p3' will always be true until 'p2' is always true",((F (G p3)) U p2),"Eventually, 'p3' will always be true until 'p2'",OtherImplicit,0.11845925450325012,0.4850548803806305,0.3048924088478088,5 +((! p2) U ((! (! p2)) | (p0 -> p1))),"'p2' is false until either 'p2' or if 'p0' holds, then 'p1' holds",((! p2) U (p0 -> p1)),"'p2' is false until if 'p0' holds, then 'p1' holds",ExclusiveU,0.14683017134666443,0.47526925802230835,0.28169373273849485,5 +Xp0 & (p3 U p2),"Both in the next step, 'p0' and 'p3' until 'p2'",((X p0) & (p2 U p3)),"Both in the next step, 'p0' and 'p2' until 'p3'",BadStateQuantification,0.032083041965961456,0.5398684740066528,0.31490558572113514,4 +XX((!p0 & ((p0 & p3) | (!p0 & !p3))) | (p0 & ((p0 & !p3) | (!p0 & p3)))),"In 2 steps, either both 'p0' is false and either both 'p0' and 'p3' or neither 'p0' nor 'p3' or both 'p0' and either both 'p0' and 'p3' is false or both 'p0' is false and 'p3'",(X (X ((((! p0) & ((p0 & p3) | ((! p0) & (! p3)))) | p0) & ((p0 & (! p3)) | ((! p0) & p3))))),"In 2 steps, both either both 'p0' is false and either both 'p0' and 'p3' or neither 'p0' nor 'p3' or 'p0' and either both 'p0' and 'p3' is false or both 'p0' is false and 'p3'",Precedence,0.03734102472662926,0.2864071726799011,0.14032170797387758,3 +F((p1 & p2) | (!p1 & !p2)) | X(p2 & p3),"Either eventually, either both 'p1' and 'p2' or neither 'p1' nor 'p2' or in the next step, both 'p2' and 'p3'",((F ((p1 & p2) | ((! p1) & (! p2)))) | ((X p2) & p3)),"Either eventually, either both 'p1' and 'p2' or neither 'p1' nor 'p2' or both in the next step, 'p2' and 'p3'",BadStateIndex,0.06327647715806961,0.5925284028053284,0.32958004623651505,5 +(((F p1) U (G p3)) & (G ((F p1) -> (! (G p3))))),"Both at some point, 'p1' will hold until 'p3' must always hold and at all times, if at some point, 'p1' will hold, then not 'p3' must always hold",(((F p1) U (G p3)) & ((F p1) -> (! (G p3)))),"Both at some point, 'p1' will hold until 'p3' must always hold and if at some point, 'p1' will hold, then not 'p3' must always hold",ImplicitG,0.06466665118932724,0.4403659999370575,0.29131780937314034,6 +((p0 -> p3) U ((F p2) & (F p3))),"If 'p0' holds, then 'p3' holds until both at some point, 'p2' will hold and at some point, 'p3' will hold",((p0 -> p3) U ((F p2) & p3)),"If 'p0' holds, then 'p3' holds until both at some point, 'p2' will hold and 'p3'",ImplicitF,0.09620845317840576,0.37508946657180786,0.23864905908703804,4 +(((G p2) U ((! p2) | p1)) & (G ((G p2) -> (! ((! p2) | p1))))),"Both 'p2' must always hold until either 'p2' is false or 'p1' and if 'p2' must always hold, then not either 'p2' is false or 'p1' is always true",((((G p2) U ((! p2) | p1)) | (G (G p2))) & (G ((G p2) -> (! ((! p2) | p1))))),"Both either 'p2' must always hold until either 'p2' is false or 'p1' or at all times, 'p2' and if 'p2' must always hold, then not either 'p2' is false or 'p1' is always true",WeakU,0.10728871822357178,0.39227768778800964,0.2693173885345459,4 +G((!p1 & (!p3 | Fp0)) | (p1 & p3 & G!p0)),"Always have either both 'p1' is false and either 'p3' is false or at some point, 'p0' will hold or both 'p1' and 'p3' and 'p0' will never occur",(((! p1) & ((! p3) | (F p0))) | ((p1 & p3) & (G (! p0)))),"Either both 'p1' is false and either 'p3' is false or at some point, 'p0' will hold or both 'p1' and 'p3' and 'p0' will never occur",OtherImplicit,0.15162089467048645,0.2360028177499771,0.19255444705486296,5 +(((F p3) & p1) U ((! ((F p3) & p1)) | (p3 -> p1))),"Both at some point, 'p3' will hold and 'p1' hold until either not both at some point, 'p3' will hold and 'p1' or if 'p3', then 'p1' holds",(((F p3) & p1) U (p3 -> p1)),"Both at some point, 'p3' will hold and 'p1' hold until if 'p3', then 'p1' holds",ExclusiveU,0.16197362542152405,0.4915367066860199,0.326755166053772,2 +(((p0 & p1) | (!p0 & !p1)) & XXG!p0) | (((p0 & !p1) | (!p0 & p1)) & XXFp0),"Either both either both 'p0' and 'p1' or neither 'p0' nor 'p1' and in 2 steps, 'p0' will never occur or both either both 'p0' and 'p1' is false or both 'p0' is false and 'p1' and in 2 steps, at some point, 'p0' will hold",(((((p0 & p1) | ((! p0) & (! p1))) & (X (X (G (! p0))))) | ((p0 & (! p1)) | ((! p0) & p1))) & (X (X (F p0)))),"Both either both either both 'p0' and 'p1' or neither 'p0' nor 'p1' and in 2 steps, 'p0' will never occur or either both 'p0' and 'p1' is false or both 'p0' is false and 'p1' and in 2 steps, at some point, 'p0' will hold",Precedence,0.04049919173121452,0.2636733651161194,0.17311375650266805,6 +X(p3 | (p2 U p1)),"In the next step, either 'p3' or 'p2' until 'p1'",(X (p3 | (p1 U p2))),"In the next step, either 'p3' or 'p1' until 'p2'",BadStateQuantification,0.046349335461854935,0.5506623983383179,0.31848904956132174,4 +Fp0 U (p3 | X(p2 & p3)),"At some point, 'p0' will hold until either 'p3' or in the next step, both 'p2' and 'p3'",((F p0) U (p3 | ((X p2) & p3))),"At some point, 'p0' will hold until either 'p3' or both in the next step, 'p2' and 'p3'",BadStateIndex,0.06353627145290375,0.4033069312572479,0.24506637702385584,6 +(((p1 | p3) U (! p3)) & (G ((p1 | p3) -> (! (! p3))))),"Both at least one of 'p1' or 'p3' hold until 'p3' is false and at all times, if at least one of 'p1' or 'p3', then not 'p3' is false",(((p1 | p3) U (! p3)) & ((p1 | p3) -> (! (! p3)))),"Both at least one of 'p1' or 'p3' hold until 'p3' is false and if at least one of 'p1' or 'p3', then not 'p3' is false",ImplicitG,0.06696542352437973,0.38150396943092346,0.22889351099729538,5 +FG((p3 | G!p1 | Gp0) & (p3 | F(!p0 | p1))),"Eventually, always maintain both either 'p3' or 'p1' will never occur or 'p0' must always hold and either 'p3' or eventually, either 'p0' is false or 'p1'",(F (G (((p3 | (G (! p1))) | (G p0)) & (p3 | ((! p0) | p1))))),"Eventually, always maintain both either 'p3' or 'p1' will never occur or 'p0' must always hold and either 'p3' or either 'p0' is false or 'p1'",ImplicitF,0.10394419729709625,0.2454671561717987,0.1904296338558197,5 +((((X p3) | p1) U (! p1)) & (G (! (((X p3) | p1) & (! p1))))),"Both either in the next step, 'p3' or 'p1' hold until 'p1' is false and it is never the case that both either in the next step, 'p3' or 'p1' and 'p1' is false",(((((X p3) | p1) U (! p1)) | (G ((X p3) | p1))) & (G (! (((X p3) | p1) & (! p1))))),"Both either in the next step, 'p3' or 'p1' hold until 'p1' is false or always have either in the next step, 'p3' or 'p1' and it is never the case that both either in the next step, 'p3' or 'p1' and 'p1' is false",WeakU,0.11572763323783875,0.4223088324069977,0.2547943741083145,4 +((! p2) U ((! (! p2)) | (p0 & p2))),'p2' is false until either 'p2' or both 'p0' and 'p2',((! p2) U (p0 & p2)),'p2' is false until both 'p0' and 'p2',ExclusiveU,0.1634880155324936,0.2883411943912506,0.2259146049618721,2 +(p3 U Gp0) U X(p2 U Xp2),"'p3' until 'p0' must always hold, and this continues until in the next step, 'p2' until in the next step, 'p2'",((p3 U (G p0)) U (F (p2 U (X p2)))),"'p3' until 'p0' must always hold, and this continues until eventually, 'p2' until in the next step, 'p2'",OtherImplicit,0.1733333170413971,0.27758660912513733,0.21240630373358727,4 +(p3 U (p0 U p1)) U (p3 U p0),"'p3' until 'p0' until 'p1', and this continues until 'p3' until 'p0'",(((p3 U (p0 U p1)) U p3) U p0),"'p3' until 'p0' until 'p1' until 'p3', and this continues until 'p0'",Precedence,0.041355639696121216,0.31306329369544983,0.20924533531069756,4 +p2 | (p0 U p3),Either 'p2' or 'p0' until 'p3',(p2 | (p3 U p0)),Either 'p2' or 'p3' until 'p0',BadStateQuantification,0.04961862415075302,0.4655517041683197,0.3165608998388052,4 +((G p1) U (((G p0) -> p1) & (G ((G p0) -> p3)))),"'p1' must always hold until both if 'p0' must always hold, then 'p1' holds and at all times, if 'p0' must always hold, then 'p3' holds",((G p1) U ((p0 -> p1) & (G ((G p0) -> p3)))),"'p1' must always hold until both if 'p0' holds, then 'p1' holds and at all times, if 'p0' must always hold, then 'p3' holds",ImplicitG,0.07011505961418152,0.6801282167434692,0.25602786242961884,5 +(X (((X p3) | p2) & ((! p1) & p2))),"In the next step, both either in the next step, 'p3' or 'p2' and both 'p1' is false and 'p2'",((X ((X p3) | p2)) & ((! p1) & p2)),"Both in the next step, either in the next step, 'p3' or 'p2' and both 'p1' is false and 'p2'",BadStateIndex,0.08070292323827744,0.31975439190864563,0.20022865757346153,2 +((F p0) U (((X p2) & p0) & (F ((F p1) -> p3)))),"At some point, 'p0' will hold until both in the next step, 'p2' and 'p0' and eventually, if at some point, 'p1' will hold, then 'p3' holds",((F p0) U (((X p2) & p0) & ((F p1) -> p3))),"At some point, 'p0' will hold until both in the next step, 'p2' and 'p0' and if at some point, 'p1' will hold, then 'p3' holds",ImplicitF,0.10425001382827759,0.38347673416137695,0.23939508944749832,6 +((((X p2) | p1) U (! p3)) & (G (! (((X p2) | p1) & (! p3))))),"Both either in the next step, 'p2' or 'p1' hold until 'p3' is false and it is never the case that both either in the next step, 'p2' or 'p1' and 'p3' is false",(((((X p2) | p1) U (! p3)) | (G ((X p2) | p1))) & (G (! (((X p2) | p1) & (! p3))))),"Both either in the next step, 'p2' or 'p1' hold until 'p3' is false or always have either in the next step, 'p2' or 'p1' and it is never the case that both either in the next step, 'p2' or 'p1' and 'p3' is false",WeakU,0.11577607691287994,0.4322969913482666,0.24379653632640838,5 +(p2 U ((! p2) & ((! p1) | p2))),'p2' until both 'p2' is false and either 'p1' is false or 'p2',(p2 U ((! p1) | p2)),'p2' until either 'p1' is false or 'p2',ExclusiveU,0.16365621984004974,0.5095543265342712,0.2741558313369751,5 +(Fp0 & (p0 U p3)) U p1,"Both at some point, 'p0' will hold and 'p0' until 'p3' hold until 'p1'",((F p0) & (p0 U p3)),"Both at some point, 'p0' will hold and 'p0' until 'p3'",OtherImplicit,0.18703140318393707,0.36475157737731934,0.24695269763469696,5 +(Fp1 U p2) U (p3 U p1),"At some point, 'p1' will hold until 'p2', and this continues until 'p3' until 'p1'",((((F p1) U p2) U p3) U p1),"At some point, 'p1' will hold until 'p2' until 'p3', and this continues until 'p1'",Precedence,0.04229610413312912,0.5763116478919983,0.29718530774116514,5 +((p3 U p0) U (p1 U p2)) U p3,"'p3' until 'p0' until 'p1' until 'p2', and this continues until 'p3'",(p3 U ((p3 U p0) U (p1 U p2))),"'p3' until 'p3' until 'p0', and this continues until 'p1' until 'p2'",BadStateQuantification,0.049981262534856796,0.7916196584701538,0.3216691827401519,4 +((F p3) U (((! p3) & p2) & (G ((G p3) -> p2)))),"At some point, 'p3' will hold until both 'p3' is false and 'p2' and at all times, if 'p3' must always hold, then 'p2' holds",((F p3) U (((! p3) & p2) & ((G p3) -> p2))),"At some point, 'p3' will hold until both 'p3' is false and 'p2' and if 'p3' must always hold, then 'p2' holds",ImplicitG,0.07021810859441757,0.3413970470428467,0.22759981006383895,5 +(X (((! p3) | p0) & ((X p2) & p3))),"In the next step, both either 'p3' is false or 'p0' and both in the next step, 'p2' and 'p3'",((X ((! p3) | p0)) & ((X p2) & p3)),"Both in the next step, either 'p3' is false or 'p0' and both in the next step, 'p2' and 'p3'",BadStateIndex,0.08377265185117722,0.348086953163147,0.2159298025071621,2 +(((X p1) | p3) U ((F p0) & (F p1))),"Either in the next step, 'p1' or 'p3' hold until both at some point, 'p0' will hold and at some point, 'p1' will hold",(((X p1) | p3) U ((F p0) & p1)),"Either in the next step, 'p1' or 'p3' hold until both at some point, 'p0' will hold and 'p1'",ImplicitF,0.10776412487030029,0.22966280579566956,0.16871346533298492,2 +(((X p1) | p0) U (((X p1) | p0) -> (p1 | p3))),"Either in the next step, 'p1' or 'p0' hold until if either in the next step, 'p1' or 'p0', then at least one of 'p1' or 'p3'",((((X p1) | p0) U (((X p1) | p0) -> (p1 | p3))) | (G ((X p1) | p0))),"Either in the next step, 'p1' or 'p0' hold until if either in the next step, 'p1' or 'p0', then at least one of 'p1' or 'p3' or always have either in the next step, 'p1' or 'p0'",WeakU,0.11592033505439758,0.22778265178203583,0.1908400058746338,4 +((p2 -> p3) U ((p2 -> p3) -> (p2 -> p3))),"If 'p2' holds, then 'p3' holds until if 'p2' holds, then 'p3' holds, then if 'p2' holds, then 'p3' holds",((p2 -> p3) U (p2 -> p3)),"If 'p2' holds, then 'p3' holds until if 'p2' holds, then 'p3' holds",ExclusiveU,0.16937156021595,0.3729923367500305,0.27118194848299026,2 +GFp1 U p1,'p1' will happen infinitely often until 'p1',(G (F p1)),'p1' will happen infinitely often,OtherImplicit,0.19305652379989624,0.6683191061019897,0.49543946981430054,3 +(Gp2 U p1) U (p0 U Xp0),"'p2' must always hold until 'p1', and this continues until 'p0' until in the next step, 'p0'",((((G p2) U p1) U p0) U (X p0)),"'p2' must always hold until 'p1' until 'p0', and this continues until in the next step, 'p0'",Precedence,0.04280916601419449,0.3918285071849823,0.22731567472219466,5 +(((G p2) -> p3) U ((! ((G p2) -> p3)) | (p3 & p1))),"If 'p2' must always hold, then 'p3' holds until either 'p2' must always hold, but not 'p3' or both 'p3' and 'p1'",(((G p2) -> p3) U (G ((! ((G p2) -> p3)) | (p3 & p1)))),"If 'p2' must always hold, then 'p3' holds until always have either 'p2' must always hold, but not 'p3' or both 'p3' and 'p1'",BadStateQuantification,0.058982182294130325,0.3857884705066681,0.19858889964719614,6 +((((! p0) | p2) U (G p1)) & (G (((! p0) | p2) -> (! (G p1))))),"Both either 'p0' is false or 'p2' hold until 'p1' must always hold and at all times, if either 'p0' is false or 'p2', then not 'p1' must always hold",((((! p0) | p2) U (G p1)) & (((! p0) | p2) -> (! (G p1)))),"Both either 'p0' is false or 'p2' hold until 'p1' must always hold and if either 'p0' is false or 'p2', then not 'p1' must always hold",ImplicitG,0.0703347846865654,0.34491488337516785,0.2206909939646721,5 +(p3 U p2) U X(p0 & p2),"'p3' until 'p2', and this continues until in the next step, both 'p0' and 'p2'",((p3 U p2) U ((X p0) & p2)),"'p3' until 'p2', and this continues until both in the next step, 'p0' and 'p2'",BadStateIndex,0.08811672776937485,0.4199155271053314,0.24801081977784634,4 +(((F p1) & p0) U ((p3 -> p2) & (F ((G p1) -> p0)))),"Both at some point, 'p1' will hold and 'p0' hold until both if 'p3', then 'p2' holds and eventually, if 'p1' must always hold, then 'p0' holds",(((F p1) & p0) U ((p3 -> p2) & ((G p1) -> p0))),"Both at some point, 'p1' will hold and 'p0' hold until both if 'p3', then 'p2' holds and if 'p1' must always hold, then 'p0' holds",ImplicitF,0.10813377052545547,0.28540918231010437,0.17905174816648164,6 +((p3 | p1) U (((F p0) & p3) & (G ((! p1) -> p0)))),"At least one of 'p3' or 'p1' hold until both at some point, 'p0' will hold and 'p3' and it is always the case that 'p0' unless 'p1'",(((p3 | p1) U (((F p0) & p3) & (G ((! p1) -> p0)))) | (G (p3 | p1))),"Either at least one of 'p3' or 'p1' hold until both at some point, 'p0' will hold and 'p3' and it is always the case that 'p0' unless 'p1' or always have either 'p3' or 'p1'",WeakU,0.11668875068426132,0.42371803522109985,0.20582491053002222,7 +(((G p1) & p3) U (((G p1) & p3) -> ((G p3) -> p1))),"Both 'p1' must always hold and 'p3' hold until if both 'p1' must always hold and 'p3', then if 'p3' must always hold, then 'p1'",(((G p1) & p3) U ((G p3) -> p1)),"Both 'p1' must always hold and 'p3' hold until if 'p3' must always hold, then 'p1'",ExclusiveU,0.1726030558347702,0.1726030558347702,0.1726030558347702,1 +F(p0 & p1) U p1,"Eventually, both 'p0' and 'p1' will be true simultaneously until 'p1'",(F (p0 & p1)),"Eventually, both 'p0' and 'p1' will be true simultaneously",OtherImplicit,0.19820520281791687,0.6248161196708679,0.4115106612443924,2 +XX((p2 & X!p2) | (!p2 & Xp2)),"In 2 steps, either both 'p2' and in the next step, 'p2' is false or both 'p2' is false and in the next step, 'p2'",(X (X (((p2 & (X (! p2))) | (! p2)) & (X p2)))),"In 2 steps, both either both 'p2' and in the next step, 'p2' is false or 'p2' is false and in the next step, 'p2'",Precedence,0.042971137911081314,0.2920770049095154,0.16071786110599837,3 +p1 | (p3 U p2),Either 'p1' or 'p3' until 'p2',(p1 | (p2 U p3)),Either 'p1' or 'p2' until 'p3',BadStateQuantification,0.06289363652467728,0.7450254559516907,0.42813421972095966,4 +(((G p2) -> p1) U (((F p3) -> p1) & (G (p2 -> p3)))),"If 'p2' must always hold, then 'p1' holds until both if at some point, 'p3' will hold, then 'p1' holds and at all times, if 'p2' holds, then 'p3' holds",(((G p2) -> p1) U (((F p3) -> p1) & (p2 -> p3))),"If 'p2' must always hold, then 'p1' holds until both if at some point, 'p3' will hold, then 'p1' holds and if 'p2' holds, then 'p3' holds",ImplicitG,0.07125857472419739,0.28554272651672363,0.16359106451272964,7 +XX(Gp0 | F!p3),"In 2 steps, either 'p0' must always hold or eventually, not 'p3'",(X (X (X ((G p0) | (F (! p3)))))),"In 3 steps, either 'p0' must always hold or eventually, not 'p3'",BadStateIndex,0.09606853872537613,0.3544589877128601,0.2574476197361946,5 +(((F p0) U ((F p3) -> p1)) & (G (! ((F p0) & ((F p3) -> p1))))),"Both at some point, 'p0' will hold until if at some point, 'p3' will hold, then 'p1' and it is never the case that both at some point, 'p0' will hold and if at some point, 'p3' will hold, then 'p1'",(((F p0) U ((F p3) -> p1)) & (G (! ((F p0) & (p3 -> p1))))),"Both at some point, 'p0' will hold until if at some point, 'p3' will hold, then 'p1' and it is never the case that both at some point, 'p0' will hold and if 'p3', then 'p1'",ImplicitF,0.11162640154361725,0.31764358282089233,0.2133680209517479,6 +(((p1 | p2) U ((G p3) | p1)) & (G (! ((p1 | p2) & ((G p3) | p1))))),Both at least one of 'p1' or 'p2' hold until either 'p3' must always hold or 'p1' and it is never the case that both at least one of 'p1' or 'p2' and either 'p3' must always hold or 'p1',((((p1 | p2) U ((G p3) | p1)) | (G (p1 | p2))) & (G (! ((p1 | p2) & ((G p3) | p1))))),Both either at least one of 'p1' or 'p2' hold until either 'p3' must always hold or 'p1' or always have either 'p1' or 'p2' and it is never the case that both at least one of 'p1' or 'p2' and either 'p3' must always hold or 'p1',WeakU,0.12026950716972351,0.3468334674835205,0.19986751079559326,5 +((p1 -> p0) U ((! (p1 -> p0)) & ((! p0) | p1))),"If 'p1' holds, then 'p0' holds until both 'p1', but not 'p0' and either 'p0' is false or 'p1'",((p1 -> p0) U ((! p0) | p1)),"If 'p1' holds, then 'p0' holds until either 'p0' is false or 'p1'",ExclusiveU,0.1887683868408203,0.542787492275238,0.2860000729560852,5 +X((p1 & G!p0) | (!p1 & Fp0)),"In the next step, either both 'p1' and 'p0' will never occur or both 'p1' is false and at some point, 'p0' will hold",(X (((p1 & (G (! p0))) | (! p1)) & (F p0))),"In the next step, both either both 'p1' and 'p0' will never occur or 'p1' is false and at some point, 'p0' will hold",Precedence,0.04458479955792427,0.44826582074165344,0.2596312053501606,5 +XX(p0 U p1),"In 2 steps, 'p0' until 'p1'",(X (X (p1 U p0))),"In 2 steps, 'p1' until 'p0'",BadStateQuantification,0.07203678041696548,0.4469376802444458,0.2801337894052267,4 +(((! p3) & p2) U (p1 & (G ((! p2) | p3)))),Both 'p3' is false and 'p2' hold until both 'p1' and always have either 'p2' is false or 'p3',(((! p3) & p2) U (p1 & ((! p2) | p3))),Both 'p3' is false and 'p2' hold until both 'p1' and either 'p2' is false or 'p3',ImplicitG,0.07618267834186554,0.38971635699272156,0.19905509303013483,6 +Fp1 U XXp3,"At some point, 'p1' will hold until in 2 steps, 'p3'",((F p1) U (X (X (X p3)))),"At some point, 'p1' will hold until in 3 steps, 'p3'",BadStateIndex,0.09645338356494904,0.4187382757663727,0.3098797984421253,4 +(((X p1) -> p2) U (((G p0) -> p2) & (F ((G p2) | p1)))),"If in the next step, 'p1', then 'p2' holds until both if 'p0' must always hold, then 'p2' holds and eventually, either 'p2' must always hold or 'p1'",(((X p1) -> p2) U (((G p0) -> p2) & ((G p2) | p1))),"If in the next step, 'p1', then 'p2' holds until both if 'p0' must always hold, then 'p2' holds and either 'p2' must always hold or 'p1'",ImplicitF,0.11210452020168304,0.3786934018135071,0.1797017753124237,7 +Gp1 U (((p0 & Xp1) | (!p0 & X!p1)) U p3),"'p1' must always hold until either both 'p0' and in the next step, 'p1' or both 'p0' is false and in the next step, 'p1' is false hold until 'p3'",(((G p1) U (((p0 & (X p1)) | ((! p0) & (X (! p1)))) U p3)) | (G (G p1))),"Either 'p1' must always hold until either both 'p0' and in the next step, 'p1' or both 'p0' is false and in the next step, 'p1' is false hold until 'p3' or at all times, 'p1'",WeakU,0.123885877430439,0.38047879934310913,0.2639784500002861,5 +(p2 U Xp1) U (p2 U p1),"'p2' until in the next step, 'p1', and this continues until 'p2' until 'p1'",(((p2 U (X p1)) U p2) U p1),"'p2' until in the next step, 'p1' until 'p2', and this continues until 'p1'",Precedence,0.04481009766459465,0.4488793909549713,0.23276558394233385,3 +((p0 & p3) U ((! (p0 & p3)) | (p2 -> p3))),"Both 'p0' and 'p3' hold until either not both 'p0' and 'p3' or if 'p2', then 'p3' holds",((p0 & p3) U (p2 -> p3)),"Both 'p0' and 'p3' hold until if 'p2', then 'p3' holds",ExclusiveU,0.1904260814189911,0.6873605847358704,0.3713172674179077,3 +X(p1 U p0),"In the next step, 'p1' until 'p0'",(X (p0 U p1)),"In the next step, 'p0' until 'p1'",BadStateQuantification,0.08421752601861954,0.5432999134063721,0.344382606446743,3 +((p3 U (F p1)) & (G (p3 -> (! (F p1))))),"Both 'p3' until at some point, 'p1' will hold and at all times, if 'p3', then not at some point, 'p1' will hold",((p3 U (F p1)) & (p3 -> (! (F p1)))),"Both 'p3' until at some point, 'p1' will hold and if 'p3', then not at some point, 'p1' will hold",ImplicitG,0.08505173772573471,0.4061712324619293,0.289356429874897,5 +(X (((X p1) -> p2) & (p3 -> p0))),"In the next step, both if in the next step, 'p1', then 'p2' holds and if 'p3' holds, then 'p0' holds",((X ((X p1) -> p2)) & (p3 -> p0)),"Both in the next step, if in the next step, 'p1', then 'p2' holds and if 'p3' holds, then 'p0' holds",BadStateIndex,0.09651118516921997,0.3920551836490631,0.287649542093277,3 +((p1 | p0) U (((F p2) & p3) & (F p1))),"At least one of 'p1' or 'p0' hold until both at some point, 'p2' will hold and 'p3' and at some point, 'p1' will hold",((p1 | p0) U (((F p2) & p3) & p1)),"At least one of 'p1' or 'p0' hold until both at some point, 'p2' will hold and 'p3' and 'p1'",ImplicitF,0.11937731504440308,0.6894240379333496,0.27272829413414,6 +(Gp3 U Xp0) | Fp1,"Either 'p3' must always hold until in the next step, 'p0' or at some point, 'p1' will hold",((((G p3) U (X p0)) | (G (G p3))) | (F p1)),"Either 'p3' must always hold until in the next step, 'p0' or at all times, 'p3' or at some point, 'p1' will hold",WeakU,0.1245269775390625,0.5009090900421143,0.2504582405090332,5 +(((X p2) | p0) U ((! ((X p2) | p0)) | (p3 & p2))),"Either in the next step, 'p2' or 'p0' hold until either neither in the next step, 'p2' nor 'p0' or both 'p3' and 'p2'",((((X p2) | p0) U (! ((X p2) | p0))) | (p3 & p2)),"Either in the next step, 'p2' or 'p0' hold until neither in the next step, 'p2' nor 'p0' or both 'p3' and 'p2'",Precedence,0.04540524631738663,0.5387542247772217,0.2179892659187317,5 +((p0 U (F p1)) & (G (p0 -> (! (F p1))))),"Both 'p0' until at some point, 'p1' will hold and at all times, if 'p0', then not at some point, 'p1' will hold",((p0 U (F p1)) & (p0 -> (! (F p1)))),"Both 'p0' until at some point, 'p1' will hold and if 'p0', then not at some point, 'p1' will hold",ImplicitG,0.08571310341358185,0.3936465382575989,0.23511863748232523,6 +((((X p2) | p3) U ((F p2) | p3)) & (G (! (((X p2) | p3) & ((F p2) | p3))))),"Both either in the next step, 'p2' or 'p3' hold until either at some point, 'p2' will hold or 'p3' and it is never the case that both either in the next step, 'p2' or 'p3' and either at some point, 'p2' will hold or 'p3'",((((X p2) | p3) U ((F p2) | p3)) & (F (! (((X p2) | p3) & ((F p2) | p3))))),"Both either in the next step, 'p2' or 'p3' hold until either at some point, 'p2' will hold or 'p3' and eventually, it will not be the case that both either in the next step, 'p2' or 'p3' and either at some point, 'p2' will hold or 'p3'",BadStateQuantification,0.08660925924777985,0.4033510386943817,0.23819689825177193,4 +p0 & X(p1 & (!p0 | !p2)),"Both 'p0' and in the next step, both 'p1' and not both 'p0' and 'p2'",(p0 & ((X p1) & ((! p0) | (! p2)))),"Both 'p0' and both in the next step, 'p1' and not both 'p0' and 'p2'",BadStateIndex,0.09772362560033798,0.5759117007255554,0.37638361006975174,3 +(X (((F p1) -> p2) & (F p3))),"In the next step, both if at some point, 'p1' will hold, then 'p2' holds and at some point, 'p3' will hold",(X (((F p1) -> p2) & p3)),"In the next step, both if at some point, 'p1' will hold, then 'p2' holds and 'p3'",ImplicitF,0.12310720980167389,0.5043647885322571,0.27577129006385803,4 +(((p3 | p1) U (! p2)) & (G (! ((p3 | p1) & (! p2))))),Both at least one of 'p3' or 'p1' hold until 'p2' is false and it is never the case that both at least one of 'p3' or 'p1' and 'p2' is false,((((p3 | p1) U (! p2)) | (G (p3 | p1))) & (G (! ((p3 | p1) & (! p2))))),Both either at least one of 'p3' or 'p1' hold until 'p2' is false or always have either 'p3' or 'p1' and it is never the case that both at least one of 'p3' or 'p1' and 'p2' is false,WeakU,0.12589393556118011,0.32602033019065857,0.2151953637599945,5 +((!p0 | p3) & XXp0) | (p0 & !p3 & XX!p0),"Either both either 'p0' is false or 'p3' and in 2 steps, 'p0' or both 'p0' and 'p3' is false and in 2 steps, 'p0' is false",(((((! p0) | p3) & (X (X p0))) | (p0 & (! p3))) & (X (X (! p0)))),"Both either both either 'p0' is false or 'p3' and in 2 steps, 'p0' or both 'p0' and 'p3' is false and in 2 steps, 'p0' is false",Precedence,0.04554428532719612,0.28295162320137024,0.1624156596759955,3 +X(p0 U p3),"In the next step, 'p0' until 'p3'",(X (p3 U p0)),"In the next step, 'p3' until 'p0'",BadStateQuantification,0.0900951400399208,0.4956358075141907,0.33615104605754215,3 +(((G p3) -> p2) U ((! ((G p3) -> p2)) & (p0 -> p3))),"If 'p3' must always hold, then 'p2' holds until both 'p3' must always hold, but not 'p2' and if 'p0' holds, then 'p3' holds",(((G p3) -> p2) U ((! (p3 -> p2)) & (p0 -> p3))),"If 'p3' must always hold, then 'p2' holds until both 'p3', but not 'p2' and if 'p0' holds, then 'p3' holds",ImplicitG,0.09025835990905762,0.34275302290916443,0.2114135151108106,6 +(X (((F p0) | p1) & ((X p1) & p2))),"In the next step, both either at some point, 'p0' will hold or 'p1' and both in the next step, 'p1' and 'p2'",((X ((F p0) | p1)) & ((X p1) & p2)),"Both in the next step, either at some point, 'p0' will hold or 'p1' and both in the next step, 'p1' and 'p2'",BadStateIndex,0.1026991754770279,0.4575364887714386,0.31032146140933037,4 +(((G p3) -> p0) U (((F p2) -> p1) & (G (G p3)))),"If 'p3' must always hold, then 'p0' holds until both if at some point, 'p2' will hold, then 'p1' holds and at all times, 'p3'",(((G p3) -> p0) U ((p2 -> p1) & (G (G p3)))),"If 'p3' must always hold, then 'p0' holds until both if 'p2' holds, then 'p1' holds and at all times, 'p3'",ImplicitF,0.12346068769693375,0.337312251329422,0.2065150427321593,6 +((((! p2) | p0) U p0) & (G (((! p2) | p0) -> (! p0)))),"Both either 'p2' is false or 'p0' hold until 'p0' and if either 'p2' is false or 'p0', then not 'p0' is always true",(((((! p2) | p0) U p0) | (G ((! p2) | p0))) & (G (((! p2) | p0) -> (! p0)))),"Both either 'p2' is false or 'p0' hold until 'p0' or always have either 'p2' is false or 'p0' and if either 'p2' is false or 'p0', then not 'p0' is always true",WeakU,0.1286027878522873,0.37017664313316345,0.24930460453033448,5 +(Fp3 U p0) U (p3 U Gp1),"At some point, 'p3' will hold until 'p0', and this continues until 'p3' until 'p1' must always hold",((((F p3) U p0) U p3) U (G p1)),"At some point, 'p3' will hold until 'p0' until 'p3', and this continues until 'p1' must always hold",Precedence,0.045582808554172516,0.35584917664527893,0.23125888283054033,6 +(((p1 & p0) U (F p2)) & (G ((p1 & p0) -> (! (F p2))))),"Both 'p1' and 'p0' hold until at some point, 'p2' will hold and at all times, if both 'p1' and 'p0', then not at some point, 'p2' will hold",(((p1 & p0) U (F p2)) & ((p1 & p0) -> (! (F p2)))),"Both 'p1' and 'p0' hold until at some point, 'p2' will hold and if both 'p1' and 'p0', then not at some point, 'p2' will hold",ImplicitG,0.09037087857723236,0.40757039189338684,0.26233986020088196,6 +Gp0 U (!p0 | (p1 & p3)),'p0' must always hold until either 'p0' is false or both 'p1' and 'p3',((G p0) U (G ((! p0) | (p1 & p3)))),'p0' must always hold until always have either 'p0' is false or both 'p1' and 'p3',BadStateQuantification,0.09152600914239883,0.41693347692489624,0.23366023153066634,5 +(X (((G p1) & p3) & (! p0))),"In the next step, both 'p1' must always hold and 'p3' and 'p0' is false",((X ((G p1) & p3)) & (! p0)),"Both in the next step, both 'p1' must always hold and 'p3' and 'p0' is false",BadStateIndex,0.10643599182367325,0.5495067834854126,0.30764263309538364,4 +((G p1) U ((! (G p1)) & ((F p1) | p3))),"'p1' must always hold until both it is not the case that 'p1' must always hold and either at some point, 'p1' will hold or 'p3'",(((G p1) U ((! (G p1)) & ((F p1) | p3))) | (G (G p1))),"Either 'p1' must always hold until both it is not the case that 'p1' must always hold and either at some point, 'p1' will hold or 'p3' or at all times, 'p1'",WeakU,0.12948283553123474,0.5531959533691406,0.2667570263147354,5 +(((X p3) -> p2) U ((! ((X p3) -> p2)) | ((F p3) & p0))),"If in the next step, 'p3', then 'p2' holds until either in the next step, 'p3', but not 'p2' or both at some point, 'p3' will hold and 'p0'",(((X p3) -> p2) U ((! ((X p3) -> p2)) | (p3 & p0))),"If in the next step, 'p3', then 'p2' holds until either in the next step, 'p3', but not 'p2' or both 'p3' and 'p0'",ImplicitF,0.13020284473896027,0.49387961626052856,0.260651171207428,6 +(p3 U Xp0) U (p2 U p1),"'p3' until in the next step, 'p0', and this continues until 'p2' until 'p1'",(((p3 U (X p0)) U p2) U p1),"'p3' until in the next step, 'p0' until 'p2', and this continues until 'p1'",Precedence,0.04631377384066582,0.42235496640205383,0.23479179944843054,4 +X(FGp3 U ((p3 U p1) U p2)),"In the next step, eventually, 'p3' will always be true until 'p3' until 'p1', and this continues until 'p2'",(X ((F (G p3)) U (F ((p3 U p1) U p2)))),"In the next step, eventually, 'p3' will always be true until eventually, 'p3' until 'p1', and this continues until 'p2'",BadStateQuantification,0.09275685995817184,0.4013374447822571,0.2730415426194668,6 +((! p3) U (p1 & (G ((F p3) -> p2)))),"'p3' is false until both 'p1' and at all times, if at some point, 'p3' will hold, then 'p2' holds",((! p3) U (p1 & ((F p3) -> p2))),"'p3' is false until both 'p1' and if at some point, 'p3' will hold, then 'p2' holds",ImplicitG,0.09401790797710419,0.6226906180381775,0.26009408065250944,7 +(X ((G p0) & (p1 -> p3))),"In the next step, both 'p0' must always hold and if 'p1' holds, then 'p3' holds",((X (G p0)) & (p1 -> p3)),"Both in the next step, 'p0' must always hold and if 'p1' holds, then 'p3' holds",BadStateIndex,0.10660488158464432,0.5265659689903259,0.28082766085863115,5 +((G p2) U (((F p3) & p2) & (F ((G p2) & p1)))),"'p2' must always hold until both at some point, 'p3' will hold and 'p2' and eventually 'p1' will occur, and from then on 'p2' will always hold",(((G p2) U (((F p3) & p2) & (F ((G p2) & p1)))) | (G (G p2))),"Either 'p2' must always hold until both at some point, 'p3' will hold and 'p2' and eventually 'p1' will occur, and from then on 'p2' will always hold or at all times, 'p2'",WeakU,0.13080976903438568,0.6666805744171143,0.32677003368735313,4 +((((F p3) & p0) U ((! p1) -> p0)) & (G (((F p3) & p0) -> (! ((! p1) -> p0))))),"Both at some point, 'p3' will hold and 'p0' hold until 'p0' unless 'p1' and if both at some point, 'p3' will hold and 'p0', then not 'p0' unless 'p1' is always true",((((F p3) & p0) U ((! p1) -> p0)) & (G ((p3 & p0) -> (! ((! p1) -> p0))))),"Both at some point, 'p3' will hold and 'p0' hold until 'p0' unless 'p1' and if both 'p3' and 'p0', then not 'p0' unless 'p1' is always true",ImplicitF,0.1309593915939331,0.3799649775028229,0.23854931071400642,4 diff --git a/src/app.py b/src/app.py index 7e5fd78..f583dc1 100644 --- a/src/app.py +++ b/src/app.py @@ -4,13 +4,16 @@ from ltlnode import parse_ltl_string, SUPPORTED_SYNTAXES from codebook import getAllApplicableMisconceptions -import os -import json -import sys -from feedbackgenerator import FeedbackGenerator -from logger import Logger -import ast -import exerciseprocessor +import os +import json +import sys +import csv +from difflib import SequenceMatcher +from markupsafe import Markup, escape +from feedbackgenerator import FeedbackGenerator +from logger import Logger +import ast +import exerciseprocessor import exercisebuilder import random import spotutils @@ -30,7 +33,7 @@ get_course_students, get_exercises_for_course, ) -from modelroutes import modelroutes +from modelroutes import modelroutes @@ -59,11 +62,102 @@ def enforce_https_in_production(): secure_url = request.url.replace("http://", "https://", 1) return redirect(secure_url, code=301) -answer_logger = Logger() - - -def getUserName(): - return current_user.username +answer_logger = Logger() +PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) + +BENCHMARK_FILES = { + "near": os.path.join(PROJECT_ROOT, "semantic_benchmark_near_eng.csv"), + "far": os.path.join(PROJECT_ROOT, "semantic_benchmark_far_eng.csv"), +} + + +def _load_benchmark_rows(): + data = {} + for label, path in BENCHMARK_FILES.items(): + if not os.path.exists(path): + data[label] = [] + continue + + with open(path, newline='') as csvfile: + reader = csv.DictReader(csvfile) + rows = [] + for idx, row in enumerate(reader): + rows.append({ + "row_index": idx, + "ltl_formula": row.get("ltl_formula", ""), + "english_translation": row.get("english_translation", ""), + "closest_mutant_formula": row.get("closest_mutant_formula", ""), + "closest_mutant_english": row.get("closest_mutant_english", ""), + "closest_mutant_misconception": row.get("closest_mutant_misconception", ""), + "closest_distance": float(row.get("closest_distance", 0) or 0), + "max_distance": float(row.get("max_distance", 0) or 0), + "avg_distance": float(row.get("avg_distance", 0) or 0), + "num_mutants": int(float(row.get("num_mutants", 0) or 0)), + "dataset": label + }) + data[label] = rows + return data + + +BENCHMARK_ROWS = _load_benchmark_rows() + + +def _highlight_differences(text_a, text_b): + tokens_a = text_a.split() + tokens_b = text_b.split() + matcher = SequenceMatcher(None, tokens_a, tokens_b) + opcodes = matcher.get_opcodes() + + def build(tokens, use_a): + parts = [] + for tag, i1, i2, j1, j2 in opcodes: + segment = tokens[i1:i2] if use_a else tokens[j1:j2] + if not segment: + continue + escaped = " ".join(escape(token) for token in segment) + if tag == 'equal': + parts.append(escaped) + else: + parts.append(f"{escaped}") + return Markup(" ".join(parts)) + + return build(tokens_a, True), build(tokens_b, False) + + +def _next_index_for_dataset(user_id, dataset): + rows = BENCHMARK_ROWS.get(dataset, []) + rated = answer_logger.getRatedSentencePairIndices(user_id, dataset) + for idx in range(len(rows)): + if idx not in rated: + return idx + return None + + +def _pick_next_pair(user_id): + options = [] + next_near = _next_index_for_dataset(user_id, "near") + next_far = _next_index_for_dataset(user_id, "far") + + if next_near is not None: + options.append(("near", next_near)) + if next_far is not None: + options.append(("far", next_far)) + + if not options: + return None, None, None + + if len(options) == 2: + dataset = random.choice(["near", "far"]) + index = next_near if dataset == "near" else next_far + else: + dataset, index = options[0] + + row = BENCHMARK_ROWS[dataset][index] + return dataset, index, row + + +def getUserName(): + return current_user.username def getUserId(): return current_user.id @@ -199,6 +293,90 @@ def ltl_to_english(): except Exception as e: return jsonify({"error": "Failed to translate formula.", "details": str(e)}), 400 + +@app.route('/ltltoeng/rating', methods=['GET', 'POST']) +@login_required +def ltltoeng_rating(): + user_id = getUserName() + rated_near = answer_logger.getRatedSentencePairIndices(user_id, "near") + rated_far = answer_logger.getRatedSentencePairIndices(user_id, "far") + + total_count = len(BENCHMARK_ROWS.get("near", [])) + len(BENCHMARK_ROWS.get("far", [])) + done_count = len(rated_near) + len(rated_far) + progress = {"done": done_count, "total": total_count} + + if request.method == 'POST': + dataset = request.form.get('dataset', '') + row_index = request.form.get('row_index', '') + likert_value = request.form.get('likert') + awkward_flag = request.form.get('awkward_flag') == 'on' + awkward_notes = request.form.get('awkward_notes', '').strip() + unsure_flag = request.form.get('unsure') in ['1', 'on', 'true', 'True'] + + try: + row_index = int(row_index) + except ValueError: + return jsonify({"error": "Invalid rating payload"}), 400 + + likert_rating = None + if not unsure_flag: + try: + likert_rating = int(likert_value) + except (TypeError, ValueError): + return jsonify({"error": "Likert rating missing or invalid"}), 400 + if likert_rating < 1 or likert_rating > 7: + return jsonify({"error": "Likert rating must be between 1 and 7"}), 400 + + rows = BENCHMARK_ROWS.get(dataset) + if rows is None or row_index < 0 or row_index >= len(rows): + return jsonify({"error": "Unknown benchmark item"}), 400 + + pair = rows[row_index] + + answer_logger.logSentencePairRating({ + "user_id": user_id, + "dataset": dataset, + "row_index": row_index, + "base_english": pair["english_translation"], + "mutant_english": pair["closest_mutant_english"], + "base_ltl": pair["ltl_formula"], + "mutant_ltl": pair["closest_mutant_formula"], + "misconception": pair["closest_mutant_misconception"], + "likert_rating": likert_rating, + "awkward_flag": awkward_flag, + "awkward_notes": awkward_notes, + "unsure": unsure_flag, + "closest_distance": pair["closest_distance"], + "max_distance": pair["max_distance"], + "avg_distance": pair["avg_distance"], + "num_mutants": pair["num_mutants"], + }) + + return redirect('/ltltoeng/rating') + + dataset, row_index, pair = _pick_next_pair(user_id) + if pair is None: + return render_template( + 'ltltoeng_rating.html', + uid=user_id, + done=True, + progress=progress + ) + + highlighted_base, highlighted_mutant = _highlight_differences(pair["english_translation"], pair["closest_mutant_english"]) + + return render_template( + 'ltltoeng_rating.html', + uid=user_id, + done=False, + pair=pair, + dataset=dataset, + row_index=row_index, + highlighted_base=highlighted_base, + highlighted_mutant=highlighted_mutant, + progress=progress + ) + return jsonify({"formula": formula, "english": english}) diff --git a/src/logger.py b/src/logger.py index 344d747..f2b3654 100644 --- a/src/logger.py +++ b/src/logger.py @@ -1,15 +1,16 @@ -import datetime -from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean -from sqlalchemy.orm import sessionmaker -from sqlalchemy.ext.declarative import declarative_base -import os -from sqlalchemy import inspect -from sqlalchemy.orm import scoped_session, sessionmaker +import datetime +from sqlalchemy import create_engine, Column, Integer, String, DateTime, Boolean, Float +from sqlalchemy.orm import sessionmaker +from sqlalchemy.ext.declarative import declarative_base +import os +from sqlalchemy import inspect +from sqlalchemy.orm import scoped_session, sessionmaker -STUDENT_RESPONSE_TABLE = 'student_responses' -GENERATED_EXERCISE_TABLE = 'generated_exercise' -ENGLISH_LTL_TABLE = 'english_ltl_pairs' +STUDENT_RESPONSE_TABLE = 'student_responses' +GENERATED_EXERCISE_TABLE = 'generated_exercise' +ENGLISH_LTL_TABLE = 'english_ltl_pairs' +SENTENCE_PAIR_RATING_TABLE = 'sentence_pair_ratings' def get_db_uri(): @@ -63,31 +64,55 @@ class GeneratedExercise(Base): exerciseName = Column(String) -class EnglishLTLRating(Base): - __tablename__ = ENGLISH_LTL_TABLE - id = Column(Integer, primary_key=True) - english = Column(String) - ltl = Column(String) - comments = Column(String) - user_id = Column(String) - timestamp = Column(DateTime, default=datetime.datetime.utcnow) - - -class Logger: - def __init__(self): +class EnglishLTLRating(Base): + __tablename__ = ENGLISH_LTL_TABLE + id = Column(Integer, primary_key=True) + english = Column(String) + ltl = Column(String) + comments = Column(String) + user_id = Column(String) + timestamp = Column(DateTime, default=datetime.datetime.utcnow) + + +class SentencePairRating(Base): + __tablename__ = SENTENCE_PAIR_RATING_TABLE + id = Column(Integer, primary_key=True) + user_id = Column(String, index=True) + dataset = Column(String, index=True) # "near" or "far" + row_index = Column(Integer, index=True) + base_english = Column(String) + mutant_english = Column(String) + base_ltl = Column(String) + mutant_ltl = Column(String) + misconception = Column(String) + likert_rating = Column(Integer) + awkward_flag = Column(Boolean, default=False) + awkward_notes = Column(String) + closest_distance = Column(Float) + max_distance = Column(Float) + avg_distance = Column(Float) + num_mutants = Column(Integer) + created_at = Column(DateTime, default=datetime.datetime.utcnow) + + +class Logger: + def __init__(self): db_uri = get_db_uri() self.engine = create_engine(db_uri) Base.metadata.create_all(self.engine) - self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=True) - self.Session = scoped_session(self.session_factory) - - self.inspector = inspect(self.engine) - if STUDENT_RESPONSE_TABLE not in self.inspector.get_table_names(): - Base.metadata.tables[STUDENT_RESPONSE_TABLE].create(self.engine) - - if GENERATED_EXERCISE_TABLE not in self.inspector.get_table_names(): - Base.metadata.tables[GENERATED_EXERCISE_TABLE].create(self.engine) + self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=True) + self.Session = scoped_session(self.session_factory) + + self.inspector = inspect(self.engine) + if STUDENT_RESPONSE_TABLE not in self.inspector.get_table_names(): + Base.metadata.tables[STUDENT_RESPONSE_TABLE].create(self.engine) + + if GENERATED_EXERCISE_TABLE not in self.inspector.get_table_names(): + Base.metadata.tables[GENERATED_EXERCISE_TABLE].create(self.engine) + + if SENTENCE_PAIR_RATING_TABLE not in self.inspector.get_table_names(): + Base.metadata.tables[SENTENCE_PAIR_RATING_TABLE].create(self.engine) def record(self, log): with self.Session() as session: @@ -156,10 +181,10 @@ def recordGeneratedExercise(self, userId, exercise_data, exercise_name): - def recordEnglishLTLPair(self, e_l_pair): - - if not isinstance(e_l_pair, dict): - raise ValueError("e_l_pair should be a dictionary") + def recordEnglishLTLPair(self, e_l_pair): + + if not isinstance(e_l_pair, dict): + raise ValueError("e_l_pair should be a dictionary") if 'english' not in e_l_pair or 'ltl' not in e_l_pair: raise ValueError("e_l_pair should contain 'english' and 'ltl' keys") @@ -170,8 +195,79 @@ def recordEnglishLTLPair(self, e_l_pair): userId = e_l_pair['user_id'] - log = EnglishLTLRating(english=english, ltl=ltl, comments=comments, user_id=userId) - self.record(log) + log = EnglishLTLRating(english=english, ltl=ltl, comments=comments, user_id=userId) + self.record(log) + + def logSentencePairRating(self, rating_data): + """Insert or update a user's rating for a specific benchmark row.""" + required = ['user_id', 'dataset', 'row_index'] + for key in required: + if key not in rating_data: + raise ValueError(f"rating_data missing required key '{key}'") + + unsure = bool(rating_data.get('unsure')) + likert = rating_data.get('likert_rating') + if not unsure: + if not isinstance(likert, int) or likert < 1 or likert > 7: + raise ValueError("likert_rating must be an int between 1 and 7 for 7-point scale") + else: + likert = None + + with self.Session() as session: + existing = session.query(SentencePairRating).filter( + SentencePairRating.user_id == rating_data['user_id'], + SentencePairRating.dataset == rating_data['dataset'], + SentencePairRating.row_index == rating_data['row_index'] + ).one_or_none() + + if existing: + existing.likert_rating = likert + existing.awkward_flag = rating_data.get('awkward_flag', False) + existing.awkward_notes = rating_data.get('awkward_notes', '') + existing.base_english = rating_data.get('base_english', existing.base_english) + existing.mutant_english = rating_data.get('mutant_english', existing.mutant_english) + existing.base_ltl = rating_data.get('base_ltl', existing.base_ltl) + existing.mutant_ltl = rating_data.get('mutant_ltl', existing.mutant_ltl) + existing.misconception = rating_data.get('misconception', existing.misconception) + existing.closest_distance = rating_data.get('closest_distance', existing.closest_distance) + existing.max_distance = rating_data.get('max_distance', existing.max_distance) + existing.avg_distance = rating_data.get('avg_distance', existing.avg_distance) + existing.num_mutants = rating_data.get('num_mutants', existing.num_mutants) + else: + log = SentencePairRating( + user_id=rating_data['user_id'], + dataset=rating_data['dataset'], + row_index=rating_data['row_index'], + base_english=rating_data.get('base_english', ''), + mutant_english=rating_data.get('mutant_english', ''), + base_ltl=rating_data.get('base_ltl', ''), + mutant_ltl=rating_data.get('mutant_ltl', ''), + misconception=rating_data.get('misconception', ''), + likert_rating=likert, + awkward_flag=rating_data.get('awkward_flag', False), + awkward_notes=rating_data.get('awkward_notes', ''), + closest_distance=rating_data.get('closest_distance'), + max_distance=rating_data.get('max_distance'), + avg_distance=rating_data.get('avg_distance'), + num_mutants=rating_data.get('num_mutants') + ) + session.add(log) + + session.commit() + + def getRatedSentencePairIndices(self, user_id, dataset): + """Return a set of row indices already rated by the user for the dataset.""" + if not isinstance(user_id, str): + raise ValueError("user_id should be a string") + if not isinstance(dataset, str): + raise ValueError("dataset should be a string") + + with self.Session() as session: + rows = session.query(SentencePairRating.row_index).filter( + SentencePairRating.user_id == user_id, + SentencePairRating.dataset == dataset + ).all() + return {row[0] for row in rows} def getComplexity(self, userId): @@ -227,4 +323,4 @@ def getCompletedExercises(self, userId, exercise_names_and_counts): if answered >= question_count: completed.add(exercise_name) return completed - \ No newline at end of file + diff --git a/src/ltlnode.py b/src/ltlnode.py index 2bd5363..117f057 100644 --- a/src/ltlnode.py +++ b/src/ltlnode.py @@ -243,9 +243,9 @@ def __to_english__(self): f"{lhs} holds until {rhs} occurs", f"maintain {lhs} until {rhs}" ] - return ltltoeng.capitalize_sentence(ltltoeng.choose_best_sentence(patterns)) + return ltltoeng.choose_best_sentence(patterns) - return ltltoeng.capitalize_sentence(f"{lhs} until {rhs}") + return f"{lhs} until {rhs}" def __forge__(self): return f"({self.left.__forge__()} UNTIL {self.right.__forge__()})" @@ -264,7 +264,7 @@ def __to_english__(self): if x is not None: return x op = self.operand.__to_english__().rstrip('.') - return ltltoeng.capitalize_sentence(f"in the next step, {op}") + return f"in the next step, {op}" def __forge__(self): return f"(NEXT_STATE {self.operand.__forge__()})" @@ -294,7 +294,7 @@ def __to_english__(self): ] english = ltltoeng.choose_best_sentence(patterns) - return ltltoeng.capitalize_sentence(english) + return english def __forge__(self): return f"(ALWAYS {self.operand.__forge__()})" @@ -321,9 +321,9 @@ def __to_english__(self): f"{op} will eventually occur", f"at some point, {op} will hold" ] - return ltltoeng.capitalize_sentence(ltltoeng.choose_best_sentence(patterns)) + return ltltoeng.choose_best_sentence(patterns) - return ltltoeng.capitalize_sentence(f"eventually, {op}") + return f"eventually, {op}" def __forge__(self): return f"(EVENTUALLY {self.operand.__forge__()})" @@ -353,9 +353,9 @@ def __to_english__(self): f"{lhs} or {rhs}", f"at least one of {lhs} or {rhs}" ] - return ltltoeng.capitalize_sentence(ltltoeng.choose_best_sentence(patterns)) + return ltltoeng.choose_best_sentence(patterns) - return ltltoeng.capitalize_sentence(f"either {lhs} or {rhs}") + return f"either {lhs} or {rhs}" @@ -379,9 +379,9 @@ def __to_english__(self): f"{lhs} and {rhs}", f"{lhs} together with {rhs}" ] - return ltltoeng.capitalize_sentence(ltltoeng.choose_best_sentence(patterns)) + return ltltoeng.choose_best_sentence(patterns) - return ltltoeng.capitalize_sentence(f"both {lhs} and {rhs}") + return f"both {lhs} and {rhs}" class NotNode(UnaryOperatorNode): @@ -403,9 +403,9 @@ def __to_english__(self): f"{op} does not hold", f"{op} is false" ] - return ltltoeng.capitalize_sentence(ltltoeng.choose_best_sentence(patterns)) + return ltltoeng.choose_best_sentence(patterns) else: - return ltltoeng.capitalize_sentence(f"it is not the case that {op}") + return f"it is not the case that {op}" class ImpliesNode(BinaryOperatorNode): symbol = IMPLIES_SYMBOL @@ -432,7 +432,7 @@ def __to_english__(self): # Choose the most fluent pattern rather than picking randomly english = ltltoeng.choose_best_sentence(patterns) - return ltltoeng.capitalize_sentence(english) + return english class EquivalenceNode(BinaryOperatorNode): @@ -459,7 +459,7 @@ def __to_english__(self): # Choose the most fluent pattern rather than picking randomly english = ltltoeng.choose_best_sentence(patterns) - return ltltoeng.capitalize_sentence(english) + return english diff --git a/src/ltltoeng.py b/src/ltltoeng.py index 386e251..e9ff991 100644 --- a/src/ltltoeng.py +++ b/src/ltltoeng.py @@ -104,11 +104,11 @@ def capitalize_sentence(text): return text[0].upper() + text[1:] if len(text) > 1 else text.upper() -def smooth_grammar(text): - """Apply grammar smoothing rules to improve readability. - - Fixes common awkward phrasings that arise from composition. - """ +def smooth_grammar(text): + """Apply grammar smoothing rules to improve readability. + + Fixes common awkward phrasings that arise from composition. + """ if not text: return text @@ -127,6 +127,10 @@ def smooth_grammar(text): # Fix "then then" -> "then" text = text.replace("then then", "then") + # Fix "hold hold" or "holds holds" -> "hold" or "holds" + text = text.replace("hold hold", "hold") + text = text.replace("holds holds", "holds") + # Fix awkward "it is the case that it is the case that" text = text.replace("it is the case that it is the case that", "it is the case that") @@ -134,20 +138,30 @@ def smooth_grammar(text): text = text.replace("it is not the case that it is not the case that", "") # Fix ", ," -> "," - text = text.replace(", ,", ",") - - # Normalize mid-sentence capitalization of connectives like "If" or "Then" - def _lowercase_mid_sentence(match): - return match.group(1).lower() - - text = re.sub(r"(? "or p until q") + # This handles patterns like "!p1 | (p0 U p2)" -> "either p1 is false or p0 until p2" + text = re.sub(r'\b(either|or|and|both)\s+the case where\s+(\S.*?\s+until\s+)', r'\1 \2', text, flags=re.IGNORECASE) + + # Remove redundant "the state that" before "until" + text = re.sub(r'\b(either|or|and|both)\s+the state that\s+(\S.*?\s+holds\s+until\s+)', r'\1 \2', text, flags=re.IGNORECASE) + + # Simplify "either X or the case where Y" -> "either X or Y" when Y is already a clause + text = re.sub(r'\b(either\s+[^,]+)\s+or\s+the case where\s+', r'\1 or ', text, flags=re.IGNORECASE) + + # Normalize mid-sentence capitalization of connectives like "If" or "Then" + def _lowercase_mid_sentence(match): + return match.group(1).lower() + + text = re.sub(r"(? + + + + Rate English Paraphrases + + + + + + + + + {% include 'navbar.html' %} + +
+
+

How Similar Are These English Sentences?

+
+

Your ratings help with our research, including improving the translation quality of LTL sentences into readable English. Thank you for contributing.

+

Tell us whether these two sentences differ in meaning (1 = completely different meaning, 7 = essentially the same meaning). You can also flag awkward English.

+ +
+ Rated: {{ progress.done }}/{{ progress.total }} +
+ + {% if done %} +
+ All benchmark pairs are rated. Thank you! +
+ {% else %} +
+
+
+
+
+
Sentence A
+
+

{{ highlighted_base|safe }}

+
+
+
+
+
+
Sentence B
+
+

{{ highlighted_mutant|safe }}

+
+
+
+
+ +
+ + + +
+ + +
{% for i in range(7) %}{% endfor %}
+
+ 1
Completely different meaning
+ 2
Mostly different
+ 3
Somewhat different
+ 4
Mixed / unclear overlap
+ 5
Mostly similar
+ 6
Very similar
+ 7
Essentially same meaning
+
+
+ Different meaning + Same meaning +
+
+ + + +
+
+ +
+ + +
+ + +
+
+
+ {% endif %} +
+ + + diff --git a/src/templates/navbar.html b/src/templates/navbar.html index 5b5efb2..14f168c 100644 --- a/src/templates/navbar.html +++ b/src/templates/navbar.html @@ -44,6 +44,25 @@ + {% if uid %} + + {% endif %} +