Construct the overall assembly theorem

This brings us roughly to parity with the old MM1 `assemble` tactic, which has been moved to `assembler-old.mm1`; it will probably be deleted once everything is moved over and the new assembler works. This part is largely bookkeeping - we have assembled all the procedures and need to append them together to prove that the full thing assembles, and then we finally prove the `end e. u64` side condition that was deferred during the assembly process to turn `assemble` into `assembled`. The last stage looks trivial but is actually somewhat expensive in the MM0 model of computation by verification, and it makes an interesting asymptotic analysis problem. We have the theorems theorem assembled_l: $ assembled ctx (A +asm B) $ > $ assembled ctx A $; theorem assembled_r: $ assembled ctx (A +asm B) $ > $ assembled ctx B $; which can be thought of as a fancy version of the rule `(p -> a /\ b) |- (p -> a)`, and we want to use it to decompose a large conjunction `p -> a /\ ... /\ z` with, let's say, `n` conjuncts, into `n` theorems asserting each conjunct. The trouble is that MM0 requires that each theorem stand alone in terms of term formation, which means that even if we have been good and the conjunction is a balanced binary tree so that each individual proof involves only `log n` steps, we still need to construct the term `p -> a /\ ... /\ z` which is O(n) in each conjunct extraction theorem, leading to O(n^2) work. We can improve on this by adding lemmas. Suppose that `n = k * m`, and we separate the conjunctions into `k` groups of size `m`. So we prove the lemmas p -> a[0] /\ ... /\ a[m-1], ... p -> a[(k-1)*m] /\ ... /\ a[(k-1)*m + m-1] and then decompose the pieces further into conjuncts. Ignoring lower order terms, each lemma can be proved in O(k*m) (because we have to state the full conjunct in the first step), so the total cost of all the lemmas is O(k^2 m) = O(k n). Adding in the cost of proving the conjuncts results in the following recurrence: T(k * m) <= k T(m) + O(k^2 * m) If we solve the groups with the naive method so that T(m) <= m^2, we get T(k*m) <= O(k m^2 + k^2 m) and the best choice is to take k = sqrt n, for a bound of O(n^(3/2)). But there is no reason to stop at just one subdivision: if we take a branching factor of `k` and apply this method recursively, we get: T(k^(i+1)) <= k T(k^i) + O(k^(i+2)) => T(k^i) = O(i * k^(i+1)) => T(n) = O(n * log n * (k / log k)) So we get O(n log n) time if we pick any constant branching factor, with the optimal choice being k = e. Well, that's annoying, but k = 2 (make a lemma at every node) is easy to implement and only 6% worse. I believe this to be optimal within the current constraints of the language, but with language changes this could be brought down to linear time. I considered this for the MMB format, with a `theorems` command that would allow you to prove multiple theorems in one go, reusing the term pool for all of them. In this case you could use it to prove all n theorems in O(n) steps and each theorem is constant size, and the term pool used in the proof is also O(n), so the overall cost would be O(n). In the end it turned out to be a lot of complication to ask of verifiers, and a log n overhead isn't *that* bad, especially considering that the MM0 model of computation by term construction already imposes a log n overhead over random access because term constructors have a constant branching factor.
digama0 · Nov 5, 2021 · 97abfae · 97abfae
1 parent 5a8788b
commit 97abfae
Show file tree

Hide file tree

Showing 9 changed files with 568 additions and 359 deletions.
diff --git a/examples/assembler-new.mm1 b/examples/assembler-new.mm1
@@ -28,7 +28,7 @@ do {
 
   ((fn xs @ scan xs @ fn (x) @ scan x mmc-th)
     '(fal tru eq ne d0 suc add mul le lt ltlei ltnei ltneri leid pr
-      znsub cons len sadd scons s0 s1 ch c2n h2n hex)
+      znsub cons len sadd scons s0 s1 ch c2n h2n hex s2n)
     (map dn (range 0 17))
     (join @ map (fn (x) (map (fn (i) (atom-app x i)) hexstrings))
       '(x h2n _x00x decsuc))
@@ -43,54 +43,12 @@ do {
       unopInc unopDec unopNot unopNeg))
 };
 
-@mmc-th local def is_asmp (p: nat) (s: string) (x y: nat) (P: set): wff =
-$ s != 0 /\ y = x + len s /\ (y e. u64 -> p <> s <> x e. P) $;
-
-@mmc-th local def asmp_A (A B: set): set =
-$ S\ p, S\ s, {x | E. t1 E. t2 E. z (s = t1 ++ t2 /\
-  p <> t1 <> x e. A /\ p <> t2 <> (x + len t1) e. B)} $;
-
-@mmc-th theorem is_asmp_A (p s t x y z A B)
-  (h1: $ is_asmp p s x y A $) (h2: $ is_asmp p t y z B $):
-  $ is_asmp p (s '+ t) x z (asmp_A A B) $ = 'sorry;
-
-@mmc-th local def asmp_at (n: nat) (A: set): set =
-$ S\ p, S\ s, {x | x = n /\ p <> s <> x e. A} $;
-
-@mmc-th theorem is_asmp_at (p s x y A)
-  (h1: $ is_asmp p s x y A $): $ is_asmp p s x y (asmp_at x A) $ = 'sorry;
-
-@mmc-th local def asmp_pad: set = $ S\ p, S\ s, {x | s != 0} $;
-
-theorem is_asmp_pad (p s x y)
-  (h1: $ x0 < n $) (h2: $ s = repeat 0 n $) (h3: $ x + n = y $):
-  $ is_asmp p s x y asmp_pad $ = 'sorry;
-
--- Builds the theorems:
---
--- theorem is_asmp_A_pad_9 (p: nat) (s: string) (x y z: nat) (A: set):
---   $ is_asmp p s x y A $ >
---   $ y + x9 = z $ >
---   $ is_asmp p (s '+ _x00x9) x z (asmp_A A asmp_pad) $;
---
--- for padding of size 1 to 15.
-do (ignore @ on-hexstrings @ fn (n)
-  @ let ([xn (atom-app 'x n)] [i (hex->number xn)] [t '(,(_x00xn i))])
-  @ if (not {i == 0}) @ begin
-  (def name (atom-app "is_asmp_A_pad_" n))
-  (add-tac-thm! name
-    '((p nat ()) (s string ()) (x nat ()) (y nat ()) (z nat ()) (A set ()))
-    '((h1 $ is_asmp p s x y A $) (h2 $ y + ,'(,xn) = z $))
-    $ is_asmp p (s '+ ,t) x z (asmp_A A asmp_pad) $ ()
-    @ fn () @ focus '(is_asmp_A h1 @ is_asmp_pad _ ,(_x00xn_eq i) h2) norm_num)
-  (mmc-th name)
-);
-
 @mmc-th local def strlen (s: string) (n: nat): wff = $ len s = n $;
 @mmc-th theorem strlen0: $ strlen s0 x0 $ = '(eqtr (leneq s2ns0) @ eqtr4 len0 h2n0);
 @mmc-th theorem strlen1: $ strlen (s1 c0) x1 $ = '(eqtr (leneq s2ns1) @ eqtr4 len1 h2n1);
 theorem strlenSi (h: $ strlen s a $) (h2: $ suc a = b $): $ strlen (c ': s) b $ =
 '(eqtr (leneq s2nscons) @ eqtr lenS @ eqtr (suceq h) h2);
+theorem strlen_repeat (h: $ s = repeat 0 n $): $ strlen s n $ = '(eqtr (leneq h) repeatlen);
 do {
   (def (strlenn n) @ atom-app 'strlen (hexstring n))
   (for 1 15 @ fn (a)
@@ -101,8 +59,41 @@ do {
       (map (fn (x) '(,x char ())) args) () '(strlen ,str ,xb) () @ fn ()
       '(strlenSi ,(strlenn a) ,(decsucn a)))
     (mmc-th name))
+  (ignore @ on-hexstrings @ fn (n)
+    @ let ([xn (atom-app 'x n)] [i (hex->number xn)] [t '(,(_x00xn i))])
+    (def name (atom-app "strlen_x00x" n))
+    (add-tac-thm! name () () $ strlen ,t ,'(,xn) $ () @ fn () '(strlen_repeat ,(_x00xn_eq i)))
+    (mmc-th name))
 };
 
+@mmc-th local def assemble (s: string) (x y: nat) (P: set): wff =
+$ s != 0 /\ y = x + len s /\ (y e. u64 -> s <> x e. P) $;
+
+@mmc-th local def assembleA (A B: set): set =
+$ S\ s, {x | E. t1 E. t2 E. z (s = t1 ++ t2 /\ t1 != 0 /\ t2 != 0 /\
+  t1 <> x e. A /\ t2 <> (x + len t1) e. B)} $;
+infixr assembleA: $++asm$ prec 120;
+
+@mmc-th theorem assembleA_I (s t x y z A B)
+  (h1: $ assemble s x y A $) (h2: $ assemble t y z B $):
+  $ assemble (s '+ t) x z (A ++asm B) $ = 'sorry;
+
+@mmc-th local def localAssemble (p: nat) (s: string) (x y: nat) (P: set): wff =
+$ assemble s x y (P @' p) $;
+
+@mmc-th local def localAssembleA (A B: set): set = $ S\ p, A @' p ++asm B @' p $;
+infixr localAssembleA: $+asm$ prec 122;
+
+@mmc-th theorem localAssembleA_I (p s t x y z A B)
+  (h1: $ localAssemble p s x y A $) (h2: $ localAssemble p t y z B $):
+  $ localAssemble p (s '+ t) x z (A +asm B) $ = 'sorry;
+
+@mmc-th local def asmAt (n: nat) (A: set): set =
+$ S\ p, S\ s, {x | x = n /\ p <> s <> x e. A} $;
+
+@mmc-th theorem asmAtI (p s x y A)
+  (h1: $ localAssemble p s x y A $): $ localAssemble p s x y (asmAt x A) $ = 'sorry;
+
 --| See `parseInst`. (The `*N` functions are proof-internal, used for proving compiler lemmas.)
 @_ local def parseInstN (p ip s: nat) (I: set): wff =
 $ ip e. u64 -> A. x (ip = x + len s -> p <> s <> x e. I) $;
@@ -112,7 +103,7 @@ $ ip e. u64 -> A. x (ip = x + len s -> p <> s <> x e. I) $;
 @mmc-th local def parseInst (p ip: nat) (s: string) (I: set): wff = $ parseInstN p ip s I $;
 
 @mmc-th theorem parseInstE (p s x y n I) (h1: $ strlen s n $) (h2: $ x + n = y $)
-  (h3: $ parseInst p y s I $): $ is_asmp p s x y I $ = 'sorry;
+  (h3: $ parseInst p y s I $): $ localAssemble p s x y I $ = 'sorry;
 
 --| `parseOpc p ip s rex opc I` means that after decoding the REX byte to `rex`,
 --| if the first byte of the instruction is `opc` and the remainder is `s`,
@@ -724,3 +715,69 @@ $ S\ p, S\ s, {x | E. l1 E. l2 (s = l1 ++ l2 /\
 
 @mmc-th theorem parseAssert:
   $ parseOpc p ip (ch x0 x2 ': ch x0 xf ': s1 (ch x0 xb)) rex (ch x7 c) (instAssert c) $ = 'sorry;
+
+--------------------------------------------
+-- Post-processing the result of assembly --
+--------------------------------------------
+
+@mmc-th local def asmProc (n: nat) (A: set): set =
+$ S\ s, {p | p = n /\ E. t1 E. t2 E. z (s = t1 ++ t2 /\ t1 != 0 /\ p <> t1 <> 0 e. A)} $;
+
+@mmc-th theorem asmProcI (s x n y A)
+  (h1: $ localAssemble x s x0 n A $)
+  (h2: $ x + n = y $):
+  $ assemble s x y (asmProc x A) $ = 'sorry;
+
+@mmc-th theorem assemble_pad
+  (h1: $ assemble s x y (asmProc x A) $)
+  (h2: $ y + n = z $)
+  (h3: $ strlen t n $):
+  $ assemble (s '+ t) x z (asmProc x A) $ = 'sorry;
+
+--| `assembled c P` (assembly output) asserts that if the full file content is `c`,
+--| then `P` holds of some nonempty substring in `c`.
+--| `P` will contain embedded assertions about the position or the
+--| string where relevant (using `asmAt`), so that the existential is uniquely determined.
+@_ local def assembledN (c: nat) (P: set): wff =
+$ E. x E. s (s != 0 /\ sublistAt x c s /\
+  text_start + x + len s e. u64 /\ s <> (text_start + x) e. P) $;
+
+--| `assembled c P` (assembly output) asserts that if the full file content is `c`,
+--| then `P` holds of some nonempty substring in `c`.
+--| `P` will contain embedded assertions about the position or the
+--| string where relevant (using `asmAt`), so that the existential is uniquely determined.
+@mmc-th local def assembled (c: string) (P: set): wff = $ assembledN c P $;
+
+@mmc-th local def isU64 (n: nat): wff = $ n e. u64 $;
+
+-- Proves, for digits i = 0, .., 15:
+-- theorem isU64_5 (a0 a1 a2 a3 a4 a5: hex):
+--   $ isU64 (a5 :x a4 :x a3 :x a2 :x a1 :x a0) $;
+-- (Note that the number of digits is one more than the index value.)
+do (ignore @ on-hexstrings @ fn (n)
+  @ let ([xn (atom-app 'x n)] [i (hex->number xn)] [t '(,(_x00xn i))]
+         [(ai j) (atom-app 'a j)])
+  (def name (atom-app "isU64_" n))
+  (add-tac-thm! name
+    (map (fn (j) '(,(ai j) hex ())) (range 0 {i + 1})) ()
+    '(isU64 ,(foldr (range 0 i) '(h2n ,(ai i)) @ fn (j x) '(hex ,x ,(ai j)))) () @ fn ()
+    '(xelu64i ,(iterate {i // 2} (fn (x) '(xelBits8S ,x)) (atom-app 'xelBits8 {{i % 2} + 1})))));
+
+@mmc-th theorem assembledI
+  (h1: $ assemble c ,0x400078 y A $) (h2: $ isU64 y $): $ assembled c A $ = 'sorry;
+
+@mmc-th theorem assembled_l (h1: $ assembled c (A ++asm B) $): $ assembled c A $ = 'sorry;
+
+@mmc-th theorem assembled_r (h1: $ assembled c (A ++asm B) $): $ assembled c B $ = 'sorry;
+
+@mmc-th local def localAssembled (ctx: nat) (P: set): wff =
+$ assembledN (fst ctx) (P @' snd ctx) $;
+
+@mmc-th theorem localAssembledI (h1: $ assembled c (asmProc p A) $):
+  $ localAssembled (c <> p) A $ = 'sorry;
+
+@mmc-th theorem localAssembled_l
+  (h1: $ localAssembled ctx (A +asm B) $): $ localAssembled ctx A $ = 'sorry;
+
+@mmc-th theorem localAssembled_r
+  (h1: $ localAssembled ctx (A +asm B) $): $ localAssembled ctx B $ = 'sorry;
diff --git a/examples/compiler-old.mm1 b/examples/compiler-old.mm1
@@ -2120,20 +2120,3 @@ $ S\ vs, (E.s a e. _V, (has_ty a (ty @' vs) *s
 $ ,(letrec ([(f n a x)
     @ if {n = 0} '(cons ,a ,x) (f {n - 1} '(fst ,a) @ f {n - 1} '(snd ,a) x)])
     (f 4 'a 'l)) $;
-
-do {
-  (def mmc-compiler (ref! (mmc-init)))
-  (def mmc-reset
-    (def c mmc-compiler)
-    (fn () (set! c (mmc-init))))
-  (def mmc-add
-    (def c mmc-compiler)
-    (fn xs (apply c '+ xs)))
-  (def mmc-finish
-    (def c mmc-compiler)
-    (fn xs (apply c 'finish xs)))
-  (def mmc-compiler)
-  (def (mmc-compile x . xs)
-    (apply mmc-add xs)
-    (mmc-finish x))
-};