diff --git a/examples/bfgs.dx b/examples/bfgs.dx index 237664a39..16c121485 100644 --- a/examples/bfgs.dx +++ b/examples/bfgs.dx @@ -21,7 +21,7 @@ def zoom( c1:Float, c2:Float ) -> Float = - -- Zoom line search helper; Algorithm 3.6 from Nocedal and Wright (2006). + # Zoom line search helper; Algorithm 3.6 from Nocedal and Wright (2006). f0 = f_line 0. g0 = grad f_line 0. @@ -32,10 +32,10 @@ def zoom( a_lo = get a_lo_ref a_hi = get a_hi_ref - --Find a trial step length between a_lo and a_hi. + #Find a trial step length between a_lo and a_hi. a_i = 0.5 * (a_lo + a_hi) - if (a_hi - a_lo) < 0.000001 -- The line search failed. + if (a_hi - a_lo) < 0.000001 # The line search failed. then Done a_i else f_ai = f_line a_i @@ -57,7 +57,7 @@ def zoom_line_search( maxiter:Nat, f: (Float)->Float ) -> Float = - --Algorithm 3.5 from Nocedal and Wright (2006). + #Algorithm 3.5 from Nocedal and Wright (2006). c1 = 0.0001 c2 = 0.9 a_max = 1. @@ -90,7 +90,7 @@ def backtracking_line_search( maxiter:Nat, f: (Float)->Float ) -> Float = - -- Algorithm 3.1 in Nocedal and Wright (2006). + # Algorithm 3.1 in Nocedal and Wright (2006). a_init = 1. f_0 = f 0. g_0 = grad f 0. @@ -113,14 +113,14 @@ struct BFGSresults(n|Ix) = num_iter: Nat def bfgs_minimize( - f: (n=>Float)->Float, --Objective function. - x0: n=>Float, --Initial point. - H0: n=>n=>Float, --Initial inverse Hessian approximation. - linesearch: ((Float)->Float)->Float, --Line search that returns a step size. - tol: Float, --Convergence tolerance (of the gradient L2 norm). - maxiter: Nat --Maximum number of BFGS iterations. + f: (n=>Float)->Float, #Objective function. + x0: n=>Float, #Initial point. + H0: n=>n=>Float, #Initial inverse Hessian approximation. + linesearch: ((Float)->Float)->Float, #Line search that returns a step size. + tol: Float, #Convergence tolerance (of the gradient L2 norm). + maxiter: Nat #Maximum number of BFGS iterations. ) -> BFGSresults n given (n|Ix) = - -- Algorithm 6.1 in Nocedal and Wright (2006). + # Algorithm 6.1 in Nocedal and Wright (2006). xref <- with_state x0 Href <- with_state H0 @@ -143,7 +143,7 @@ def bfgs_minimize( g_next = grad f x_next grad_diff = g_next - g - -- Update the inverse Hessian approximation. + # Update the inverse Hessian approximation. rho_inv = vdot grad_diff x_diff if not (rho_inv == 0.) then @@ -157,7 +157,7 @@ def bfgs_minimize( def rosenbrock(coord:(Fin 2)=>Float) -> Float = - --A standard optimization test case; the optimum is (1, 1). + #A standard optimization test case; the optimum is (1, 1). x = coord[0@_] y = coord[1@_] pow (1 - x) 2 + 100 * pow (y - x * x) 2 @@ -188,8 +188,8 @@ def multiclass_logreg( res = bfgs_minimize ob_fun w0 eye (\f. zoom_line_search maxls f) tol maxiter res.fval --- Define a version of `multiclass_logreg` with Int instead of Nat labels, callable from Python --- (see benchmarks/bfgs.py). +# Define a version of `multiclass_logreg` with Int instead of Nat labels, callable from Python +# (see benchmarks/bfgs.py). def multiclass_logreg_int( xs:(Fin n)=>(Fin d)=>Float, ys:(Fin n)=>Int, diff --git a/examples/brownian_motion.dx b/examples/brownian_motion.dx index 60eab8076..371d87bc8 100644 --- a/examples/brownian_motion.dx +++ b/examples/brownian_motion.dx @@ -31,17 +31,17 @@ def sample_unit_brownian_bridge( key:Key, t:Float ) -> v given (v|VSpace) = - -- Can only handle t between 0 and 1. - -- iteratively subdivide to desired tolerance. + # Can only handle t between 0 and 1. + # iteratively subdivide to desired tolerance. 
num_iters = 10 + f_to_n (-log tolerance) init_state = SamplerState(key, zero, 1.0, t) result = fold init_state \i:(Fin num_iters) prev. [key_draw, key_left, key_right] = split_key key - -- add scaled noise + # add scaled noise t' = abs (prev.t - 0.5) new_y = prev.y + prev.sigma * (0.5 - t') .* sampler key_draw - -- zoom in left or right + # zoom in left or right new_key = if prev.t > 0.5 then key_left else key_right @@ -72,8 +72,8 @@ def sample_brownian_bridge( y0:v, t1:Float, y1:v, t:Float ) -> v given (v|VSpace) = - -- Linearly interpolate between the two bracketing samples - -- and add appropriately scaled Brownian bridge noise. + # Linearly interpolate between the two bracketing samples + # and add appropriately scaled Brownian bridge noise. unit_bb = \t. sample_unit_brownian_bridge (tolerance / (t1 - t0)) sampler key t bridge_val = scale_brownian_bridge unit_bb t0 t1 t interp_val = linear_interp t0 y0 t1 y1 t @@ -85,8 +85,8 @@ def sample_brownian_motion( key:Key, t':Float ) -> v given (v|VSpace) = - -- Can handle a t' anywhere on the real line. - -- Handle negative times by reflecting and using a different key. + # Can handle a t' anywhere on the real line. + # Handle negative times by reflecting and using a different key. [neg_key', key'] = split_key key (key, t) = if t' < 0.0 then (neg_key', -t') @@ -95,8 +95,8 @@ def sample_brownian_motion( [key_start, key_rest] = split_key key first_f = sampler key_start - -- Iteratively sample a point twice as far ahead - -- until we bracket the query time. + # Iteratively sample a point twice as far ahead + # until we bracket the query time. doublings = Fin (1 + (natlog2 (f_to_n t))) init_state = (0.0, zero, 1.0, first_f, key_rest) (t0, left_f, t1, right_f, key_draw) = @@ -105,8 +105,8 @@ def sample_brownian_motion( [key_current, key_continue] = split_key key_rest new_right_f = left_f + (sqrt t1) .* sampler key_current (t1, right_f, t1 * 2.0, new_right_f, key_continue) - -- The above iterative loop could be mostly parallelized, but would - -- require some care to get the random keys right. + # The above iterative loop could be mostly parallelized, but would + # require some care to get the random keys right. sample_brownian_bridge tolerance sampler key_draw t0 left_f t1 right_f t @@ -152,7 +152,7 @@ from a Gaussian process. In particular, we can re-use the This process can be nested to arbitrarily many dimensions. interface HasBrownianSheet(a:Type, v|HasStandardNormal) - -- tolerance -> key -> input -> output + # tolerance -> key -> input -> output sample_brownian_sheet : (Float, Key, a) -> v instance HasBrownianSheet(Float, v) given (v|HasStandardNormal|VSpace) @@ -161,8 +161,8 @@ instance HasBrownianSheet(Float, v) given (v|HasStandardNormal|VSpace) instance HasBrownianSheet((a, Float), v) given (a, v|VSpace) (HasBrownianSheet a v) def sample_brownian_sheet(tol, k, xab) = - -- the call to `sample_brownian_sheet` below will recurse - -- until the input is one-dimensional. + # the call to `sample_brownian_sheet` below will recurse + # until the input is one-dimensional. (xa, xb) = xab sample_a = \k. sample_brownian_sheet tol k xa sample_brownian_motion tol sample_a k xb diff --git a/examples/ctc.dx b/examples/ctc.dx index 34b26998f..e677501ac 100644 --- a/examples/ctc.dx +++ b/examples/ctc.dx @@ -13,7 +13,7 @@ However, Dex's autodiff can produce the backward pass automatically. That makes this code much shorter than most implementations. -import stats -- for log-space representation of probabilities. 
+import stats # for log-space representation of probabilities. '## Custom index sets @@ -103,7 +103,7 @@ project(to=AllButFirst (Bool `Either` Bool), 0@(Bool `Either` Bool)) '### Helper functions def interleave_blanks(blank:v, seq: m=>v) -> (FenceAndPosts m)=>v given (m|Ix, v) = - -- Turns "text" into " t e x t ". + # Turns "text" into " t e x t ". for idx. case idx of Fence i -> seq[i] Posts i -> blank @@ -127,8 +127,8 @@ def ctc_non_empty( ilabels = interleave_blanks blank labels label_probs = for t. normalize logits[t] - -- Initialize dynamic programming table to all zeros - -- except for starting with either a blank or the first token. + # Initialize dynamic programming table to all zeros + # except for starting with either a blank or the first token. prob_seq_at_start = yield_state zero \lpRef. lpRef!(Posts first_ix) := label_probs[first_ix,blank] lpRef!(Fence first_ix) := label_probs[first_ix,labels[first_ix]] @@ -150,14 +150,14 @@ def ctc_non_empty( prob_transition_from_blank = safe_idx_sub prev s 1 prob_transition_from_last_token = safe_idx_sub prev s 2 - -- Equation 6 from CTC paper. + # Equation 6 from CTC paper. transition_prob = if ilabels[s] == blank || same_token_as_last s then prob_no_transition + prob_transition_from_blank else prob_no_transition + prob_transition_from_blank + prob_transition_from_last_token transition_prob * label_probs[inject t, ilabels[s]] - -- Sum over last two entries in the table. Eq 8 from paper. + # Sum over last two entries in the table. Eq 8 from paper. end_with_label = prob_seq_final[Fence last_ix] end_with_space = prob_seq_final[Posts last_ix] end_with_label + end_with_space @@ -201,11 +201,11 @@ Vocab = Fin 6 blank = 0@Vocab Time = Fin 4 --- Create random logits and labels +# Create random logits and labels logits : Time => Vocab => (LogSpace Float) = arb $ new_key 0 labels = [(3@Vocab), (4@Vocab), (1@Vocab)] --- Evaluate probability. +# Evaluate probability. :p ls_to_f $ ctc blank logits labels > 0.00208998 @@ -216,7 +216,7 @@ computes `p(labels|logits)`, which should sum to 1.0 over all possible labels. They don't sum to one. Maybe there is a bug in the above code or the paper. --- Fin N=>Vocab is the set of all combinations of N tokens. +# Fin N=>Vocab is the set of all combinations of N tokens. sum for i:(Fin 3=>Vocab). ls_to_f $ ctc blank logits i > 0.5653746 diff --git a/examples/dither.dx b/examples/dither.dx index 40b2a9f5b..5acce7398 100644 --- a/examples/dither.dx +++ b/examples/dither.dx @@ -81,7 +81,7 @@ halftone = [[14.0, 10, 6, 13, 19, 23, 27, 20], ' # Dithering a real image --- conversion from RGB to grayscale +# conversion from RGB to grayscale def pixel(x:Char) -> Float32 = r = w8_to_i x diff --git a/examples/fluidsim.dx b/examples/fluidsim.dx index 64b6b1ddb..3d606f77a 100644 --- a/examples/fluidsim.dx +++ b/examples/fluidsim.dx @@ -6,13 +6,13 @@ import png import plot def wrapidx(n|Ix, i:Int) -> n = - -- Index wrapping around at ends. + # Index wrapping around at ends. asidx $ unsafe_i_to_n $ mod i $ n_to_i $ size n -def incwrap(i:n) -> n given (n|Ix) = -- Increment index, wrapping around at ends. +def incwrap(i:n) -> n given (n|Ix) = # Increment index, wrapping around at ends. asidx $ unsafe_i_to_n $ mod ((n_to_i $ ordinal i) + 1) $ n_to_i $ size n -def decwrap(i:n) -> n given (n|Ix) = -- Decrement index, wrapping around at ends. +def decwrap(i:n) -> n given (n|Ix) = # Decrement index, wrapping around at ends. 
 asidx $ unsafe_i_to_n $ mod (n_to_i (ordinal i) - 1) $ n_to_i $ size n

 def finite_difference_neighbours(x:n=>a) -> n=>a given (n|Ix, a|Add|Sub) =
@@ -42,11 +42,11 @@ def add_neighbours_2d(x:n=>m=>a) -> (n=>m=>a) given (n|Ix, m|Ix, a|Add) =
   ax1 + ax2

 def project_vel(v: n=>m=>(Fin 2)=>a) -> n=>m=>(Fin 2)=>a given (n|Ix, m|Ix, a|VSpace) =
-  -- Project_vel the velocity field to be approximately mass-conserving,
-  -- using a few iterations of Gauss-Seidel.
+  # Project the velocity field to be approximately mass-conserving,
+  # using a few iterations of Gauss-Seidel.
   h = 1.0 / n_to_f (size n)

-  -- unpack into two scalar fields
+  # unpack into two scalar fields
   vx = for i j. v[i,j, from_ordinal 0]
   vy = for i j. v[i,j, from_ordinal 1]
@@ -59,7 +59,7 @@ def project_vel(v: n=>m=>(Fin 2)=>a) -> n=>m=>(Fin 2)=>a given (n|Ix, m|Ix, a|VS
   vx = vx - (0.5 / h) .* fdx(p)
   vy = vy - (0.5 / h) .* fdy(p)

-  for i j. [vx[i,j], vy[i,j]] -- pack back into a table.
+  for i j. [vx[i,j], vy[i,j]] # pack back into a table.

 def bilinear_interp(
     right_weight:Float, bottom_weight:Float,
@@ -71,32 +71,32 @@ def bilinear_interp(
   left + right

 def advect(f: n=>m=>a, v: n=>m=>(Fin 2)=>Float) -> n=>m=>a given (n|Ix, m|Ix, a|VSpace) =
-  -- Move field f according to x and y velocities (u and v)
-  -- using an implicit Euler integrator.
+  # Move field f according to x and y velocities (u and v)
+  # using an implicit Euler integrator.
   cell_xs = linspace n 0.0 $ n_to_f (size n)
   cell_ys = linspace m 0.0 $ n_to_f (size m)

   for i j.
-    -- Location of source of flow for this cell. No meshgrid!
+    # Location of source of flow for this cell. No meshgrid!
     center_xs = cell_xs[i] - v[i,j, from_ordinal 0]
     center_ys = cell_ys[j] - v[i,j, from_ordinal 1]

-    -- Index of source cell.
+    # Index of source cell.
     source_col = floor center_xs
     source_row = floor center_ys

-    -- Relative weight of right-hand and bottom cells.
+    # Relative weight of right-hand and bottom cells.
     right_weight = center_xs - source_col
     bottom_weight = center_ys - source_row

-    -- Cast back to indices, wrapping around edges.
+    # Cast back to indices, wrapping around edges.
     l = wrapidx n (f_to_i source_col)
     r = wrapidx n ((f_to_i source_col) + 1)
     t = wrapidx m (f_to_i source_row)
     b = wrapidx m ((f_to_i source_row) + 1)

-    -- A convex weighting of the 4 surrounding cells.
+    # A convex weighting of the 4 surrounding cells.
     bilinear_interp right_weight bottom_weight f[l,t] f[l,b] f[r,t] f[r,b]

 def fluidsim(
@@ -107,9 +107,9 @@ def fluidsim(
   with_state (color_init, v) \state.
     for i:(Fin num_steps).
       (color, v) = get state
-      v = advect v v -- Move velocities
-      v = project_vel v -- Project to be volume-preserving
-      color' = advect color v -- Move color
+      v = advect v v # Move velocities
+      v = project_vel v # Project to be volume-preserving
+      color' = advect color v # Move color
       state := (color', v)
       color
@@ -118,13 +118,13 @@ def fluidsim(
 N = Fin 50
 M = Fin 50

--- Create random velocity field.
+# Create random velocity field.
 def ixkey3(k:Key, i:n, j:m, k2:o) -> Key given (n|Ix, m|Ix, o|Ix) =
   hash (hash (hash k (ordinal i)) (ordinal j)) (ordinal k2)

 init_velocity = for i:N j:M k:(Fin 2). 3.0 * (randn $ ixkey3 (new_key 0) i j k)

--- Create diagonally-striped color pattern.
+# Create diagonally-striped color pattern.
 init_color = for i:N j:M.
   i' = n_to_f $ ordinal i
   j' = n_to_f $ ordinal j
@@ -133,7 +133,7 @@ init_color = for i:N j:M.
   g = b_to_f $ (sin $ (j' + i') / 4.0) > 0.0
   [r, g, b]

--- Run fluid sim and plot it.
+# Run fluid sim and plot it.
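Before the fluid sim is run and plotted below, here is the semi-Lagrangian lookup that `advect` performs, re-expressed as a NumPy sketch for a single scalar field. This is illustrative only and not part of the patch; the function name and the scalar-field restriction are assumptions.

```python
# NumPy sketch of the advect step above, under the same conventions:
# velocities are in cell units and indices wrap toroidally.
import numpy as np

def advect_np(f, v):
    # f: (n, m) scalar field; v: (n, m, 2) velocity field.
    n, m = f.shape
    xs = np.arange(n)[:, None] - v[..., 0]   # source x for every cell
    ys = np.arange(m)[None, :] - v[..., 1]   # source y for every cell
    c0, r0 = np.floor(xs).astype(int), np.floor(ys).astype(int)
    wx, wy = xs - c0, ys - r0                # right/bottom weights
    l, r = c0 % n, (c0 + 1) % n              # wrap indices at the edges
    t, b = r0 % m, (r0 + 1) % m
    top = (1 - wx) * f[l, t] + wx * f[r, t]  # convex blend of the
    bot = (1 - wx) * f[l, b] + wx * f[r, b]  # 4 surrounding cells
    return (1 - wy) * top + wy * bot
```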
num_steps = 5 :html imseqshow $ fluidsim num_steps init_color init_velocity > @@ -142,7 +142,7 @@ num_steps = 5 target = transpose init_color --- This is partial +# This is partial def last(xs:n=>a) -> a given (n|Ix, a) = xs[(unsafe_nat_diff (size n) 1)@_] def objective(v:N=>M=>(Fin 2)=>Float) -> Float = diff --git a/examples/kernelregression.dx b/examples/kernelregression.dx index 0135bd1f8..d16ceae55 100644 --- a/examples/kernelregression.dx +++ b/examples/kernelregression.dx @@ -8,7 +8,7 @@ struct ConjGradState(a|VSpace) = r : a p : a --- Conjugate gradients solver +# Conjugate gradients solver def solve'(mat:m=>m=>Float, b:m=>Float) -> m=>Float given (m|Ix) = x0 = zero ax = mat **. x0 @@ -42,7 +42,7 @@ where $k:\mathcal X \times \mathcal X \to \mathbb R$ is a positive semidefinite The optimal coefficients are found by solving a linear system $\alpha=G^{-1}y$,\ where $G_{ij}:=k(x_i, x_j)+\delta_{ij}\lambda$, $\lambda>0$ and $y = (y_1,\dots,y_N)^\top\in\mathbb R^N$ --- Synthetic data +# Synthetic data Nx = Fin 100 noise = 0.1 [k1, k2] = split_key (new_key 0) @@ -53,7 +53,7 @@ def trueFun(x:Float) -> Float = xs : Nx=>Float = for i. rand (ixkey k1 i) ys : Nx=>Float = for i. trueFun xs[i] + noise * randn (ixkey k2 i) --- Kernel ridge regression +# Kernel ridge regression def regress(kernel: (a, a) -> Float, xs: Nx=>a, ys: Nx=>Float) -> (a) -> Float given (a) = gram = for i j. kernel xs[i] xs[j] + select (i==j) 0.0001 0.0 alpha = solve' gram ys @@ -64,7 +64,7 @@ def rbf(lengthscale:Float, x:Float, y:Float) -> Float = predict = regress (\x y. rbf 0.2 x y) xs ys --- Evaluation +# Evaluation Nxtest = Fin 1000 xtest : Nxtest=>Float = for i. rand (ixkey k1 i) preds = map predict xtest diff --git a/examples/latex.dx b/examples/latex.dx index a8c159b5f..bdd33b5fb 100644 --- a/examples/latex.dx +++ b/examples/latex.dx @@ -33,9 +33,9 @@ 'LaTeX rendering should not occur in code blocks, nor in error output cells. def array_sum(x:a=>Int32) -> Int32 given (a|Ix) = - -- Note: the following line has `$ ... $`, but it should not trigger KaTeX. - -- Note: the incorrect usage of `with_state` below is intentional to verify - -- that `$ ... $` is not rendered as LaTeX in error output cells. + # Note: the following line has `$ ... $`, but it should not trigger KaTeX. + # Note: the incorrect usage of `with_state` below is intentional to verify + # that `$ ... $` is not rendered as LaTeX in error output cells. snd $ with_state 0 \acc. for i. acc := (get acc) + x[i] diff --git a/examples/linear-maps.dx b/examples/linear-maps.dx index 400367099..3631c5c03 100644 --- a/examples/linear-maps.dx +++ b/examples/linear-maps.dx @@ -11,17 +11,17 @@ import linalg 'Todo: factor out a more general linear map typeclass, and make this one inherit from it. --- interface [VSpace m, VSpace in, VSpace out] LinearMap m in out - -- apply: m -> in -> out - -- transpose': m -> m +# interface [VSpace m, VSpace in, VSpace out] LinearMap m in out + # apply: m -> in -> out + # transpose': m -> m '## Linear Endomorphisms a.k.a. linear maps from a space back to that same space. 
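Before the Dex interfaces, a plain Python sketch of the idea this section develops: one Gaussian log-pdf written against an abstract linear map that knows how to apply itself, solve against itself, and report its determinant. The class and method names here are illustrative, not the Dex interface.

```python
import numpy as np

class DenseMap:
    # One possible representation; diagonal or triangular maps would
    # expose the same three methods with cheaper implementations.
    def __init__(self, a): self.a = np.asarray(a, dtype=float)
    def apply(self, x):    return self.a @ x
    def solve(self, y):    return np.linalg.solve(self.a, y)
    def det(self):         return np.linalg.det(self.a)

def gaussian_log_pdf(mean, cov_root, x):
    # cov_root is a root S of the covariance (Sigma = S S^T). Any map
    # with an efficient solve and determinant works unchanged.
    d = len(mean)
    z = cov_root.solve(x - mean)            # whitened residual
    log_det = np.log(abs(cov_root.det()))   # log |det S|
    return -0.5 * z @ z - log_det - 0.5 * d * np.log(2 * np.pi)
```

Writing the density against the root S keeps the formula exact for every representation: if x = mean + S z with z standard normal, the density is the standard normal density of z divided by |det S|.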
 interface HasDeterminant(m)
   determinant': (m) -> Float
-  transposeType : Type -- unused for now
-  transpose': (m) -> m -- In future: m -> transposeType m
+  transposeType : Type # unused for now
+  transpose': (m) -> m # In future: m -> transposeType m
   identity': m

 interface LinearEndo(m|HasDeterminant, v|VSpace)
@@ -139,8 +139,8 @@ instance HasDeterminant(SkewSymmetricMap n) given (n|Ix)
     True -> zero
     False ->
       dense_rep = skew_symmetric_prod a.val eye
-      determinant dense_rep -- Naive algorithm, could be done with Pfaffian
-  transposeType = (i:n)=>(..Float --
+      determinant dense_rep # Naive algorithm, could be done with Pfaffian
+  transposeType = (i:n)=>(..Float #
   def transpose'(x) = SkewSymmetricMap (-x.val)
   identity' = error "Skew symmetric matrices can't represent the identity map."
@@ -148,20 +148,20 @@ instance LinearEndo(SkewSymmetricMap n, n=>v) given (n|Ix, v|Mul|VSpace)
   def apply(x, y) = skew_symmetric_prod x.val y
   def diag(x) = zero
   def solve'(x, y) =
-    dense_rep = skew_symmetric_prod x.val eye -- Fall back to naive algorithm
+    dense_rep = skew_symmetric_prod x.val eye # Fall back to naive algorithm
     solve dense_rep y

 instance Arbitrary(SkewSymmetricMap n) given (n|Ix)
   def arb(k) = SkewSymmetricMap $ arb k

--- Todo: Sparse matrices. Need hashmaps for these to be practical.
+# Todo: Sparse matrices. Need hashmaps for these to be practical.

 '## Application 1: Gaussians

--- This typeclass will be obsolete once the `Basis` typeclass can be written.
+# This typeclass will be obsolete once the `Basis` typeclass can be written.
 interface HasStandardNormal(a:Type)
   randNormal : (Key) -> a
@@ -185,7 +185,7 @@
 This single definition of a Gaussian log pdf should work efficiently for any
 type of covariance matrix for which an efficient solve and determinant is
 known.

--- This helper will be osbolete once the basis typeclass works.
+# This helper will be obsolete once the basis typeclass works.
 def get_VSpace_dim(x:v) -> Float given (v|Mul|VSpace|InnerProd) =
   one' : v = one
   inner_prod one' one'
@@ -257,11 +257,11 @@ def check_transpose(m|HasDeterminant|Arbitrary, given () (LinearEndo m v)) -> Bo
 check_transpose $ SkewSymmetricMap vec_len
 > True

--- Disabled until associated types are implemented.
--- check_transpose $ LowerTriMap vec_len
--- > True
--- check_transpose $ UpperTriMap vec_len
--- > True
+# Disabled until associated types are implemented.
+# check_transpose $ LowerTriMap vec_len
+# > True
+# check_transpose $ UpperTriMap vec_len
+# > True
 check_transpose $ vec_len=>vec_len=>Float
 > True
 check_transpose $ DiagMap vec_len
@@ -269,7 +269,7 @@ check_transpose $ DiagMap vec_len
 check_transpose ScalarMap
 > True

--- Check that 1D Gaussian sums to 1
+# Check that 1D Gaussian sums to 1
 sizen = Fin 2000
 span = 10.0
 xs = linspace sizen (-span) span
@@ -293,14 +293,14 @@ def check_2D_Gaussian_normalizes(m|Arbitrary, given () (LinearEndo m ((Fin 2) =>
     exp $ gaussian_log_pdf meanvec covroot x
   integral ~~ 1.0

--- Disable due to FP tolerance issue
--- check_2D_Gaussian_normalizes (SkewSymmetricMap (Fin 2))
--- > True
--- Disabled until associated types are implemented.
+# Disabled due to an FP tolerance issue.
+# check_2D_Gaussian_normalizes (SkewSymmetricMap (Fin 2))
+# > True
+# Disabled until associated types are implemented.
+# check_2D_Gaussian_normalizes (LowerTriMat (Fin 2) Float) +# > True +# check_2D_Gaussian_normalizes (UpperTriMat (Fin 2) Float) +# > True check_2D_Gaussian_normalizes $ (Fin 2)=>(Fin 2)=>Float > True check_2D_Gaussian_normalizes $ DiagMap (Fin 2) @@ -323,9 +323,9 @@ def radon_nikodym( state:s, t:Time ) -> Float given (m, s|InnerProd) (LinearEndo m s) = - -- By Girsanov's theorem, this gives a simple Monte Carlo estimator - -- of the (loosely speaking) instantaneous KL divergence between - -- two SDEs that share a diffusion function. + # By Girsanov's theorem, this gives a simple Monte Carlo estimator + # of the (loosely speaking) instantaneous KL divergence between + # two SDEs that share a diffusion function. difference = (drift1 state t) - (drift2 state t) cur_diffusion = diffusion state t a = solve' cur_diffusion difference @@ -357,7 +357,7 @@ def stationary_SDE_parts_to_SDE( (neg_energy_func, skew_symm_map, diffusion_func) = parts drift = \state time. - diffusion_prod = \vec. -- Square the root of the covariance matrix + diffusion_prod = \vec. # Square the root of the covariance matrix cur_diffusion_root = diffusion_func state 0.5 .* (apply cur_diffusion_root (apply cur_diffusion_root vec)) @@ -376,4 +376,4 @@ def stationary_SDE_parts_to_SDE( diffusion = \state time. diffusion_func state (drift, diffusion) --- Todo: tests for SDE functions. +# Todo: tests for SDE functions. diff --git a/examples/manifold-gradients.dx b/examples/manifold-gradients.dx index f9f48a845..7674d8e9f 100644 --- a/examples/manifold-gradients.dx +++ b/examples/manifold-gradients.dx @@ -26,18 +26,18 @@ compiler is able to infer the tangent space type from the manifold type, but this feature is not yet available in Dex. interface TangentSpace(manifold, tangent|VSpace) - -- Given a point p on the manifold M and a point v in its tangent space TpM, - -- this returns another point q on the manifold. - -- Must satisfy tangentExponentialMap p zero == p - -- (tangentExponentialMap p) will be linearized around zero, so it is - -- important that any operations used have well-defined derivatives for inputs - -- p, zero. + # Given a point p on the manifold M and a point v in its tangent space TpM, + # this returns another point q on the manifold. + # Must satisfy tangentExponentialMap p zero == p + # (tangentExponentialMap p) will be linearized around zero, so it is + # important that any operations used have well-defined derivatives for inputs + # p, zero. tangentExponentialMap: (manifold, tangent) -> manifold - -- (tangentLogarithmMap p) is the pointwise inverse to - -- (tangentExponentialMap p). - -- Must satisfy tangentLogarithmMap p p == zero - -- (tangentLogarithmMap p) will be linearized around p, so it is important - -- that any operations used have well-defined derivatives for inputs p, p. + # (tangentLogarithmMap p) is the pointwise inverse to + # (tangentExponentialMap p). + # Must satisfy tangentLogarithmMap p p == zero + # (tangentLogarithmMap p) will be linearized around p, so it is important + # that any operations used have well-defined derivatives for inputs p, p. tangentLogarithmMap: (manifold, manifold) -> tangent @@ -101,7 +101,7 @@ def myFunc(x : n => Float) -> Float given (n|Ix) = equivalence_example = for i:(Fin 10). n_to_f (ordinal i) / 10. original = linearize myFunc equivalence_example --- Need to explicitly set up the types here, due to missing associated types. +# Need to explicitly set up the types here, due to missing associated types. 
new : (Float, (Fin 10 => Float) -> Float) = manifoldLinearize myFunc equivalence_example fst original ~~ fst new @@ -142,8 +142,8 @@ values, as long as they are not both zero, and we will identify `S{x, y}` with the point $(x, y)$ on $S^1$ given by normalizing the associated element of $\mathbb{R}^2$, that is $x$ = `x / sqrt(x*x + y*y)`, $y$ = `y / sqrt(x*x + y*y)`. --- Normalizes an S1 instance. Note this doesn't modify the point on the circle --- which it represents. +# Normalizes an S1 instance. Note this doesn't modify the point on the circle +# which it represents. def s1Normalize(s: S1) -> S1 = magnitude = sqrt $ s.x * s.x + s.y * s.y S1(s.x / magnitude, s.y / magnitude) @@ -179,7 +179,7 @@ between them: $\cos \theta = (1 - d^2) / 2$. ' Now we can define the tangent space structure on `S1`. There is no arccos function in Dex so we copy the numerical approximation from `quaterions.dx`. --- source: https://developer.nvidia.com/cg-toolkit +# source: https://developer.nvidia.com/cg-toolkit def acos(x:Float) -> Float = negate = select (x < 0.0) 1.0 0.0 x = abs x @@ -199,22 +199,22 @@ instance TangentSpace(S1, Float) sn = s1Normalize s sn' = s1Normalize s' dSquared = (sn.x - sn'.x) * (sn.x - sn'.x) + (sn.y - sn'.y) * (sn.y - sn'.y) - -- Check if we're using positive or negative inverse cos solution. - -- This is done by considering the sign of the cross product of the two - -- points in 3D space. + # Check if we're using positive or negative inverse cos solution. + # This is done by considering the sign of the cross product of the two + # points in 3D space. negate = select ((sn.x * sn'.y - sn'.x * sn.y) > 0.) 1.0 (-1.0) negate * (acos $ (1. - dSquared / 2.)) ' Check that the tangent and logarithm are actually inverse to one another: --- Convenience function to help check for equality. +# Convenience function to help check for equality. def s1ToR2(s: S1) -> (Fin 2 => Float) = [s.x, s.y] --- Two arbitrary points on the circle, not normalized. +# Two arbitrary points on the circle, not normalized. circlePointOne = S1(3.4, 0.2) circlePointTwo = S1(-1.0,1.8) --- Applying tangent mappings at circlePointTwo, to circlePointOne +# Applying tangent mappings at circlePointTwo, to circlePointOne intermediateLog: Float = tangentLogarithmMap circlePointTwo circlePointOne roundTrip = tangentExponentialMap circlePointTwo intermediateLog @@ -229,7 +229,7 @@ $- \sin \theta = -y$. def getX(s: S1) -> Float = s.x --- Make types explicit +# Make types explicit s1ManifoldGrad : ((S1) -> Float, S1) -> Float = manifoldGrad s1ManifoldGrad getX S1(0.0,1.0) ~~ -1. @@ -264,22 +264,22 @@ struct Quaternion = z:Float w:Float --- Conversion functions from a quaternion to an array of four components. +# Conversion functions from a quaternion to an array of four components. def quatAsVec(q:Quaternion) -> Fin 4 => Float = [q.x, q.y, q.z, q.w] def quatFromVec(vec:Fin 4 => Float) -> Quaternion = [x, y, z, w] = vec Quaternion(x,y,z,w) --- General normalization functions for arrays. +# General normalization functions for arrays. def squareMagnitude(x:a=>Float) -> Float given (a|Ix) = sum $ for i. x[i] * x[i] def magnitude(x:a=>Float) -> Float given (a|Ix) = sqrt $ squareMagnitude x def normalize(x:a=>Float) -> a=> Float given (a|Ix) = (1.0 / (magnitude x)) .* x --- Normalization for quaternions. +# Normalization for quaternions. def quatNormalize(q: Quaternion) -> Quaternion = quatFromVec $ normalize $ quatAsVec q --- The identity quaternion, representing a no-op rotation. 
+# The identity quaternion, representing a no-op rotation.
 quatIdent = Quaternion(0.0, 0.0, 0.0, 1.0)
@@ -345,10 +345,10 @@
 and running autodiff on the resulting linearizations may be inefficient. Taking
 higher-order derivatives will also lose further precision as the number of
 approximation terms will be reduced by each successive differentiation.

--- Convenience alias for an array of three floats.
+# Convenience alias for an array of three floats.
 R3 = (Fin 3 => Float)

--- Taylor expansion for sinc (sqrt x).
+# Taylor expansion for sinc (sqrt x).
 def sincSqrt(x : Float) -> Float =
   (1.
     - (x / 6.)
@@ -358,7 +358,7 @@ def sincSqrt(x : Float) -> Float =
     - (x * x * x * x * x/ 39916800.)
     + (x * x * x * x * x * x / 6227020800.))

--- Taylor expansion for cos (sqrt x).
+# Taylor expansion for cos (sqrt x).
 def cosSqrt(x: Float) -> Float =
   (1.
     - (x / 2.)
@@ -368,7 +368,7 @@ def cosSqrt(x: Float) -> Float =
     - (x * x * x * x * x / 3628800.)
     + (x * x * x * x * x * x / 479001600.))

--- Taylor expansion for sinc (arccos x)
+# Taylor expansion for sinc (arccos x)
 def sincAcos(x: Float) -> Float =
   (2. / pi
     + x * 4. / (pow pi 2.)
@@ -377,11 +377,11 @@ def sincAcos(x: Float) -> Float =
     + x * x * x * x * (384. - 16. * pow pi 2. - 3. * pow pi 4.) / (12. * pow pi 5.)
     - x * x * x * x * x * 8. * (pow pi 4. - 120.) / (15. * pow pi 6.))

--- Exponential function at the identity.
+# Exponential function at the identity.
 def quatExpAtIdent(v : R3) -> Quaternion =
   [x', y', z'] = v
   m2 = squareMagnitude v
-  -- This is equal to sinc (m / 2) = 2 * sin (m / 2) / m
+  # This is equal to sinc (m / 2) = 2 * sin (m / 2) / m
   sincHalfM = sincSqrt $ m2 / 4.
   Quaternion(
     x' * sincHalfM / 2.,
@@ -389,10 +389,10 @@ def quatExpAtIdent(v : R3) -> Quaternion =
     z' * sincHalfM / 2.,
     cosSqrt $ m2 / 4. )

--- Inverse to exponential function at the identity.
+# Inverse to exponential function at the identity.
 def quatExpAtIdentInv(q': Quaternion) -> R3 =
-  -- Fine to use qNormalize here, since q should be approximately unit magnitude
-  -- (or at least non-zero), so the sqrt operation can be differentiated.
+  # Fine to use quatNormalize here, since q should be approximately unit magnitude
+  # (or at least non-zero), so the sqrt operation can be differentiated.
   q = quatNormalize q'
   2. .* [q.x, q.y, q.z] / (sincAcos q.w)
@@ -411,23 +411,23 @@
 looseTolerance = 0.005
 quatLooseTolerance = quatFromVec $ for i. looseTolerance
 tangentLooseTolerance : R3 = for i. looseTolerance

--- Just arbitrary quaternions.
+# Just arbitrary quaternions.
 quatForTestOne = quatNormalize (Quaternion(0.5, -0.2, 0.4, 1.0))
 quatForTestTwo = quatNormalize (Quaternion(0.5, 0.8, -0.9, -0.4))

 allclose quatLooseTolerance default_rtol (quatExpAtIdent $ quatExpAtIdentInv quatForTestOne) (quatForTestOne)
 > True

--- Same, but taking the logarithm and exponential at a non-identity point.
+# Same, but taking the logarithm and exponential at a non-identity point.
logOfQuatForTestOne : R3 = tangentLogarithmMap quatForTestTwo quatForTestOne allclose quatLooseTolerance default_rtol (tangentExponentialMap quatForTestTwo logOfQuatForTestOne) quatForTestOne > True --- Should be equal to quatIdent +# Should be equal to quatIdent allclose quatLooseTolerance default_rtol (quatExpAtIdent $ quatExpAtIdentInv quatIdent) quatIdent > True --- Now the other way around +# Now the other way around tangentVectorForTest = [0.1, -0.2, 0.3] allclose tangentLooseTolerance default_rtol (quatExpAtIdentInv $ quatExpAtIdent tangentVectorForTest) tangentVectorForTest @@ -465,18 +465,18 @@ allclose quatLooseTolerance default_rtol perturbedRotationDirect perturbedRotati ' Now we can test the linearization and gradient functions. --- A function to get the Euler-angle roll from a quaternion. +# A function to get the Euler-angle roll from a quaternion. def quatRoll(q : Quaternion) -> Float = sinrCosp = 2. * (q.w * q.x + q.y * q.z) cosrCosp = 1. - 2. * (q.x * q.x + q.y * q.y) atan2 sinrCosp cosrCosp --- A fixed quaternion with roll = 0.5. +# A fixed quaternion with roll = 0.5. quatSomeRoll = Quaternion(0.247, 0., 0., 0.969) quatRoll quatSomeRoll > 0.4991595 --- A function to compose a given quaternion with fixed rotation. +# A function to compose a given quaternion with fixed rotation. def quatAddSomeRoll(q: Quaternion) -> Quaternion = q * quatSomeRoll ' According to our right-multiplication definition of the tangent space mappings, @@ -492,20 +492,20 @@ allclose tangentLooseTolerance default_rtol (quatAddSomeRollLinearized linearInp ' Try taking the gradient of the `quatRoll` function, at various points. --- Without associated types we have to define this, otherwise we get ambiguous --- type variables. +# Without associated types we have to define this, otherwise we get ambiguous +# type variables. def quatGrad(f: (Quaternion) -> Float, x: Quaternion) -> R3 = manifoldGrad f x --- Applying the gradient at a quaternion representing a "pure roll" rotation, we --- see that the only way to modify the roll is by adding further rotation in the --- roll direction. +# Applying the gradient at a quaternion representing a "pure roll" rotation, we +# see that the only way to modify the roll is by adding further rotation in the +# roll direction. quatGrad quatRoll quatSomeRoll > [0.9999734, 0., 0.] --- However, at other points, we can also modify the roll of the rotation by --- moving in pitch and/or yaw directions. This demonstrates that the Euler --- angles are not independent. +# However, at other points, we can also modify the roll of the rotation by +# moving in pitch and/or yaw directions. This demonstrates that the Euler +# angles are not independent. quatGrad quatRoll quatForTestOne > [1.041024, 0.5948712, 1.490116e-08] diff --git a/examples/mcmc.dx b/examples/mcmc.dx index 4acc87ce7..4644a7479 100644 --- a/examples/mcmc.dx +++ b/examples/mcmc.dx @@ -37,7 +37,7 @@ def meanAndCovariance(xs:n=>d=>Float) -> (d=>Float, d=>d=>Float) given (n|Ix, d| '## Metropolis-Hastings implementation -MHParams : Type = Float -- step size +MHParams : Type = Float # step size def mhStep( stepSize: MHParams, diff --git a/examples/mcts.dx b/examples/mcts.dx index 163b6fe55..64bad5802 100644 --- a/examples/mcts.dx +++ b/examples/mcts.dx @@ -23,7 +23,7 @@ data Node(node_ix, a) = def Tree(node_ix|Ix, a:Type) -> Type = (node_ix=>(Maybe (Node node_ix a)), node_ix) - -- Node array, and pointer to last used node. + # Node array, and pointer to last used node. 
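For readers who want the bookkeeping behind this flat-array tree spelled out, here is a Python sketch of the same encoding: a fixed-capacity node buffer plus a cursor for the last slot used. The names are illustrative, not the Dex definitions.

```python
from dataclasses import dataclass, field

@dataclass
class Node:
    stats: tuple                 # e.g. (wins, visits, level, state)
    parent: "int | None"
    children: list = field(default_factory=list)

@dataclass
class Tree:
    nodes: list                  # Node or None; fixed capacity, root at 0
    last_used: int = 0           # index of the last allocated node

    def alloc(self, node) -> int:
        # O(1) append: bump the cursor and write into the buffer.
        self.last_used += 1
        assert self.last_used < len(self.nodes), "tree capacity exceeded"
        self.nodes[self.last_used] = node
        return self.last_used
```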
def root_of(tree:Tree(node_ix, a)) -> node_ix given (node_ix|Ix, a) = 0@node_ix @@ -49,7 +49,7 @@ struct Game(state:Type, move:Type) = def NodeStats(state:Type) -> Type = (Nat, Nat, Nat, state) - -- (#winning rollouts for max/first player, #visit, level, state) + # (#winning rollouts for max/first player, #visit, level, state) def node_win_visit_stats(node:(Node node_ix (NodeStats state))) -> (Nat, Nat) given (node_ix, state) = @@ -82,7 +82,7 @@ The upper confidence bound for trees of node $n$ is $$UCT(n) = \frac{Q(n)}{N(n)} + \sqrt{2\frac{ln(N(n_p))}{N(n)}}$$ def uct(win:Nat, n:Nat, parent_n:Nat) -> Float = - -- wins stored at every node are w.r.t to the max_player + # wins stored at every node are w.r.t to the max_player (n_to_f win) / (n_to_f n) + sqrt (2.0 / (n_to_f n) * (log (n_to_f parent_n))) 'Recursively find the most promising leaf (measured by UCT) to rollout next. @@ -113,7 +113,7 @@ def mcts_select( Done () else (_, p_visits) = node_win_visit_stats node - -- minimax + # minimax is_max_player = is_even $ node_level node selector = if is_max_player then argmax else argmin best_child = selector for i. @@ -130,14 +130,14 @@ def mcts_expand( tree_ref: Ref h (Tree node_ix (NodeStats state)), node: node_ix ) -> {State h} () given (node_ix|Ix, state|Data, move, h) = - -- assume `node` is a leaf node (base) and has a non-terminal state (base_state) + # assume `node` is a leaf node (base) and has a non-terminal state (base_state) - -- extract info about the base node + # extract info about the base node base_node = from_just $ get (fst_ref tree_ref)!node Branch(base_stats, base_parent, _) = base_node (_, _, base_level, base_state) = base_stats - -- prepare child nodes + # prepare child nodes AsList(n, base_moves) = game.possible_moves base_state child_nodes = for i. new_move = base_moves[i] @@ -145,13 +145,13 @@ def mcts_expand( new_stats = (0, 0, base_level + 1, new_state) Branch new_stats (Just node) (to_list []) - -- write to the tree + # write to the tree new_node = get (snd_ref tree_ref) base_children_ix = for i:(Fin n). new_node +| (1 + ordinal i) - -- update base's children list + # update base's children list new_base_node = Branch base_stats base_parent (to_list base_children_ix) (fst_ref tree_ref)!node := Just new_base_node - -- insert every child + # insert every child for i. child_ix = base_children_ix[i] (fst_ref tree_ref)!child_ix := Just child_nodes[i] @@ -165,17 +165,17 @@ def mcts_rollout( key: Key, curr_state: state ) -> Bool given (move, state|Data)= - -- output is w.r.t. current player + # output is w.r.t. current player with_state curr_state \ref. iter \i. curr_state' = get ref if game.game_over curr_state' then - -- #win_or_lose is w.r.t. the last player + # #win_or_lose is w.r.t. the last player result = game.win_or_lose curr_state' Done $ xor (is_even i) result else AsList(_, new_moves) = game.possible_moves curr_state' - -- rollout policy: uniform random + # rollout policy: uniform random new_move = new_moves[rand_idx (hash key i)] ref := game.update curr_state' new_move Continue @@ -189,23 +189,23 @@ def mcts_backpropagate( node: node_ix, new_rollout_wins: Nat ) -> {State h} () given (move, state|Data, node_ix|Eq|Ix, h) = - -- new_rollout_wins: #wins w.r.t max player + # new_rollout_wins: #wins w.r.t max player with_state node \node_ref. iter \_. 
- -- update win and loss counts - -- -unpack + # update win and loss counts + # -unpack curr_node = from_just $ get (fst_ref tree_ref)!(get node_ref) Branch(stats, parent, children) = curr_node (wins, visits, level, state) = stats - -- -compute + # -compute new_wins = wins + new_rollout_wins - new_visits = visits + 1 -- Todo: or to add ROLLOUTS_PER_SAMPLE + new_visits = visits + 1 # Todo: or to add ROLLOUTS_PER_SAMPLE - -- -pack + # -pack new_stats = (new_wins, new_visits, level, state) (fst_ref tree_ref)!(get node_ref) := Just (Branch new_stats parent children) - -- find the parent of the current node + # find the parent of the current node if is_nothing parent || (0@node_ix) == (get node_ref) then Done () else @@ -214,7 +214,7 @@ def mcts_backpropagate( ' Wrap up the procedure for taking one sample and extend the tree. --- slow: every sample makes two copies of the tree array +# slow: every sample makes two copies of the tree array def mcts_sample( game: Game state move, key: Key, @@ -233,7 +233,7 @@ def mcts_sample( for i:(Fin ROLLOUTS_PER_SAMPLE). key' = ixkey key i leaf_result = mcts_rollout game key' $ node_state leaf - -- w.r.t. the current player + # w.r.t. the current player max_rollout_wins = b_to_n $ xor leaf_min_player leaf_result c += max_rollout_wins @@ -264,7 +264,7 @@ key = new_key 0 def test(key:Key, current_state:Nat) -> (Nat, Nat) = init_node_stats : NodeStats Count21State = (0, 0, 0, current_state) - -- (0 wins, 0 visits, level 0, game state) + # (0 wins, 0 visits, level 0, game state) start_tree = init_tree MAX_TREE_SIZE init_node_stats end_tree = fold start_tree \i:(Fin SAMPLES) tree. mcts_sample Count21 (ixkey key i) tree diff --git a/examples/md.dx b/examples/md.dx index 0581d5a93..47d047f20 100644 --- a/examples/md.dx +++ b/examples/md.dx @@ -13,11 +13,11 @@ def truncate(x: Float) -> Float = True -> -floor(-x) False -> floor x --- A mod that matches np.mod and python mod. +# A mod that matches np.mod and python mod. def pmod(x: Float, y: Float) -> Float = x - floor(x / y) * y --- A mod that matches np.fmod and c fmod. +# A mod that matches np.fmod and c fmod. def fmod(x: Float, y: Float) -> Float = x - truncate (x / y) * y @@ -114,9 +114,9 @@ def fire_descent_step( energy: Energy n dim, state: FireDescentState n dim ) -> FireDescentState n dim given (n|Ix, dim|Ix) = - -- Constants that parameterize the FIRE algorithm. - -- TODO: Thread these constants through somehow. - -- dougalm@ is there a canonical way to do this? + # Constants that parameterize the FIRE algorithm. + # TODO: Thread these constants through somehow. + # dougalm@ is there a canonical way to do this? dt_start = 0.1 dt_max = 0.4 n_min = 5 @@ -129,27 +129,27 @@ def fire_descent_step( force = \r. -(grad energy) r - -- FIRE algorithm. - -- (MkFireDescentState {R, V, F, dt, alpha, n_pos}) = state + # FIRE algorithm. + # (MkFireDescentState {R, V, F, dt, alpha, n_pos}) = state dt = state.dt alpha = state.alpha n_pos = state.n_pos - -- Do a Velocity-Verlet step. + # Do a Velocity-Verlet step. R = for i. shift state.R[i] (state.V[i] *. dt + state.F[i] *. pow dt 2) F_old = state.F F = force R V = state.V + dt * 0.5 .* (F_old + F) - -- Rescale the velocity. + # Rescale the velocity. F_norm = sqrt $ sum $ sum for i j. pow F[i, j] 2 V_norm = sqrt $ sum $ sum for i j. pow V[i, j] 2 V = V + alpha .* (F *. V_norm / (F_norm + ε) - V) - -- Check whether the force is aligned with the velocity. + # Check whether the force is aligned with the velocity. FdotV = sum $ sum for i j. 
F[i, j] * V[i, j] - -- Decide whether to increase the speed of the simulation or reduce it. + # Decide whether to increase the speed of the simulation or reduce it. (n_pos, dt, alpha) = if FdotV >= 0.0 then dt' = if n_pos >= n_min then (min (dt * f_inc) dt_max) else dt @@ -182,7 +182,7 @@ N_small = 500 d = 2 L_small = box_size_at_number_density (n_to_i N_small) 1.2 (n_to_i d) --- We will simulate in a box of this side length +# We will simulate in a box of this side length L_small > 20.41241 @@ -271,8 +271,8 @@ are close enough to each other that we wish not to neglect them. get O(1) insertion at the end, we (currently) have to give an upper bound for the list's size. --- TODO Can we encapsulate this BoundedList type as a `data` and still --- define in-place mutation operations on it? +# TODO Can we encapsulate this BoundedList type as a `data` and still +# define in-place mutation operations on it? struct BoundedList(n|Ix, a|Data) = lim : n buf : n=>a @@ -283,7 +283,7 @@ def unsafe_next_index(ix:n) -> n given (n|Ix) = def empty_bounded_list(dummy_val: a) -> BoundedList n a given (a|Data, n|Ix) = BoundedList(unsafe_from_ordinal 0, for _. dummy_val) --- The point of a `BoundedList` is O(1) push by in-place mutation +# The point of a `BoundedList` is O(1) push by in-place mutation def bd_push(ref: Ref h (BoundedList n a), x: a) -> {State h} () given (h, a|Data, n|Ix) = sz = get ref.lim if ordinal sz < size n @@ -291,9 +291,9 @@ def bd_push(ref: Ref h (BoundedList n a), x: a) -> {State h} () given (h, a|Data ref.buf!sz := x ref.lim := unsafe_next_index sz else - todo -- throw () + todo # throw () --- Once we're done pushing, we can compact a `BoundedList` into a standard `List`. +# Once we're done pushing, we can compact a `BoundedList` into a standard `List`. def as_list(lst: BoundedList n a) -> List a given (a|Data, n|Ix) = n_result = ordinal lst.lim AsList _ $ for i:(Fin n_result). lst.buf[unsafe_from_ordinal $ ordinal i] @@ -310,13 +310,13 @@ that appear in each cell in the grid. def CellTable(dim|Ix, grid_size, bucket_size, atom_ix|Data) -> Type = GridIx dim grid_size => BoundedList (Fin bucket_size) atom_ix --- Compute the cell an atom should go into. +# Compute the cell an atom should go into. def target_cell(cell_size: Float, atom: Vec dim) -> GridIx dim grid_size given (dim|Ix, grid_size) = for dim. from_ordinal $ f_to_n $ atom[dim] / cell_size --- A traversal of an atom array together with target cell. --- We abstract this because we will use it twice. +# A traversal of an atom array together with target cell. +# We abstract this because we will use it twice. def traverse_cells( grid_size, cell_size, atoms: atom_ix => Vec dim, @@ -337,14 +337,14 @@ def cell_table( traverse_cells grid_size cell_size atoms \ix cell. bd_push ref!cell ix --- This is a helper for computing the bucket size. Right now we end --- up traversing the input twice (once to compute a size and once to --- actually build the cell table); this is working around limitations --- in Dex's support for mutable lists of statically unknown length. +# This is a helper for computing the bucket size. Right now we end +# up traversing the input twice (once to compute a size and once to +# actually build the cell table); this is working around limitations +# in Dex's support for mutable lists of statically unknown length. 
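The count-then-fill strategy that the Dex helper below implements is easy to see in a NumPy sketch (illustrative names; positions are assumed to lie in [0, L)^2 so every atom maps to a valid cell):

```python
import numpy as np

def cell_list(atoms, cell_size, grid_size):
    # First pass: count atoms per cell, which fixes the bucket sizes.
    cells = np.floor(atoms / cell_size).astype(int) % grid_size
    flat = cells[:, 0] * grid_size + cells[:, 1]
    counts = np.bincount(flat, minlength=grid_size * grid_size)
    # Second pass: fill the buckets with atom indices.
    buckets = [[] for _ in range(grid_size * grid_size)]
    for i, c in enumerate(flat):
        buckets[c].append(i)
    return counts, buckets
```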
def cell_table_bucket_size( grid_size, cell_size, atoms: atom_ix => Vec dim ) -> Nat given (dim|Ix, atom_ix|Ix) = - -- TODO Use yield_accum here + # TODO Use yield_accum here bucket_sizes = yield_state (for _:(GridIx dim grid_size). 0) \ref. traverse_cells grid_size cell_size atoms \ix cell. ref!cell := get ref!cell + 1 @@ -358,7 +358,7 @@ def grid_params(L, desired_cell_size) = grid_size = unsafe_i_to_n $ f_to_i cells_per_side (cell_size, grid_size) -desired_cell_size = 1.0 -- unit interaction range +desired_cell_size = 1.0 # unit interaction range (cell_size, grid_size) = grid_params L_small desired_cell_size @@ -390,13 +390,13 @@ cell_neighbors_2d = [[-1, -1], [-1, 0], [-1, 1], [0, -1], :t cell_neighbors_2d > ((Fin 9) => (Fin 2) => Int32) --- Toroidal index offsetting +# Toroidal index offsetting def torus_offset(ix: (Fin n), offset: Int) -> (Fin n) given (n) = unsafe_from_ordinal $ unsafe_i_to_n $ mod (n_to_i (ordinal ix) + offset) (n_to_i n) --- A traversal of pairs of atoms from adjacent or equal cells in a --- cell table. +# A traversal of pairs of atoms from adjacent or equal cells in a +# cell table. def traverse_pairs( tbl: CellTable(TwoDimensions, grid_size, bucket_size, atom_ix), atoms: atom_ix => Vec TwoDimensions, @@ -431,8 +431,8 @@ def neighbor_list( if is_neighbor(atom1, atom2) then bd_push(ref, (atom1, atom2)) --- Also a helper to pre-compute how large a buffer to allocate for the --- neighbor list, by traversing the input an extra time. +# Also a helper to pre-compute how large a buffer to allocate for the +# neighbor list, by traversing the input an extra time. def count_neighbor_list_size( tbl: CellTable(TwoDimensions, grid_size, bucket_size, atom_ix), is_neighbor: (atom_ix, atom_ix) -> Bool, @@ -498,8 +498,8 @@ energy_nl(L_small, as_list res)(state_small'.R) energy (state_small'.R) > 1.230966 --- Package the above up into a function that just computes the --- neighbor list from an array of atoms. +# Package the above up into a function that just computes the +# neighbor list from an array of atoms. def just_neighbor_list( desired_cell_size, L, @@ -525,7 +525,7 @@ energy R_init_small energy $ fire_descent_step(free_shift, energy, state_small).R > 71.78407 --- A helper for short-circuiting `any` computation +# A helper for short-circuiting `any` computation def fast_any(f: (n) -> {|eff} Bool) -> {|eff} Bool given (n|Ix, eff) = iter \ct. if ct >= size n @@ -541,7 +541,7 @@ some atom moves more than `halo/2` away from where it was when the neighbor list is computed, because otherwise it remains a sound approximation. --- TODO(Issue 1133) Can't use scan with a body that has an effect? +# TODO(Issue 1133) Can't use scan with a body that has an effect? def simulate( displacement : Displacement TwoDimensions, halo_size, diff --git a/examples/mnist-nearest-neighbors.dx b/examples/mnist-nearest-neighbors.dx index 4448420f5..ab8832b5e 100644 --- a/examples/mnist-nearest-neighbors.dx +++ b/examples/mnist-nearest-neighbors.dx @@ -6,7 +6,7 @@ load dxbo "scratch/mnist.dxbo" as mnist :t mnist --- TODO: these should come from the data set itself +# TODO: these should come from the data set itself type Img = 28=>28=>Float type NTrain = 60000 type NTest = 10000 @@ -27,11 +27,11 @@ fracTrue xs = mean for i. 
float (b2i xs.i) example = asidx @NTest 123 :plotmat xsTest.example --- look at closest match in the training set +# look at closest match in the training set closest = findNearestNeighbor imgDistance xsTrain (xsTest.example) :plotmat xsTrain.closest --- Make a subset of the test set (evaluating a single test example takes ~80ms) +# Make a subset of the test set (evaluating a single test example takes ~80ms) type NTestSmall = 1000 xsTest' = slice @NTestSmall xsTest 0 ysTest' = slice @NTestSmall ysTest 0 diff --git a/examples/nn.dx b/examples/nn.dx index d8513e7c0..fe8869f7e 100644 --- a/examples/nn.dx +++ b/examples/nn.dx @@ -20,7 +20,7 @@ struct DenseParams(a|Ix, b|Ix) = weights : a=>b=>Float biases : b=>Float --- TODO: these instances are all exactly the same as those for pairs. We should derive them! +# TODO: these instances are all exactly the same as those for pairs. We should derive them! instance Add(DenseParams(a, b)) given (a|Ix, b|Ix) def (+)(p1, p2) = DenseParams( p1.weights + p2.weights , p1.biases + p2.biases) @@ -120,14 +120,14 @@ def simple(h1|Ix) = (ndense1.init k1, ndense2.init k2) Layer(forward, init) --- :t simple --- > ((h1:Type) --- > -> (v#0:(Ix h1)) --- > ?=> Layer --- > ((Fin 2) => Float32) --- > ((Fin 2) => Float32) --- > ((((Fin 2) => h1 => Float32) & (h1 => Float32)) --- > & ((h1 => (Fin 2) => Float32) & ((Fin 2) => Float32)))) +# :t simple +# > ((h1:Type) +# > -> (v#0:(Ix h1)) +# > ?=> Layer +# > ((Fin 2) => Float32) +# > ((Fin 2) => Float32) +# > ((((Fin 2) => h1 => Float32) & (h1 => Float32)) +# > & ((h1 => (Fin 2) => Float32) & ((Fin 2) => Float32)))) 'Train a multiclass classifier with minibatch SGD '`minibatch * minibatches = batch` @@ -155,7 +155,7 @@ def trainClass( loss (get params, loss) --- todo : Do I have to give minibatches as a param? +# todo : Do I have to give minibatches as a param? simple_model = simple (Fin 10) (all_params,losses) = trainClass simple_model xs (for i. (y[i] @ (Fin 2))) (Fin 500) (Fin 100) (Fin 1) @@ -198,20 +198,20 @@ def lenet(h1|Ix, h2|Ix, h3|Ix) = (ncnn1.init k1, ncnn2.init k2, ndense1.init k3, ndense2.init k4) Layer(forward, init) --- :t lenet --- > ((h1:Type) --- > -> (h2:Type) --- > -> (h3:Type) --- > -> (v#0:(Ix h1)) --- > ?=> (v#1:(Ix h2)) --- > ?=> (v#2:(Ix h3)) --- > ?=> Layer --- > ((Fin 1) => (Fin 28) => (Fin 28) => Float32) --- > ((Fin 10) => Float32) --- > (((h1 => (Fin 1) => (Fin 3) => (Fin 3) => Float32) & (h1 => Float32)) --- > & (((h2 => h1 => (Fin 3) => (Fin 3) => Float32) & (h2 => Float32)) --- > & ((((h2 & (Fin 7 & Fin 7)) => h3 => Float32) & (h3 => Float32)) --- > & ((h3 => (Fin 10) => Float32) & ((Fin 10) => Float32)))))) +# :t lenet +# > ((h1:Type) +# > -> (h2:Type) +# > -> (h3:Type) +# > -> (v#0:(Ix h1)) +# > ?=> (v#1:(Ix h2)) +# > ?=> (v#2:(Ix h3)) +# > ?=> Layer +# > ((Fin 1) => (Fin 28) => (Fin 28) => Float32) +# > ((Fin 10) => Float32) +# > (((h1 => (Fin 1) => (Fin 3) => (Fin 3) => Float32) & (h1 => Float32)) +# > & (((h2 => h1 => (Fin 3) => (Fin 3) => Float32) & (h2 => Float32)) +# > & ((((h2 & (Fin 7 & Fin 7)) => h3 => Float32) & (h3 => Float32)) +# > & ((h3 => (Fin 10) => Float32) & ((Fin 10) => Float32)))))) '## Data Loading @@ -233,39 +233,39 @@ def pixel(x:Char) -> Float32 = ' `wget https://github.com/srush/learns-dex/raw/main/labels.bin` --- getIm : Batch=>Image = --- AsList(_, im) = unsafe_io \. read_file "examples/mnist.bin" --- raw = unsafe_cast_table(to=Full, im) --- for b: Batch c:(Fin 1) i:(Fin W) j:(Fin H). 
--- pixel raw[(ordinal (b, i, j)) @ Full] +# getIm : Batch=>Image = +# AsList(_, im) = unsafe_io \. read_file "examples/mnist.bin" +# raw = unsafe_cast_table(to=Full, im) +# for b: Batch c:(Fin 1) i:(Fin W) j:(Fin H). +# pixel raw[(ordinal (b, i, j)) @ Full] --- getLabel : Batch => Class = --- AsList(_, im2) = unsafe_io \. read_file "examples/labels.bin" --- r = unsafe_cast_table(to=Batch, im2) --- for i. (w8_to_n r[i] @ Class) +# getLabel : Batch => Class = +# AsList(_, im2) = unsafe_io \. read_file "examples/labels.bin" +# r = unsafe_cast_table(to=Batch, im2) +# for i. (w8_to_n r[i] @ Class) --- ims = getIm --- labels = getLabel +# ims = getIm +# labels = getLabel --- small_ims = for i: (Fin 10). ims[(ordinal i)@_] --- small_labels = for i: (Fin 10). labels[(ordinal i)@_] +# small_ims = for i: (Fin 10). ims[(ordinal i)@_] +# small_labels = for i: (Fin 10). labels[(ordinal i)@_] --- :p small_labels +# :p small_labels --- Epochs = (Fin 5) --- Minibatches = (Fin 1) --- Minibatch = (Fin 10) +# Epochs = (Fin 5) +# Minibatches = (Fin 1) +# Minibatch = (Fin 10) --- :t ims[2@_] +# :t ims[2@_] --- model = lenet (Fin 1) (Fin 1) (Fin 20) --- init_param = model.init $ new_key 0 --- :p model.forward(init_param, ims[2@Batch]) +# model = lenet (Fin 1) (Fin 1) (Fin 20) +# init_param = model.init $ new_key 0 +# :p model.forward(init_param, ims[2@Batch]) ' Sanity check --- :t (grad ((\x param. sum (forward model param x)) (ims.(2@_)))) init_param --- --- (all_params', losses') = trainClass model small_ims small_labels Epochs Minibatch Minibatches --- --- :p losses' +# :t (grad ((\x param. sum (forward model param x)) (ims.(2@_)))) init_param +# +# (all_params', losses') = trainClass model small_ims small_labels Epochs Minibatch Minibatches +# +# :p losses' diff --git a/examples/ode-integrator.dx b/examples/ode-integrator.dx index 97336c724..1618aad93 100644 --- a/examples/ode-integrator.dx +++ b/examples/ode-integrator.dx @@ -10,19 +10,19 @@ import plot Time = Float --- Should this go in the prelude? +# Should this go in the prelude? def length(x: d=>Float) -> Float given (d|Ix) = sqrt $ sum for i. sq x[i] def (./)(x: d=>Float, y: d=>Float) -> d=>Float given (d|Ix) = for i. x[i] / y[i] def fit_4th_order_polynomial(z0:v, z1:v, z_mid:v, dz0:v, dz1:v, dt:Time) -> (Fin 5)=>v given (v|VSpace) = - -- dz0 and dz1 are gradient evaluations. + # dz0 and dz1 are gradient evaluations. a = -2. * dt .* dz0 + 2. * dt .* dz1 - 8. .* z0 - 8. .* z1 + 16. .* z_mid b = 5. * dt .* dz0 - 3. * dt .* dz1 + 18. .* z0 + 14. .* z1 - 32. .* z_mid c = -4. * dt .* dz0 + dt .* dz1 - 11. .* z0 - 5. .* z1 + 16. .* z_mid d = dt .* dz0 e = z0 - [a, b, c, d, e] -- polynomial coefficients. + [a, b, c, d, e] # polynomial coefficients. dps_c_mid = [ 6025192743. /30085553152. /2., 0., 51252292925. /65400821598. /2., @@ -31,7 +31,7 @@ dps_c_mid = [ def interp_fit_dopri(z0:v, z1:v, k:(Fin 7)=>v, dt:Time) -> (Fin 5)=>v given (v|VSpace) = - -- Fit a polynomial to the results of a Runge-Kutta step. + # Fit a polynomial to the results of a Runge-Kutta step. z_mid = z0 + dt .* (dot dps_c_mid k) fit_4th_order_polynomial z0 z1 z_mid k[0@_] k[1@_] dt @@ -40,8 +40,8 @@ def initial_step_size( t0:Time, z0:d=>Float, order:Int, rtol:Float, atol:Float, f0:d=>Float ) -> Time given (d|Ix) = - -- Algorithm from: E. Hairer, S. P. Norsett G. Wanner, - -- Solving Ordinary Differential Equations I: Nonstiff Problems, Sec. II.4. + # Algorithm from: E. Hairer, S. P. Norsett G. Wanner, + # Solving Ordinary Differential Equations I: Nonstiff Problems, Sec. II.4. 
 scale = for i. atol + ((abs z0[i]) * rtol)
   d0 = length (z0 ./ scale)
   d1 = length (f0 ./ scale)
@@ -54,10 +54,10 @@ def initial_step_size(
   h1 = select ((d1 <= 1.0e-15) && (d2 <= 1.0e-15)) left right
   min (100. * h0) h1

--- Dopri5 Butcher tableaux
+# Dopri5 Butcher tableaux
 alpha = [1. / 5., 3. / 10., 4. / 5., 8. / 9., 1., 1.]

-beta : ((i:Fin 6)=>(..i)=>Float) = -- triangular array type.
+beta : ((i:Fin 6)=>(..i)=>Float) = # triangular array type.
   [coerce_table _ [1. / 5.],
    coerce_table _ [3. / 40., 9. / 40.],
    coerce_table _ [44. / 45., -56. / 15., 32.0 / 9.],
@@ -119,15 +119,15 @@ def odeint(
     func: (d=>Float, Time) -> d=>Float,
     z0: d=>Float,
     t0: Time,
     times: n=>Time
   ) -> n=>d=>Float given (n|Ix, d|Ix) =
-  -- Adaptive stepsize (Dormand-Prince) Runge-Kutta odeint implementation.
-  -- Args:
-  --   func: time derivative of the solution z at time t.
-  --   z0: the initial value for the state.
-  --   t: times for evaluation. values must be strictly increasing.
-  -- Returns:
-  --   Values of the solution at each time point in times.
-  rtol = 1.4e-8 -- relative local error tolerance for solver.
-  atol = 1.4e-8 -- absolute local error tolerance for solver.
+  # Adaptive stepsize (Dormand-Prince) Runge-Kutta odeint implementation.
+  # Args:
+  #   func: time derivative of the solution z at time t.
+  #   z0: the initial value for the state.
+  #   times: evaluation times. Values must be strictly increasing.
+  # Returns:
+  #   Values of the solution at each time point in times.
+  rtol = 1.4e-8 # relative local error tolerance for the solver.
+  atol = 1.4e-8 # absolute local error tolerance for the solver.
   max_iters = 10000
   V : Type = d=>Float
   S : Type = ODEState V
@@ -148,7 +148,7 @@ def odeint(
       stay_state = ODEState( z, state.f, state.t, new_dt, state.last_t , state.interp_coeffs)
       select (ratio <= 1.0) move_state stay_state

-    -- Take steps until we pass target_t
+    # Take steps until we pass target_t
     new_state = yield_state init_carry \stateRef. iter \_.
       state = get stateRef
@@ -158,14 +158,14 @@ def odeint(
         Continue
       else Done ()

-    -- Interpolate to the target time.
+    # Interpolate to the target time.
     relative_output_time = (target_t - new_state.last_t) / (new_state.t - new_state.last_t)
     z_target = evalpoly new_state.interp_coeffs relative_output_time
     (new_state, z_target)

   f0 = func z0 t0
   init_step = initial_step_size func t0 z0 4 rtol atol f0
-  init_interp_coeff = zero -- dummy vals
+  init_interp_coeff = zero # dummy vals
   init_carry = ODEState(z0, f0, t0, init_step, t0, init_interp_coeff)
   snd $ scan init_carry integrate_to_next_time
@@ -183,7 +183,7 @@
 approx_e = odeint myDyn z0 t0 t1
 exact_e = [[exp 1.0]]

-:p (approx_e - exact_e) -- amount of numerical error
+:p (approx_e - exact_e) # amount of numerical error
 > [[0.001894474]]

 times = linspace (Fin 100) 0.00001 1.0
diff --git a/examples/particle-filter.dx b/examples/particle-filter.dx
index e149da347..b1b6b5682 100644
--- a/examples/particle-filter.dx
+++ b/examples/particle-filter.dx
@@ -60,4 +60,4 @@ truth = for i:(Fin timesteps).

 filtered = particleFilter num_particles _ gaussModel mean (map snd truth) (new_key 0)

--- :p for i. (truth[i], filtered[i])
+# :p for i. (truth[i], filtered[i])
diff --git a/examples/particle-swarm-optimizer.dx b/examples/particle-swarm-optimizer.dx
index 45eb37dc4..3a4f7ba75 100644
--- a/examples/particle-swarm-optimizer.dx
+++ b/examples/particle-swarm-optimizer.dx
@@ -54,12 +54,12 @@
 We have **arguments**:

 '**Returns**: the optimal point found within the bounds on the input domain of `f`.
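For reference, the update rule that `optimize` iterates is the standard particle-swarm step, shown here as a minimal Python sketch. The function name is hypothetical and the parameter names mirror the momentum/gRate/pRate arguments documented below.

```python
import numpy as np

def pso_step(x, v, pbest, gbest, momentum, g_rate, p_rate, rng):
    # x, v: particle positions and velocities; pbest: each particle's
    # best-seen position; gbest: the swarm's best-seen position.
    r_p, r_g = rng.random(x.shape), rng.random(x.shape)
    v = momentum * v + p_rate * r_p * (pbest - x) + g_rate * r_g * (gbest - x)
    return x + v, v

# usage: x, v = pso_step(x, v, pbest, gbest, 0.8, 0.5, 0.5,
#                        np.random.default_rng(0))
```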
 def optimize(
-    np':Nat,                      -- number of particles
-    niter:Nat,                    -- number of iteration
-    key:Key,                      -- random seed
-    f:(d=>Float)->Float,          -- function to optimize
-    bounds:(d=>Float, d=>Float),  -- bounds
-    params:(Float, Float, Float)  -- momentum/pRate/gRate
+    np':Nat,                      # number of particles
+    niter:Nat,                    # number of iterations
+    key:Key,                      # random seed
+    f:(d=>Float)->Float,          # function to optimize
+    bounds:(d=>Float, d=>Float),  # bounds
+    params:(Float, Float, Float)  # momentum/gRate/pRate
     ) -> d=>Float given (d|Ix) =
   (lb,ub) = bounds
   (momentum, gRate, pRate) = params
@@ -96,7 +96,7 @@ def optimize(
   (dc0,(finalGscore, finalGloc),dc1,dc2,dc3) = res
   finalGloc

-'---
+'#-
 Let's see how it goes. Run it for more iterations and the result should improve.
 Which it indeed does.
diff --git a/examples/raytrace.dx b/examples/raytrace.dx
index 993d6203c..c4aa278e5 100644
--- a/examples/raytrace.dx
+++ b/examples/raytrace.dx
@@ -16,7 +16,7 @@ def Mat(n:Nat, m:Nat) -> Type = Fin n => Fin m => Float
 def relu(x:Float) -> Float = max x 0.0
 def length(x: d=>Float) -> Float given (d|Ix) = sqrt $ sum for i:d. sq x[i]

--- TODO: make a newtype for normal vectors
+# TODO: make a newtype for normal vectors
 def normalize(x: d=>Float) -> d=>Float given (d|Ix) = x / (length x)
 def directionAndLength(x: d=>Float) -> (d=>Float, Float) given (d|Ix) =
   l = length x
@@ -39,7 +39,7 @@ def cross(a:Vec 3, b:Vec 3) -> Vec 3 =
   [b1, b2, b3] = b
   [a2 * b3 - a3 * b2, a3 * b1 - a1 * b3, a1 * b2 - a2 * b1]

--- TODO: Use `data Color = Red | Green | Blue` and ADTs for index sets
+# TODO: Use `data Color = Red | Green | Blue` and ADTs for index sets
 data Image =
   MkImage(height:Nat, width:Nat, Fin height => Fin width => Color)

@@ -47,7 +47,7 @@ xHat : Vec 3 = [1., 0., 0.]
 yHat : Vec 3 = [0., 1., 0.]
 zHat : Vec 3 = [0., 0., 1.]

-Angle = Float -- angle in radians
+Angle = Float # angle in radians

 def rotateX(p:Vec 3, angle:Angle) -> Vec 3 =
   c = cos angle
@@ -85,7 +85,7 @@ def sampleCosineWeightedHemisphere(normal: Vec 3, k:Key) -> Vec 3 =

 Distance = Float
 Position = Vec 3
-Direction = Vec 3 -- Should be normalized. TODO: use a newtype wrapper
+Direction = Vec 3 # Should be normalized. TODO: use a newtype wrapper

 BlockHalfWidths = Vec 3
 Radius = Float
@@ -106,7 +106,7 @@ struct OrientedSurface =

 data Object =
   PassiveObject(ObjectGeom, Surface)
-  -- position, half-width, intensity (assumed to point down)
+  # position, half-width, intensity (assumed to point down)
   Light(Position, Float, Radiance)

 struct Ray =
@@ -120,7 +120,7 @@ struct Params =
   maxBounces : Nat
   shareSeed : Bool

--- TODO: use a list instead, once they work
+# TODO: use a list instead, once they work
 struct Scene(n|Ix) =
   objects : n=>Object

@@ -128,15 +128,15 @@ def sampleReflection(surf:OrientedSurface, ray:Ray, k:Key) -> Ray =
   nor = surf.normal
   newDir = case surf.surface of
     Matte _ -> sampleCosineWeightedHemisphere nor k
-    -- TODO: surely there's some change-of-solid-angle correction we need to
-    -- consider when reflecting off a curved surface.
+    # TODO: surely there's some change-of-solid-angle correction we need to
+    # consider when reflecting off a curved surface.
     Mirror -> ray.dir - (2.0 * dot ray.dir nor) .* nor
   Ray(ray.origin, newDir)

 def probReflection(surf:OrientedSurface, _:Ray, out_ray:Ray) -> Float =
   case surf.surface of
     Matte _ -> relu $ dot surf.normal out_ray.dir
-    Mirror -> 0.0 -- TODO: this should be a delta function of some sort
+    Mirror -> 0.0 # TODO: this should be a delta function of some sort

 def applyFilter(filter:Filter, radiance:Radiance) -> Radiance =
   for i.
filter[i] * radiance[i] @@ -169,21 +169,21 @@ def calcNormal(obj:Object, pos:Position) -> Direction = grad(\p:Position. sdObject(p, obj)) pos | normalize data RayMarchResult = - -- incident ray, surface normal, surface properties + # incident ray, surface normal, surface properties HitObj(Ray, OrientedSurface) HitLight(Radiance) - -- Could refine with failure reason (beyond horizon, failed to converge etc) + # Could refine with failure reason (beyond horizon, failed to converge etc) HitNothing def raymarch(scene:Scene n, ray:Ray) -> RayMarchResult given (n|Ix) = maxIters : Nat = 100 tol = 0.01 - startLength = 10.0 * tol -- trying to escape the current surface + startLength = 10.0 * tol # trying to escape the current surface with_state (10.0 * tol) \rayLength. bounded_iter maxIters HitNothing \_. rayPos = ray.origin + get rayLength .* ray.dir (obj, d) = sdScene scene $ rayPos - -- 0.9 ensures we come close to the surface but don't touch it + # 0.9 ensures we come close to the surface but don't touch it rayLength := get rayLength + 0.9 * d case d < tol of False -> Continue @@ -191,11 +191,11 @@ def raymarch(scene:Scene n, ray:Ray) -> RayMarchResult given (n|Ix) = surfNorm = calcNormal obj rayPos case positiveProjection ray.dir surfNorm of True -> - -- Oops, we didn't escape the surface we're leaving.. - -- (Is there a more standard way to do this?) + # Oops, we didn't escape the surface we're leaving.. + # (Is there a more standard way to do this?) Continue False -> - -- We made it! + # We made it! Done $ case obj of PassiveObject(_, surf) -> newRay = Ray(rayPos, ray.dir) @@ -226,7 +226,7 @@ def sampleLightRadiance( (dirToLight, distToLight) = directionAndLength $ lightPos + sampleSquare hw k - inRay.origin if positiveProjection dirToLight osurf.normal then - -- light on this far side of current surface + # light on this far side of current surface fracSolidAngle = (relu $ dot dirToLight yHat) * sq hw / (pi * sq distToLight) outRay = Ray(inRay.origin, dirToLight) coeff = fracSolidAngle * probReflection osurf inRay outRay @@ -241,7 +241,7 @@ def trace(params:Params, scene:Scene n, initRay:Ray, k:Key) -> Color given (n|Ix case raymarch scene $ get ray of HitNothing -> Done () HitLight intensity -> - if i == 0 then radiance += intensity -- TODO: scale etc + if i == 0 then radiance += intensity # TODO: scale etc Done () HitObj(incidentRay, osurf) -> [k1, k2] = split_key(n=2, hash k i) @@ -251,16 +251,16 @@ def trace(params:Params, scene:Scene n, initRay:Ray, k:Key) -> Color given (n|Ix radiance += applyFilter (get filter) lightRadiance Continue --- Assumes we're looking towards -z. +# Assumes we're looking towards -z. 
struct Camera = numPix : Nat - pos : Position -- pinhole position - halfWidth : Float -- sensor half-width - sensorDist : Float -- pinhole-sensor distance + pos : Position # pinhole position + halfWidth : Float # sensor half-width + sensorDist : Float # pinhole-sensor distance --- TODO: might be better with an anonymous dependent pair for the result +# TODO: might be better with an anonymous dependent pair for the result def cameraRays(n:Nat, camera:Camera) -> Fin n => Fin n => ((Key) -> Ray) = - -- images indexed from top-left + # images indexed from top-left halfWidth = camera.halfWidth pixHalfWidth = halfWidth / n_to_f n ys = reverse $ linspace (Fin n) (neg halfWidth) halfWidth @@ -310,13 +310,13 @@ defaultCamera = Camera 250 (10.0 .* zHat) 0.3 1.0 testCamera = Camera 10 (10.0 .* zHat) 0.3 1.0 --- We change to a small num pix here to reduce the compute needed for tests +# We change to a small num pix here to reduce the compute needed for tests params = defaultParams camera = if dex_test_mode() then testCamera else defaultCamera --- %time +# %time MkImage(_, _, image) = takePicture params theScene camera :html imshow image > diff --git a/examples/regression.dx b/examples/regression.dx index 905f6bd86..e3855f264 100644 --- a/examples/regression.dx +++ b/examples/regression.dx @@ -8,7 +8,7 @@ struct SolverState(n|Ix) = r : n=>Float p : n=>Float --- Conjugate gradients solver +# Conjugate gradients solver def solve(mat:m=>m=>Float, b:m=>Float) -> m=>Float given (m|Ix) = x0 = zero :: m=>Float r0 = b - (mat **. x0) diff --git a/examples/rejection-sampler.dx b/examples/rejection-sampler.dx index c60e69014..2069cf0c8 100644 --- a/examples/rejection-sampler.dx +++ b/examples/rejection-sampler.dx @@ -10,7 +10,7 @@ def rejectionSample(try: (Key) -> Maybe a, k:Key) -> a given (a|Data) = Prob = Float LogProb = Float --- log probability density of a Binomial distribution +# log probability density of a Binomial distribution def logBinomialProb(n':Nat, p:Prob, counts':Nat) -> LogProb = n = n_to_f n' counts = n_to_f counts' @@ -35,13 +35,13 @@ def trySampleBinomial(n:Nat, p:Prob, k:Key) -> Maybe Nat = 'We test the implementation by sampling from a Binomial distribution with 10 trials and success probability 0.4. --- parameters +# parameters n = 10 p = 0.4 numSamples = 5000 k0 = new_key 0 --- TODO: use currying sugar (or even better, effects) +# TODO: use currying sugar (or even better, effects) rejectionSamples = rand_vec numSamples (\k. rejectionSample (\k'. trySampleBinomial n p k') k) k0 :p slice rejectionSamples 0 $ Fin 10 diff --git a/examples/schrodinger.dx b/examples/schrodinger.dx index 8d1c09c34..53221455d 100644 --- a/examples/schrodinger.dx +++ b/examples/schrodinger.dx @@ -19,45 +19,45 @@ import png '## Parameters --- The number of points on the grid in each dimension e.g. (41x41 for the 2d case) +# The number of points on the grid in each dimension e.g. (41x41 for the 2d case) gridSize = 41 D = Fin gridSize --- Space discretization +# Space discretization dx = 1. 
/ n_to_f gridSize --- Time discretization (solution unstable if much higher) +# Time discretization (solution unstable if much higher) dt = 0.000001 --- The number of dt timesteps to simulate +# The number of dt timesteps to simulate Steps = Fin 155000 --- The number of frames to output in each GIF animation +# The number of frames to output in each GIF animation gifFrames = 100 '## Helpers --- Define i_=sqrt(-1) in a way that won't clash with indices named 'i' +# Define i_=sqrt(-1) in a way that won't clash with indices named 'i' i_ = Complex(0., 1.) --- Shorthand conversion +# Shorthand conversion def fToC(x:Float) -> Complex = Complex(x, 0.) --- Translate from index to Float representation of real space --- Simulates the range [0.0, 1.0] in each dimension +# Translate from index to Float representation of real space +# Simulates the range [0.0, 1.0] in each dimension def ixToF(i:n) -> Float given (n|Ix) = (n_to_f (ordinal i)) / (n_to_f gridSize - 1.) --- Helper to generate zero-delay GIF animations +# Helper to generate zero-delay GIF animations def animate(imgs:t=>n=>m=>(Fin 3)=>Float) -> Html given (t|Ix, n|Ix, m|Ix) = img_to_html $ pngs_to_gif 0 $ map img_to_png imgs --- Cuts an array down to a smaller dimension. --- Useful for cutting out animation frames for a faster GIF. +# Cuts an array down to a smaller dimension. +# Useful for cutting out animation frames for a faster GIF. def cut(x:n=>a) -> m=>a given (n|Ix, m|Ix, a) = s = idiv (size n) (size m) for i:m. x[from_ordinal $ s * ordinal i] --- Converts a given 2D matrix to a greyscale image. +# Converts a given 2D matrix to a greyscale image. def toImg(xs:n=>m=>Float) -> (n=>m=>Fin 3=>Float) given (n|Ix, m|Ix) = scale = 1. / maximum (map maximum xs) for h. @@ -67,19 +67,19 @@ def sqr(x:a) -> a given (a|Mul) = x * x '## Computing $\psi_{t+\Delta t}$ --- Are the given indices at the bounds of the grid +# Are the given indices at the bounds of the grid def atBounds(i:n, j:m) -> Bool given (n|Ix, m|Ix) = (ordinal i == 0 || n_to_i (ordinal i) == (n_to_i gridSize - 1) || ordinal j == 0 || n_to_i (ordinal j) == (n_to_i gridSize - 1)) --- Operators for performing unsafe ordinal arithmetic +# Operators for performing unsafe ordinal arithmetic def (+!)(i:n, off:Int) -> n given (n|Ix) = unsafe_from_ordinal $ unsafe_i_to_n (n_to_i (ordinal i) + off) def (-!)(i:n, off:Int) -> n given (n|Ix) = i +! (-1 * off) --- Run a single forward-step to compute psi_(t+1) from psi_t with the given potential. +# Run a single forward-step to compute psi_(t+1) from psi_t with the given potential. def step(psi:m=>n=>Complex, v:m=>n=>Float) -> m=>n=>Complex given (m|Ix, n|Ix) = for i j. if atBounds i j @@ -94,11 +94,11 @@ def step(psi:m=>n=>Complex, v:m=>n=>Float) -> m=>n=>Complex given (m|Ix, n|Ix) = * (psi_l + psi_u + (fToC 4. * psi[i, j]) + psi_r + psi_d)) - (i_ * (fToC $ dt * v[i, j]) * psi[i, j])) --- Computes time evolution of wavefunction. +# Computes time evolution of wavefunction. def evolve(psi:m=>n=>Complex, v:m=>n=>Float) -> Steps=>m=>n=>Complex given (m|Ix, n|Ix) = scan' psi (\i x. step x v) --- Compute the Born rule probability distribution by |psi|**2 +# Compute the Born rule probability distribution by |psi|**2 def pdf(psi:n=>m=>Complex) -> n=>m=>Float given (n|Ix, m|Ix) = pdf = for i j. complex_mag (sqr psi[i, j]) scale = sum (map sum pdf) @@ -106,7 +106,7 @@ def pdf(psi:n=>m=>Complex) -> n=>m=>Float given (n|Ix, m|Ix) = '## Examples --- Generate gaussian initial conditions with zero at the bounds. 
+# Generate gaussian initial conditions with zero at the bounds.
 def gaussian(u:(Float, Float), o:(Float, Float)) -> (n=>m=>Complex) given (n|Ix, m|Ix) =
   (ux, uy) = u
   (ox, oy) = o
@@ -118,7 +118,7 @@ def gaussian(u:(Float, Float), o:(Float, Float)) -> (n=>m=>Complex) given (n|Ix,
     then fToC 0.
     else fToC $ exp $ -0.5 * (x2 / sqr ox + y2 / sqr oy)

--- Create an animation of the evolution of the given wavefunction under the given potential.
+# Create an animation of the evolution of the given wavefunction under the given potential.
 def run(psi:m=>n=>Complex, v:m=>n=>Float) -> Html given (n|Ix, m|Ix) =
   animate (map toImg (cut (map pdf (evolve psi v)))::(Fin gifFrames=>_))
diff --git a/examples/sgd.dx b/examples/sgd.dx
index fcf991c8a..298d726a4 100644
--- a/examples/sgd.dx
+++ b/examples/sgd.dx
@@ -13,7 +13,7 @@ def sgd_step(
     new_x = x - step_size .* new_m
     (new_x, new_m)

--- In-place optimization loop.
+# In-place optimization loop.
 def sgd(
   step_size:Float,
   decay:Float,
diff --git a/examples/simplex.dx b/examples/simplex.dx
index ecc93b8b8..339915835 100644
--- a/examples/simplex.dx
+++ b/examples/simplex.dx
@@ -18,8 +18,8 @@ import plot

 '# General utility functions.

--- Hopefully these instances can one day be generated automatically for
--- all record types.
+# Hopefully these instances can one day be generated automatically for
+# all record types.
 instance [Eq cons, Eq a] Eq {constraints: cons | objective: Unit | extra:a}
   (==) = \r1 r2. case r1 of
@@ -73,13 +73,13 @@ instance [Eq a, Eq b] Eq {artificials: a | realvars: b}

 def argmax_suchthat [Ord o] (xs:n=>o) (cond:o->Bool): n =
-  -- Assumes that there's at least one element of xs
-  -- that satisfies the condition.
+  # Assumes that there's at least one element of xs
+  # that satisfies the condition.
   cmp = \x y. (not $ cond y) || (x > y && cond x)
   argscan cmp xs

 def argmax_suchthat_table [Ord o] (xs:n=>o) (cond:n=>Bool): n =
-  -- This variant takes a table of pre-evaluated conditions.
+  # This variant takes a table of pre-evaluated conditions.
   cmp = \i j. (not $ cond.j) || (xs.i > xs.j && cond.i)
   argscan cmp (for i. i)

@@ -94,7 +94,7 @@ data SimplexResult n:Type =

 data ConstraintType =
   LessThan
   GreaterThan
-  -- Todo: Add equality constraints
+  # Todo: Add equality constraints

 data ObjectiveType =
   Maximize
@@ -105,12 +105,12 @@ def Constraint (n:Type) : Type =

 def RowIx (cons:Type) (a:Type) : Type =
   {constraints:cons | objective:Unit | extra:a}
-  -- extra is a field that could be left unused or (Fin 1)
+  # extra is a field that could be left unused or (Fin 1)

 def ColIx (cons:Type) (vars:Type) : Type =
   {variables:vars | slacks:cons | value:Unit}
-  -- Todo: we assigned a slack variable for every constraints
-  -- even though equality constraints do not need them.
+  # Todo: we assigned a slack variable for every constraint,
+  # even though equality constraints do not need them.

 Void : Type = Fin 0

@@ -119,15 +119,15 @@ def AbsTableau (cons:Type) (vars:Type) (extra:Type) : Type =
   rowtype = RowIx cons extra
   tabletype = rowtype => coltype => Float
   (cons => coltype & vars => coltype & tabletype)
-  -- (basics, nonbasics, table)
+  # (basics, nonbasics, table)

 def Tableau (cons:Type) (vars:Type) : Type =
-  AbsTableau cons vars Void -- no need for extra row
+  AbsTableau cons vars Void # no need for extra row

 def FeasibilityTableau (cons:Type) (vars:Type) : Type =
-  -- The feasibility tableau has extra columns for the artificial variables.
-  -- Todo: use a List of artificial variables, since not each constraint
-  -- actually requires an artificial variable.
+  # The feasibility tableau has extra columns for the artificial variables.
+  # Todo: use a List of artificial variables, since not each constraint
+  # actually requires an artificial variable.
   allvars = {realvars: vars | artificials: cons}
   AbsTableau cons allvars Unit

@@ -189,9 +189,9 @@ The type of this normal table is:

 def cononicalize_constraint (inequality: Constraint vars):
     (vars=>Float & Float) =
-  -- Turn all constraints into less-than inequalities,
-  -- so after this step we don't have to track which direction
-  -- the inequality was originallly.
+  # Turn all constraints into less-than inequalities,
+  # so after this step we don't have to track which direction
+  # the inequality was originally.
  ({coeffs=coeffs, offset=offset, conType=conType}) = inequality
  case conType of
    LessThan -> (coeffs, offset)
@@ -208,13 +208,13 @@ def build_tableau [Eq cons]
     (coeffs, offset) = cononicalize_constraint ineqs.i
     for j. case j of
       {|variables=j|} -> coeffs.j
-      {|slacks=j|} -> select (i == j) 1.0 0.0 -- identity matrix.
+      {|slacks=j|} -> select (i == j) 1.0 0.0 # identity matrix.
       {|value=j|} -> offset
   {|objective=()|} -> for j. case j of
       {|variables=j|} -> case objType of
         Maximize -> objective.j
         Minimize -> -objective.j
-      {|slacks=j|} -> 0.0 -- slack variables never contribute to loss.
+      {|slacks=j|} -> 0.0 # slack variables never contribute to loss.
      {|value=j|} -> 0.0
  basics = for r:cons. {|slacks=r|}
@@ -236,9 +236,9 @@ def negate_constraints_with_negative_offsets [Eq cons]

 def build_feasibility_tableau [Eq cons]
     (tableau: (Tableau cons vars))
     : FeasibilityTableau cons vars =
-  -- Builds an expanded tableau, with an extra variable for each constraint.
-  -- If these new variables can be driven to zero,
-  -- while satisfying all constraints, then the original problem is feasible.
+  # Builds an expanded tableau, with an extra variable for each constraint.
+  # If these new variables can be driven to zero,
+  # while satisfying all constraints, then the original problem is feasible.
   tableau' = negate_constraints_with_negative_offsets tableau
   (basics, nonbasics, table) = tableau'
@@ -288,8 +288,8 @@ def isFeasible (tableau: FeasibilityTableau cons vars) : Bool =

 ' #### Choose `pivot` and apply

 def chooseColumn (tableau: (AbsTableau cons vars extra)) : ColIx cons vars =
-  -- An unchecked assumption is that there exists
-  -- at least one nonbasic variable with a positive objective coefficient.
+  # An unchecked assumption is that there exists
+  # at least one nonbasic variable with a positive objective coefficient.
   (_, nonbasics, table) = tableau
   objective_coeffs = for c:vars. table.{|objective=()|}.(nonbasics.c)
@@ -299,7 +299,7 @@ def chooseColumn (tableau: (AbsTableau cons vars extra)) : ColIx cons vars =

 def findPivotIndex
     (tableau: (AbsTableau cons vars extra))
     : Maybe (RowIx cons extra & ColIx cons vars) =
-  -- Chooses row and column to pivot around next.
+  # Chooses row and column to pivot around next.
   (_, _, table) = tableau
   pivotcol = chooseColumn tableau
   unbounded = all for r.
@@ -308,7 +308,7 @@ def findPivotIndex
   case unbounded of
     True -> Nothing
     False ->
-      -- pick row index minimizing the quotient amongst positive quotients
+      # pick row index minimizing the quotient amongst positive quotients
       quotients = for r. -table.{|objective|extra|...r|}.{|value=()|} / table.{|objective|extra|...r|}.pivotcol
       cond = for r.
        table.{|objective|extra|...r|}.pivotcol > 0.0
@@ -330,11 +330,11 @@ def pivot [Eq cons, Eq extra, Eq vars]

   i' = from_just $ (match_with #?constraints) pivotrow

-  -- Swap in new basic variable.
+  # Swap in new basic variable.
   newBasics = yield_state basics \ref.
     ref!i' := pivotcol

-  -- Move old basic variable to nonbasics.
+  # Move old basic variable to nonbasics.
   newNonbasics = for i. case nonbasics.i == pivotcol of
     True -> basics.i'
     False -> nonbasics.i
@@ -362,7 +362,7 @@ def find_non_artificial_pivcol [Eq cons, Eq vars]

 def pivot_artificials [Eq cons, Eq vars]
     (tableau: FeasibilityTableau cons vars) : (FeasibilityTableau cons vars) =
-  -- Pivot out all the artificial variables out of the basis
+  # Pivot all the artificial variables out of the basis
   yield_state tableau \tRef.
     tableau = get tRef
     (basics, _, _) = tableau
@@ -395,9 +395,9 @@ def deduce_nonbasics [Eq cons, Eq vars]
   all_columns = for i. i
   nonBasicsList = filter not_in_basics all_columns

-  -- We know that there will be one nonbasic for every variable,
-  -- but don't know how to tell the compiler that, so
-  -- need to use an unsafe cast.
+  # We know that there will be one nonbasic for every variable,
+  # but don't know how to tell the compiler that, so
+  # need to use an unsafe cast.
   (AsList n nonBasicsTable) = nonBasicsList
   unsafe_cast_table vars nonBasicsTable

@@ -408,20 +408,20 @@ def eliminate_artificials [Eq cons, Eq vars]

   (basics, nonbasics, table) = tableau

-  -- copy to new tableau without the artificial columns
+  # copy to new tableau without the artificial columns
   newBasics = for r:cons. case basics.r of
     {|slacks=r|} -> {|slacks=r|}
     {|variables=r|} -> case r of
       {|realvars=r|} -> {|variables=r|}
       {|artificials=r|} -> error("leftover artificial variable")

-  -- keep real vars only
+  # keep real vars only
   t = for i. for j:(ColIx cons vars).
     case j of
       {|slacks=j|} -> table.i.{|slacks=j|}
       {|value=j|} -> table.i.{|value=()|}
       {|variables=j|} -> table.i.{|variables={|realvars=j|}|}

-  -- replace objective row
+  # replace objective row
   t2 = for i:(RowIx cons Void). case i of
     {|constraints=i|} -> t.{|constraints=i|}
     {|objective=()|} -> t.{|extra=()|}
@@ -446,7 +446,7 @@ def constraints_satisfied (constraints: m=>Constraint n) (x: n=>Float) : Bool =
       case conType of
         LessThan -> dot coeffs x <= offset
         GreaterThan -> dot coeffs x >= offset
-  -- also enforces that all vars are >= 0.0
+  # also enforces that all vars are >= 0.0
   constraints_ok && all for i:n. x.i >= 0.0 || x.i ~~ 0.0

 def extractSolution (tableau: AbsTableau cons vars extra) : vars=>Float =
   (basics, _, table) = tableau
   yield_accum (AddMonoid Float) \varRef.
     for v:cons.
       var = basics.v
-      case var of -- If this basic variable is an original variable, write it.
+      case var of # If this basic variable is an original variable, write it.
         {|variables=var|} -> varRef!var += table.{|constraints=v|}.{|value=()|}
         {|slacks=var|} -> ()

@@ -467,12 +467,12 @@ def simplex [Eq cons, Eq vars]
     (objective: vars=>Float)
     (objType:ObjectiveType)
     : SimplexResult vars =
-  -- Note: Also enforces that all vars are >= 0.0
-  -- Operates in two phases:
-  -- First, finds a point within the simplex (or return infeasible)
-  -- Second, optimizes within the simplex.
+  # Note: Also enforces that all vars are >= 0.0
+  # Operates in two phases:
+  # First, finds a point within the simplex (or returns infeasible)
+  # Second, optimizes within the simplex.

-  -- Find a feasible initial solution.
+  # Find a feasible initial solution.
init_tableau = build_tableau ineqs objective objType initFeasibilityTableau = build_feasibility_tableau init_tableau feasibilityAns = yield_state initFeasibilityTableau \tableauRef. iter \_. @@ -492,9 +492,9 @@ def simplex [Eq cons, Eq vars] True -> tf = feasibilityTableauToNormalTableau feasibilityAns - -- Optimize within the simplex. + # Optimize within the simplex. with_state tf \tableauRef. iter \_. - tableau = get tableauRef -- Whole tableau will be overwritten. + tableau = get tableauRef # Whole tableau will be overwritten. isopt = isOptimal tableau case isopt of True -> Done $ Solution $ extractSolution tableau @@ -507,7 +507,7 @@ def simplex [Eq cons, Eq vars] '# Tests --- An impossible problem. +# An impossible problem. impossible_constraints = for i. angles = linspace (Fin 3) 0.0 (2.0 * pi) cs = [-(sin (angles.i)), -(cos (angles.i))] @@ -518,7 +518,7 @@ impossible_constraints = for i. initft = build_feasibility_tableau $ build_tableau impossible_constraints [1.0, 1.0] Maximize --- An unbounded problem. +# An unbounded problem. unbounded_constraints = angles = linspace (Fin 3) 0.0 1.0 for i. @@ -529,9 +529,9 @@ unbounded_constraints = :p simplex unbounded_constraints [1.0, 1.0] Maximize > Unbounded --- A feasible problem: Example 3.5 in --- Introduction to Linear Optimization --- by D. Bertsimas and J. Tsitsiklis +# A feasible problem: Example 3.5 in +# Introduction to Linear Optimization +# by D. Bertsimas and J. Tsitsiklis obj = [10.0, 12.0, 12.0] a = [[1., 2., 2.], @@ -544,7 +544,7 @@ textbook_constraints = > (Solution [4., 4., 4.]) --- A feasible problem from Waterloo notes +# A feasible problem from Waterloo notes obj' = [1., -1., 1.] a' = [[ 2., -1., 2.], @@ -578,8 +578,8 @@ dup_constraints = > (Solution [0.5, 0.]) --- differs from dup_constraints only in the ordering --- artificial variables are in the basis of FeasibilityAns +# differs from dup_constraints only in the ordering +# artificial variables are in the basis of FeasibilityAns dup_constraints2 = [{coeffs=equal_a.(1@_), offset=equal_b.(1@_), conType=GreaterThan}, {coeffs=equal_a.(0@_), offset=equal_b.(0@_), conType=LessThan}, @@ -589,7 +589,7 @@ dup_constraints2 = > (Solution [0.5, 0.]) --- the first constraint is always satisfied +# the first constraint is always satisfied redundant_obj = [1.0, 1.0] redundant_constraints = [{coeffs=[0.0, 0.0], offset=0.0, conType=LessThan}, @@ -615,7 +615,7 @@ def pixToReal (px:n) (py:m): (Fin 2)=>Float = def draw_point [Eq n, Eq m] (image: n=>m=>(Fin 3)=>Float) (row:n) (col:m) : n=>m=>(Fin 3)=>Float = - -- todo: in-place drawing + # todo: in-place drawing for j:n. case row == j of True -> for k:m. case col == k of True -> [1.0, 0.0, 0.0] @@ -625,19 +625,19 @@ def draw_point [Eq n, Eq m] '### Pentagon example --- XXX This used to typecheck, now fails, so coefficients are hardcoded below. +# XXX This used to typecheck, now fails, so coefficients are hardcoded below. --- def polygon_constraints (num_sides: Int) (radius:Float) --- (center:(Fin 2)=>Float) : (Fin num_sides)=>Constraint (Fin 2) = --- angles = linspace (Fin num_sides) 0.0 (2.0 * pi) --- for i. --- cs = [-(sin (0.23 + angles.i)), --- -(cos (0.23 + angles.i))] --- {coeffs=cs, --- offset=radius + (dot cs center), --- conType=LessThan} --- --- pentagon = polygon_constraints 5 0.2 [0.5, 0.5] +# def polygon_constraints (num_sides: Int) (radius:Float) +# (center:(Fin 2)=>Float) : (Fin num_sides)=>Constraint (Fin 2) = +# angles = linspace (Fin num_sides) 0.0 (2.0 * pi) +# for i. 
+# cs = [-(sin (0.23 + angles.i)),
+# -(cos (0.23 + angles.i))]
+# {coeffs=cs,
+# offset=radius + (dot cs center),
+# conType=LessThan}
+#
+# pentagon = polygon_constraints 5 0.2 [0.5, 0.5]

 pentagon = [
   {coeffs = [-0.227978, -0.973666], conType = LessThan, offset = -0.400822}
@@ -654,12 +654,12 @@ image = for i:(Fin 150). for j:(Fin 150).
     False -> [0.0, 0.0, 0.0]

--- Find optimum by simplex method
+# Find optimum by simplex method
 simplexSolution = simplex pentagon objective Maximize
 :p simplexSolution
 > (Solution [0.55636, 0.740704])

--- Plot objective function and location of answer (red dot).
+# Plot objective function and location of answer (red dot).
 simplexLocation = case simplexSolution of Solution ans -> ans
 (x, y) : (Fin 150 & Fin 150) = float2pix simplexLocation
diff --git a/examples/tutorial-old.dx b/examples/tutorial-old.dx
index 3822158c9..7e5dbd372 100644
--- a/examples/tutorial-old.dx
+++ b/examples/tutorial-old.dx
@@ -24,19 +24,19 @@ See the README for installation instructions.
   indentation rules.

 :p
-   x = 1. -- let binding
-   y = ( -- let binding of a nested let expression
+   x = 1. # let binding
+   y = ( # let binding of a nested let expression
      z = 2.
      z + 1.)
-   x + y -- body of let expression
+   x + y # body of let expression
 > 4.0

 'We also have lambda, tuple construction and (tuple) pattern-matching (destructuring):

 :p
-   (x, y) = (1., 2.) -- let binding (with pattern-matching), tuple construction
-   f = lam z. x + z * y -- let binding of a lambda function
-   f 1. + f 2. -- body of let expression
+   (x, y) = (1., 2.) # let binding (with pattern-matching), tuple construction
+   f = lam z. x + z * y # let binding of a lambda function
+   f 1. + f 2. # body of let expression
 > 8.0

 'We use white space for function application (we write `f x y` instead of `f(x, y)`).
@@ -298,7 +298,7 @@ For example, adding two vectors of the same size:
 :p
   addVec : n=>Float -> n=>Float -> n=>Float
   addVec x y = for i. x.i + y.i
-  --
+  #
   addVec [1.0, 2.0] [0.1, 0.2]
 > [1.1, 2.2]

@@ -307,7 +307,7 @@ For example, adding two vectors of the same size:
 :p
   myTranspose : n=>m=>a -> m=>n=>a
   myTranspose x = for i j. x.j.i
-  --
+  #
   myTranspose [[1,2,3],[10,20,30]]
 > [[1, 10], [2, 20], [3, 30]]

@@ -316,7 +316,7 @@ For example, adding two vectors of the same size:
 :p
   diag : n=>n=>a -> n=>a
   diag x = for i. x.i.i
-  --
+  #
   diag [[1,2],[10,20]]
 > [1, 20]

@@ -326,7 +326,7 @@ For example, adding two vectors of the same size:
  * visible type application
  * existentials, packing/unpacking
  * input and output
- * compiler internals -- normalized IR, imperative IR, LLVM
+ * compiler internals: normalized IR, imperative IR, LLVM
  * currying tables
  * mention prelude
  * splittable PRNG
diff --git a/examples/tutorial.dx b/examples/tutorial.dx
index 1e91f9827..eaa6f5b72 100644
--- a/examples/tutorial.dx
+++ b/examples/tutorial.dx
@@ -266,7 +266,7 @@ def table_add5'(x : n => Float32) -> n => Float32 given (n|Ix) =
 def transFloat(x : m => n => Float32) -> n => m => Float32 given (n|Ix, m|Ix) =
   for i. for j. x[j, i]

-' The types of the input and the output are not the same -- Dex will detect situations
+' The types of the input and the output are not the same; Dex will detect situations
   where a transposition was omitted or inserted by accident.

' We can even make it more generic by abstracting over the value type.
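As a sketch of that generalization, the `Float32`s in `transFloat` above can be replaced by a type variable (the name `trans` is illustrative, not taken from the tutorial):

# def trans(x : m => n => v) -> n => m => v given (n|Ix, m|Ix, v) =
#   for i. for j. x[j, i]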
@@ -327,14 +327,14 @@ def pixel(x:Char) -> Float32 = False -> r def getIm() -> Batch => Image = - -- File is unsigned bytes offset with 16 starting bytes + # File is unsigned bytes offset with 16 starting bytes AsList(_, im) = unsafe_io \. read_file "examples/t10k-images-idx3-ubyte" raw = unsafe_cast_table(for i:Full. im[ordinal(i) + 16 @ _]) for b: Batch i j. pixel raw[ordinal((b, i, j)) @ Full] def getLabel() -> Batch => Class = - -- File is unsigned bytes offset with 8 starting bytes + # File is unsigned bytes offset with 8 starting bytes AsList(_, lab) = unsafe_io \. read_file "examples/t10k-labels-idx1-ubyte" r = unsafe_cast_table(for i:Batch. lab[ordinal(i) + 8 @ _]) for i. (w8_to_n r[i] @ Class) @@ -422,13 +422,13 @@ im2 : Fin 4 => Fin 4 => Fin 7 => Fin 7 => Float32 = imtile(ims[0@_]) and `:=` assignment). Here's what it looks like: def table_mean(x : n => Float32) -> Float32 given (n|Ix) = - -- acc = 0 + # acc = 0 with_state(0.0) \acc. - -- for i in range(len(x)) + # for i in range(len(x)) for i. - -- acc = acc + x[i] + # acc = acc + x[i] acc := get(acc) + x[i] - -- return acc / len(x) + # return acc / len(x) get(acc) / n_to_f(size n) @@ -477,7 +477,7 @@ with_state(10) \state. "let-binding"). This one is built into the language and can be used anywhere. :t for i:Height. - -- Bind a temporary variable `temp`, as an example. + # Bind a temporary variable `temp`, as an example. temp = ordinal(i) + 10 for j:Width. temp @@ -576,8 +576,8 @@ instance VSpace((a, b)) given (a|VSpace, b|VSpace) def table_mean'(x : n => v) -> v given (n|Ix, v|VSpace) = with_state(zero) \acc. for i. - acc := get(acc) + x[i] -- `Add` requirement - get(acc) / n_to_f(size(n)) -- `VSpace` requirement + acc := get(acc) + x[i] # `Add` requirement + get(acc) / n_to_f(size(n)) # `VSpace` requirement table_mean'([0.1, 0.5, 0.9]) > 0.5 diff --git a/examples/vega-plotting.dx b/examples/vega-plotting.dx index 2209fef42..eda00dd53 100644 --- a/examples/vega-plotting.dx +++ b/examples/vega-plotting.dx @@ -9,7 +9,7 @@ ' ## JSON Implementation def join (joiner: List a) (lists:n=>(List a)) : List a = - -- Join together lists with an intermediary joiner + # Join together lists with an intermediary joiner concat $ for i. case ordinal i == (size n - 1) of True -> lists.i @@ -17,7 +17,7 @@ def join (joiner: List a) (lists:n=>(List a)) : List a = ' A serialized JSON Value --- TODO - once Dex supports recursive ADT JValue becomes Value. +# TODO - once Dex supports recursive ADT JValue becomes Value. data JValue = AsJValue String ' Simple JSON Data Type @@ -91,7 +91,7 @@ instance [ToJSON v] ToJSON (List (String & v)) def wrapCol [ToJSON d] (iso: Iso a (d & c)) (x:n=>a) : n=> Value = - -- Helper function. Returns JSON of a column of a record + # Helper function. Returns JSON of a column of a record for i. toJSON $ get_at iso x.i ' ## Declarative Graph Grammars @@ -210,17 +210,17 @@ data VLChart row col v = instance [ToJSON v] ToJSON (VLChart r c v) toJSON = \ x. (AsVLDescriptor mark opts df) = x - -- Make the mark + # Make the mark (WithOpts mtype options) = mark jmark = ("mark", mergeOpts options [("type", show mtype)]) - -- Make the data + # Make the data jdf = toJSON $ for row : r. toJSON $ for col : c. ("col" <> (show $ ordinal col), toJSON (get_at #rows df.col).row) jdata = ("data", toJSON [("values", jdf)]) - -- Make the encodings + # Make the encodings jencodings = toJSON $ concat $ for col : c. (AsList v encopts) = get_at #encodings df.col AsList v $ for f. 
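For intuition, the `jmark`, `jdata`, and encoding pieces assembled in the `ToJSON` instance above correspond roughly to a Vega-Lite spec of the following shape (an abbreviated, illustrative sketch; the exact keys beyond "mark" and "data" are assumptions based on the Vega-Lite format, not output produced by the patch):

# { "mark": {"type": ...}, "data": {"values": [{"col0": ..., "col1": ...}, ...]}, "encoding": {...} }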
diff --git a/lib/complex.dx b/lib/complex.dx index b7c544ef0..3b2b40191 100644 --- a/lib/complex.dx +++ b/lib/complex.dx @@ -32,7 +32,7 @@ instance VSpace(Complex) instance Arbitrary(Complex) def arb(key) = Complex(randn key, randn key) --- Todo: Hook up to (/) operator. Might require two-parameter VSpace. +# Todo: Hook up to (/) operator. Might require two-parameter VSpace. def complex_division(x:Complex, y:Complex) -> Complex = (a, b, c, d) = (x.re, x.im, y.re, y.im) Complex((a * c + b * d) / (c * c + d * d), (b * c - a * d) / (c * c + d * d)) @@ -75,8 +75,8 @@ instance Fractional(Complex) def divide(x, y) = complex_division(x, y) def complex_floor(c:Complex) -> Complex = - -- from "Complex Floor" by Eugene McDonnell - -- https://www.jsoftware.com/papers/eem/complexfloor.htm + # from "Complex Floor" by Eugene McDonnell + # https://www.jsoftware.com/papers/eem/complexfloor.htm fr = floor c.re fi = floor c.im x = c.re - fr @@ -91,8 +91,8 @@ def complex_ceil(x:Complex) -> Complex = -(complex_floor (-x)) def complex_round(x:Complex) -> Complex = complex_floor Complex(0.5, 0.0) def complex_lgamma(x:Complex) -> Complex = - todo -- This one is pretty hairy. - -- See https://cs.uwaterloo.ca/research/tr/1994/23/CS-94-23.pdf + todo # This one is pretty hairy. + # See https://cs.uwaterloo.ca/research/tr/1994/23/CS-94-23.pdf def complex_erf(x:Complex) -> Complex = todo diff --git a/lib/diagram.dx b/lib/diagram.dx index 46fffd400..7b5c52b7c 100644 --- a/lib/diagram.dx +++ b/lib/diagram.dx @@ -9,7 +9,7 @@ struct Point = data Geom = PointGeom Circle(Float) - Rectangle(Float, Float) -- width, height + Rectangle(Float, Float) # width, height Line(Point) Text(String) @@ -34,7 +34,7 @@ struct GeomStyle = default_geom_style = GeomStyle Nothing (Just black) 1 --- TODO: consider sharing attributes among a set of objects for efficiency +# TODO: consider sharing attributes among a set of objects for efficiency Object : Type = (GeomStyle, Point, Geom) struct Diagram = val : (List Object) @@ -46,9 +46,9 @@ instance Monoid(Diagram) def concat_diagrams(diagrams:n=>Diagram) -> Diagram given (n|Ix) = Diagram $ concat $ each diagrams \d. d.val --- TODO: arbitrary affine transformations. Our current representation of --- rectangles and circles means we can only do scale/flip/rotate90. --- Should we use lenses/isomorphisms for these instead? +# TODO: arbitrary affine transformations. Our current representation of +# rectangles and circles means we can only do scale/flip/rotate90. +# Should we use lenses/isomorphisms for these instead? def apply_transformation( transformPoint: (Point) -> Point, transformGeom: (Geom) -> Geom, @@ -97,8 +97,8 @@ def update_geom(update: (GeomStyle) -> GeomStyle, d:Diagram) -> Diagram = ( attr, point, geoms) = obj (update attr, point, geoms) --- TODO: these would be better if we had field-access-based ref projections, so we could --- write `geom~fillColor := c` instead of unpack and packing explicitly. +# TODO: these would be better if we had field-access-based ref projections, so we could +# write `geom~fillColor := c` instead of unpack and packing explicitly. def set_fill_color(d:Diagram, c:HtmlColor) -> Diagram = update_geom (\s. GeomStyle (Just c) s.strokeColor s.strokeWidth) d @@ -165,7 +165,7 @@ def attr_string(attr:GeomStyle) -> String = @noinline def render_geom(attr:GeomStyle, p:Point, geom:Geom) -> String = - -- For things that are solid. SVG says they have fill=stroke. + # For things that are solid. SVG says they have fill=stroke. 
+  # For things that are solid. SVG says they have fill=stroke.
   solidAttr = GeomStyle attr.strokeColor attr.strokeColor attr.strokeWidth
   groupEle = \attr:GeomStyle s:String. tag_brackets_attr "g" (attr_string attr) s
   case geom of
@@ -192,8 +192,8 @@ def render_geom(attr:GeomStyle, p:Point, geom:Geom) -> String =
       textEle = \s:String.
         tag_brackets_attr("text",
           ("x" <=> p.x <.>
            "y" <=> p.y <.>
-           "text-anchor" <=> "middle" <.> -- horizontal center
-           "dominant-baseline" <=> "middle" -- vertical center
+           "text-anchor" <=> "middle" <.> # horizontal center
+           "dominant-baseline" <=> "middle" # vertical center
           ), s)
       groupEle solidAttr $ textEle content
@@ -208,8 +208,8 @@ def compute_bounds(d:Diagram) -> BoundingBox =
         PointGeom -> 0.0
         Circle r -> op r
         Rectangle(w, h) -> op $ (sel Point(w,h))/2.0
-        Line q -> op $ max 0.0 $ op $ sel q -- equivalent to either `-min(0, sel q)` or `max(0.0, sel q)` depending on op
-        Text _ -> 0.0 -- no size info possible as it is scale invariant
+        Line q -> op $ max 0.0 $ op $ sel q # equivalent to either `-min(0, sel q)` or `max(0.0, sel q)` depending on op
+        Text _ -> 0.0 # no size info possible as it is scale invariant

   AsList(_, objs) = d.val
   (
@@ -257,7 +257,7 @@ def move_y(d:Diagram, y:Float) -> Diagram = move_xy(d, 0.0, y)
      | set_fill_color blue | set_stroke_color red)
   <> (circle 5.0 | move_xy(40.0, 41.0))
   <> (rect 10.0 20.0 | move_xy(5.0, 10.0) | set_stroke_color red)
-  <> (text "DexLang" | move_xy(30.0, 10.0) | set_stroke_color green)
+  <> (text "types are good" | move_xy(30.0, 10.0) | set_stroke_color green)
   <> (point_diagram | move_xy(15.0, 5.0) | set_stroke_color red)
   )

 render_scaled_svg mydiagram
@@ -269,7 +269,7 @@ def move_y(d:Diagram, y:Float) -> Diagram = move_xy(d, 0.0, y)
 concentric_diagram : Diagram = (
   (rect 2.0 2.0 | set_fill_color red)
   <> (circle 1.0 | set_fill_color blue)
-  <> (text "DexLang" | set_stroke_color white)
+  <> (text "types are good" | set_stroke_color white)
   ) | move_xy(5.0, 5.0)
 render_scaled_svg concentric_diagram
 >
diff --git a/lib/fft.dx b/lib/fft.dx
index df6fece04..87b28c0fd 100644
--- a/lib/fft.dx
+++ b/lib/fft.dx
@@ -12,7 +12,7 @@ import complex

 '## Helper functions

 def odd_sized_palindrome(mid:a, seq:n=>a) -> ((n `Either` () `Either` n)=>a) given (a, n|Ix) =
-  -- Turns sequence 12345 into 543212345.
+  # Turns sequence 12345 into 543212345.
   for i. case i of
     Left i -> case i of
@@ -27,10 +27,10 @@ data FTDirection =
   InverseFT

 def butterfly_ixs(j':halfn, pow2:Nat) -> (n, n, n, n) given (halfn|Ix, n|Ix) =
-  -- Re-index at a finer frequency.
-  -- halfn must have half the size of n.
-  -- For explanation, see https://en.wikipedia.org/wiki/Butterfly_diagram
-  -- Note: with fancier index sets, this might be replacable by reshapes.
+  # Re-index at a finer frequency.
+  # halfn must have half the size of n.
+  # For explanation, see https://en.wikipedia.org/wiki/Butterfly_diagram
+  # Note: with fancier index sets, this might be replaceable by reshapes.
   j = ordinal j'
   k = ((idiv j pow2) * pow2 * 2) + mod j pow2
   left_write_ix = unsafe_from_ordinal k
@@ -44,7 +44,7 @@ def power_of_2_fft(
     direction: FTDirection,
     x: ((Fin log2_n)=>(Fin 2))=>Complex
     ) -> ((Fin log2_n)=>(Fin 2))=>Complex given (log2_n:Nat) =
-  -- (Fin n)=>(Fin 2) has 2^n elements, so (Fin log2_n)=>(Fin 2) has exactly n.
+  # (Fin n)=>(Fin 2) has 2^n elements, so (Fin log2_n)=>(Fin 2) has exactly n.
dir_const = case direction of ForwardFT -> -pi @@ -56,17 +56,17 @@ def power_of_2_fft( xRef = snd_ref combRef ipow2 = get ipow2Ref - log2_half_n = unsafe_nat_diff log2_n 1 -- TODO: use `i` as a proof that log2_n > 0 + log2_half_n = unsafe_nat_diff log2_n 1 # TODO: use `i` as a proof that log2_n > 0 xRef := yield_accum (AddMonoid Complex) \bufRef. - for j:((Fin log2_half_n)=>(Fin 2)). -- Executes in parallel. + for j:((Fin log2_half_n)=>(Fin 2)). # Executes in parallel. (left_read_ix, right_read_ix, left_write_ix, right_write_ix) = butterfly_ixs j ipow2 - -- Read one element from the last buffer, scaled. + # Read one element from the last buffer, scaled. angle = dir_const * (n_to_f $ mod (ordinal j) ipow2) / n_to_f ipow2 v = (get xRef!right_read_ix) * (Complex (cos angle) (sin angle)) - -- Add and subtract it to the relevant places in the new buffer. + # Add and subtract it to the relevant places in the new buffer. bufRef!left_write_ix += (get (xRef!left_read_ix)) + v bufRef!right_write_ix += (get (xRef!left_read_ix)) - v ipow2Ref := ipow2 * 2 @@ -87,8 +87,8 @@ def convolve_complex( u:n=>Complex, v:m=>Complex ) -> (Either n m=>Complex) given (n|Ix, m|Ix) = - -- Convolve by pointwise multiplication in the Fourier domain. - -- Pad and convert to Fourier domain. + # Convolve by pointwise multiplication in the Fourier domain. + # Pad and convert to Fourier domain. min_convolve_size = (size n + size m) -| 1 log_working_size = nextpow2 min_convolve_size u_padded = pad_to_power_of_2 log_working_size zero u @@ -96,10 +96,10 @@ def convolve_complex( spectral_u = power_of_2_fft ForwardFT u_padded spectral_v = power_of_2_fft ForwardFT v_padded - -- Pointwise multiply. + # Pointwise multiply. spectral_conv = for i. spectral_u[i] * spectral_v[i] - -- Convert back to primal domain and undo padding. + # Convert back to primal domain and undo padding. padded_conv = power_of_2_fft InverseFT spectral_conv slice padded_conv 0 (Either n m) @@ -110,9 +110,9 @@ def convolve(u:n=>Float, v:m=>Float) -> (Either n m =>Float) given (n|Ix, m|Ix) for i. ans[i].re def bluestein(x: n=>Complex) -> n=>Complex given (n|Ix) = - -- Bluestein's algorithm. - -- Converts the general FFT into a convolution, - -- which is then solved with calls to a power-of-2 FFT. + # Bluestein's algorithm. + # Converts the general FFT into a convolution, + # which is then solved with calls to a power-of-2 FFT. im = Complex 0.0 1.0 wks = for i. i_squared = n_to_f $ sq $ ordinal i diff --git a/lib/linalg.dx b/lib/linalg.dx index a22ad7398..037f692a0 100644 --- a/lib/linalg.dx +++ b/lib/linalg.dx @@ -22,7 +22,7 @@ def lower_tri_identity() -> LowerTriMat n a given(n|Ix, a|Add|Mul) = '## Representing inequalities between indices --- These would be unnecessary if there were syntax for dependent pairs. +# These would be unnecessary if there were syntax for dependent pairs. data LowerTriIx(n) = MkLowerTriIx( i:n, j:(..i)) data UpperTriIx(n) = MkUpperTriIx( i:n, j:(i..)) data LowerTriIxExc(n) = MkLowerTriIxExc(i:n, j:(.. LowerTriIx n given (n|Ix) = j' = inject j i' = unsafe_project i @@ -180,28 +180,28 @@ def relax_ee(i:(p<..), j:(.. LowerTriIxExc n given(n|Ix, p:n) = def skew_symmetric_prod(lower: (i:n)=>(..Float, y: n=>v) -> n=>v given (n|Ix, v|VSpace) = upper = transpose_lower_to_upper_no_diag lower - -- We could probably fuse these two for loops. + # We could probably fuse these two for loops. lower_prod = for i. sum for j. lower[i,j] .* y[inject j] upper_prod = for i. sum for j. 
        upper[i,j] .* y[inject j]
   lower_prod - upper_prod

 def forward_substitute(a:LowerTriMat n Float, b:n=>v) -> n=>v given (n|Ix, v|VSpace) =
-  -- Solves lower triangular linear system (inverse a) **. b
+  # Solves lower triangular linear system (inverse a) **. b
   yield_state zero \sRef.
     for i:n.
-      s = sum for k:(..i). -- dot product
+      s = sum for k:(..i). # dot product
         a[i,k] .* get sRef!(inject k)
       sRef!i := (b[i] - s) / (lower_tri_diag a)[i]

 def backward_substitute(a:UpperTriMat n Float, b:n=>v) -> n=>v given (n|Ix, v|VSpace) =
-  -- Solves upper triangular linear system (inverse a) **. b
+  # Solves upper triangular linear system (inverse a) **. b
   yield_state zero \sRef.
     rof i:n.
-      s = sum for k:(i..). -- dot product
+      s = sum for k:(i..). # dot product
         a[i,k] .* get sRef!(inject k)
       sRef!i := (b[i] - s) / (upper_tri_diag a)[i]

--- Todo: get rid of these by writing a dependent indexing (!) operator.
+# Todo: get rid of these by writing a dependent indexing (!) operator.
 def lower_tri_mat(ref:Ref h (LowerTriMat a b), i:a, j:(..i)) -> Ref h b given (a|Data|Ix, b|Data, h) =
   d = %indexRef(ref, i)
   d!j
@@ -230,10 +230,10 @@ def chol(x:n=>n=>Float) -> LowerTriMat n Float given (n|Ix) =

 '## Permutations

--- The sign of the determinant of a permutation is either 1.0 or -1.0
+# The sign of the determinant of a permutation is either 1.0 or -1.0
 PermutationSign = Float

--- TODO: use a struct once we can handle reference projections of struct fields
+# TODO: use a struct once we can handle reference projections of struct fields
 struct Permutation(n|Ix) =
   p : n=>n
   sign : PermutationSign
@@ -253,7 +253,7 @@ def swap_in_place(perm: Ref h (Permutation n), i:n, j:n) -> {State h} () given (

 '## LU decomposition functions

 def pivotize(a:n=>n=>Float) -> Permutation n given (n|Ix) =
-  -- Gives a row permutation that makes Gaussian elimination more stable.
+  # Gives a row permutation that makes Gaussian elimination more stable.
   yield_state identity_permutation \permRef.
     for j:n.
       row_with_largest = argmax for i:(j..). abs a[inject(to=n, i),j]
@@ -261,8 +261,8 @@ def pivotize(a:n=>n=>Float) -> Permutation n given (n|Ix) =
       swap_in_place permRef j (inject(to=n, row_with_largest))

 def lu(a: n=>n=>Float) -> (LowerTriMat n Float, UpperTriMat n Float, Permutation n) given (n|Ix) =
-  -- Computes lower, upper, and permuntation matrices from a square matrix,
-  -- such that apply_permutation permutation a == lower ** upper.
+  # Computes lower, upper, and permutation matrices from a square matrix,
+  # such that apply_permutation permutation a == lower ** upper.
   permutation = pivotize a
   a = apply_permutation permutation a

@@ -273,23 +273,23 @@ def lu(a: n=>n=>Float) -> (LowerTriMat n Float, UpperTriMat n Float, Permutation
   lRef = fst_ref stateRef
   uRef = snd_ref stateRef

-  -- For reference, here's code to compute the LU decomposition
-  -- with standard flat matrices:
-  -- for j:n.
-  --   for i:(..j).
-  --     i = inject _ i
-  --     s = sum for k':(..i).
-  --       k = inject _ k'
-  --       (get uRef!k!j) * (get lRef!i!k)
-  --     uRef!i!j := a.i.j - s
-
-  --   for i':(j<..).
-  --     i = inject _ i'
-  --     s = sum for k':(..j).
-  --       k = inject _ k'
-  --       (get uRef!k!j) * (get lRef!i!k)
-  --     lRef!i!j := (a.i.j - s) / (get uRef!j!j)
-  -- for i:n. ()
+  # For reference, here's code to compute the LU decomposition
+  # with standard flat matrices:
+  # for j:n.
+  #   for i:(..j).
+  #     i = inject _ i
+  #     s = sum for k':(..i).
+  #       k = inject _ k'
+  #       (get uRef!k!j) * (get lRef!i!k)
+  #     uRef!i!j := a.i.j - s

+  #   for i':(j<..).
+  #     i = inject _ i'
+  #     s = sum for k':(..j).
+ # k = inject _ k' + # (get uRef!k!j) * (get lRef!i!k) + # lRef!i!j := (a.i.j - s) / (get uRef!j!j) + # for i:n. () for j:n. for i:(..j). @@ -320,10 +320,10 @@ def lu(a: n=>n=>Float) -> (LowerTriMat n Float, UpperTriMat n Float, Permutation '## General linear algebra functions def solve(a:n=>n=>Float, b:n=>v) -> n=>v given (n|Ix, v|VSpace) = - -- There's a small speedup possible by exploiting the fact - -- that l always has ones on the diagonal. It would just require a - -- custom forward_substitute routine that doesn't divide - -- by the diagonal entries. + # There's a small speedup possible by exploiting the fact + # that l always has ones on the diagonal. It would just require a + # custom forward_substitute routine that doesn't divide + # by the diagonal entries. (l, u, perm) = lu a b' = apply_permutation perm b y = forward_substitute l b' diff --git a/lib/netpbm.dx b/lib/netpbm.dx index ee28bd7f0..a5cd6043f 100644 --- a/lib/netpbm.dx +++ b/lib/netpbm.dx @@ -8,12 +8,12 @@ data Image = MkImage(rows:Nat, cols:Nat, pixels:(Fin rows => Fin cols => Fin 3 => Word8)) parse_p6 : Parser Image = MkParser \h. - -- Loads a raw PPM file in P6 format. - -- The header will look something like: - -- P6 - -- 220 220 (width, height) - -- 255 (max color value) - -- followed by a flat block of height x width x 3 chars. + # Loads a raw PPM file in P6 format. + # The header will look something like: + # P6 + # 220 220 (width, height) + # 255 (max color value) + # followed by a flat block of height x width x 3 chars. parse h $ p_char 'P' parse h $ p_char '6' parse h $ parse_any diff --git a/lib/parser.dx b/lib/parser.dx index 805cd71dc..12b82bdc0 100644 --- a/lib/parser.dx +++ b/lib/parser.dx @@ -9,8 +9,8 @@ def from_ordinal_exc(n|Ix, i:Nat) -> {Except} n = then unsafe_from_ordinal i else throw() --- TODO: allow this to happen in-place --- TODO: if it takes too long to make that possible, start with a bounded version +# TODO: allow this to happen in-place +# TODO: if it takes too long to make that possible, start with a bounded version def push(ref:Ref h (List a), x:a) -> {State h} () given (h, a|Data) = l = get ref ref := l <> AsList _ [x] diff --git a/lib/plot.dx b/lib/plot.dx index 189ca543b..f7c008854 100644 --- a/lib/plot.dx +++ b/lib/plot.dx @@ -9,13 +9,13 @@ data CompactSet(a:Type) = struct Scale(a:Type) = mapping : (a) -> Maybe Float - range : List (CompactSet Float) -- non-overlapping, ordered + range : List (CompactSet Float) # non-overlapping, ordered struct ScaledData(n|Ix, a:Type) = scale : Scale a dat : n => a --- TODO: bundle up the type params into a triple of types +# TODO: bundle up the type params into a triple of types struct Plot(n|Ix, a:Type, b:Type, c:Type) = xs : ScaledData n a ys : ScaledData n b @@ -60,7 +60,7 @@ def plot_to_diagram(plot:Plot n a b c) -> Diagram given (a:Type, b:Type, c:Type, x = get_scaled plot.xs i y = get_scaled plot.ys i c = get_scaled plot.cs i - -- TODO: nested may-fail patterns would make this much better + # TODO: nested may-fail patterns would make this much better case x of Just x' -> case y of Just y' -> case c of @@ -78,20 +78,20 @@ def blank_data(n|Ix) -> ScaledData n () = ScaledData unit_type_scale (for i:n. ()) def blank_plot(n|Ix) -> Plot n () () () = - -- TODO: figure out why we need the annotations here. Top-down inference should work. + # TODO: figure out why we need the annotations here. Top-down inference should work. 
Plot(blank_data(n), blank_data(n), blank_data(n)) --- TODO: generalize beyond Float with a type class for auto scaling +# TODO: generalize beyond Float with a type class for auto scaling def auto_scale(xs:n=>Float) -> ScaledData n Float given (n|Ix) = max = maximum xs min = minimum xs - -- Add 10% padding away from the plot area + # Add 10% padding away from the plot area space = (max - min) * 0.05 padding = maximum [space, max * 0.001, 0.000001] ScaledData (float_scale (min - padding) (max + padding)) xs def set_x_data(plot:Plot n a b c, xs:ScaledData n new) -> Plot n new b c given (n|Ix, a:Type, b:Type, c:Type, new:Type) = - -- We can't use `setAt` here because we're changing the type + # We can't use `setAt` here because we're changing the type Plot xs plot.ys plot.cs def set_y_data(plot:Plot n a b c, ys:ScaledData n new) -> Plot n a new c given (n|Ix, a:Type, b:Type, c:Type, new:Type) = @@ -115,12 +115,12 @@ def y_plot(ys:n=>Float) -> Plot n Float Float () given (n|Ix) = xs = for i:n. n_to_f $ ordinal i xy_plot xs ys --- xs = linspace (Fin 100) 0. 1.0 --- :html showPlot $ xycPlot xs xs xs +# xs = linspace (Fin 100) 0. 1.0 +# :html showPlot $ xycPlot xs xs xs '## Heatmap-style plots --- TODO: scales +# TODO: scales def matshow(img:n=>m=>Float) -> Html given (n|Ix, m|Ix) = low = minimum $ flatten2D(img) high = maximum $ flatten2D(img) diff --git a/lib/png.dx b/lib/png.dx index 3937b0350..9d3541de8 100644 --- a/lib/png.dx +++ b/lib/png.dx @@ -15,7 +15,7 @@ encoding_table = 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'] --- TODO: make `Word8` an index set instead of using `Fin 256` +# TODO: make `Word8` an index set instead of using `Fin 256` decoding_table : Fin 256 => Maybe Char = for i:(Fin 256). idx = linear_search encoding_table (n_to_w8 (ordinal i)) @@ -23,10 +23,10 @@ decoding_table : Fin 256 => Maybe Char = Nothing -> Nothing Just x -> Just $ n_to_w8 $ ordinal x -Base64 = Byte -- first two bits should be zero +Base64 = Byte # first two bits should be zero --- This could go in the prelude, or in a library of array-dicing functions. --- An explicit "view" builder would be good here, to avoid copies +# This could go in the prelude, or in a library of array-dicing functions. +# An explicit "view" builder would be good here, to avoid copies def get_chunks(chunkSize:Nat, padVal:a, xs:n=>a) -> List (Fin chunkSize => a) given (n|Ix, a:Type) = numChunks = idiv_ceil (size n) chunkSize @@ -43,7 +43,7 @@ def base64s_to_bytes(chunk : Fin 4 => Base64) -> Fin 3 => Byte = def bytes_to_base64s(chunk : Fin 3 => Byte) -> Fin 4 => Base64 = [a, b, c] = chunk - -- '?' is 00111111 + # '?' is 00111111 tmp = [ a .>>. 2 , (a .<<. 4) .|. (b .>>. 4) , (b .<<. 2) .|. (c .>>. 6) @@ -56,7 +56,7 @@ def base64_to_ascii(x:Base64) -> Char = def encode_chunk(chunk : Fin 3 => Char) -> Fin 4 => Char = each (bytes_to_base64s chunk) base64_to_ascii --- TODO: the `AsList` unpacking is very tedious. Daniel's change will help +# TODO: the `AsList` unpacking is very tedious. Daniel's change will help def base64_encode(s:String) -> String = AsList(n, cs) = s AsList(numChunks, chunks) = get_chunks 3 '\NUL' cs @@ -77,7 +77,7 @@ def decode_chunk(chunk : Fin 4 => Char) -> Maybe (Fin 3 => Char) = Nothing -> Nothing Just base64s -> Just $ base64s_to_bytes base64s --- TODO: put this in prelude? +# TODO: put this in prelude? 
 def replace(pair:(a,a), x:a) -> a given (a|Eq) =
   (old, new) = pair
   case x == old of
@@ -88,7 +88,7 @@ def base64_decode(s:String) -> Maybe String =
   AsList(n, cs) = s
   numValidInputChars = sum for i:(Fin n). b_to_n $ cs[i] /= '='
   numValidOutputChars = idiv (numValidInputChars * 3) 4
-  csZeroed = each cs \c. replace(('=', 'A'), c) -- swap padding char with 'zero' char
+  csZeroed = each cs \c. replace(('=', 'A'), c) # swap padding char with 'zero' char
   AsList(_, chunks) = get_chunks 4 '\NUL' csZeroed
   case chunks | each(decode_chunk) | seq_maybes of
     Nothing -> Nothing
diff --git a/lib/set.dx b/lib/set.dx
index 8a67a0177..babe2ee46 100644
--- a/lib/set.dx
+++ b/lib/set.dx
@@ -22,11 +22,11 @@ def all_except_last(xs:n=>a) -> List a given (n|Ix, a) =
   AsList _ allButLast

 def merge_unique_sorted_lists(xlist:List a, ylist:List a) -> List a given (a|Eq) =
-  -- This function is associative, for use in a monoidal reduction.
-  -- Assumes all xs are <= all ys.
-  -- The element at the end of xs might equal the
-  -- element at the beginning of ys. If so, this
-  -- function removes the duplicate when concatenating the lists.
+  # This function is associative, for use in a monoidal reduction.
+  # Assumes all xs are <= all ys.
+  # The element at the end of xs might equal the
+  # element at the beginning of ys. If so, this
+  # function removes the duplicate when concatenating the lists.
   AsList(nx, xs) = xlist
   AsList(_ , ys) = ylist
   case last xs of
@@ -45,9 +45,9 @@ def remove_duplicates_from_sorted(xs:n=>a) -> List a given (n|Ix, a|Eq) =

 '## Sets

 data Set(a|Ord) =
-  -- Guaranteed to be in sorted order with unique elements,
-  -- as long as no one else uses this constructor.
-  -- Instead use the "toSet" function below.
+  # Guaranteed to be in sorted order with unique elements,
+  # as long as no one else uses this constructor.
+  # Instead use the "to_set" function below.
   UnsafeAsSet(n:Nat, elements:(Fin n => a))

 def to_set(xs:n=>a) -> Set a given (n|Ix, a|Ord) =
@@ -81,7 +81,7 @@ def set_intersect(
     ) -> Set a given (a|Ord) =
   UnsafeAsSet(nx, xs) = sx
   UnsafeAsSet(ny, ys) = sy
-  -- This could be done in O(nx + ny) instead of O(nx log ny).
+  # This could be done in O(nx + ny) instead of O(nx log ny).
   isInYs = \x. case search_sorted_exact ys x of
     Just x -> True
     Nothing -> False
@@ -91,13 +91,13 @@ def set_intersect(

 '## Sets as a type, whose inhabitants can index arrays

--- TODO Implicit arguments to data definitions
--- (Probably `a` should be implicit)
+# TODO Implicit arguments to data definitions
+# (Probably `a` should be implicit)
 struct Element(set:(Set a)) given (a|Ord) =
   val: Nat

--- TODO The set argument could be implicit (inferred from the Element
--- type), but maybe it's easier to read if it's explicit.
+# TODO The set argument could be implicit (inferred from the Element
+# type), but maybe it's easier to read if it's explicit.
 def member(x:a, set:(Set a)) -> Maybe (Element set) given (a|Ord) =
   UnsafeAsSet(_, elts) = set
   case search_sorted_exact elts x of
diff --git a/lib/sort.dx b/lib/sort.dx
index 1178dd0cc..e61ee33a1 100644
--- a/lib/sort.dx
+++ b/lib/sort.dx
@@ -18,20 +18,20 @@ def concat_table(leftin: a=>v, rightin: b=>v) -> (Either a b=>v) given (a|Ix, b|
     Right i -> rightin[i]

 def merge_sorted_tables(xs:m=>a, ys:n=>a) -> (Either m n=>a) given (a|Ord, m|Ix, n|Ix) =
-  -- Possible improvements:
-  --  1) Using a SortedTable type.
-  --  2) Avoid needlessly initializing the return array.
-  init = concat_table xs ys -- Initialize array of correct size.
+  # Possible improvements:
+  #  1) Using a SortedTable type.
diff --git a/lib/sort.dx b/lib/sort.dx
index 1178dd0cc..e61ee33a1 100644
--- a/lib/sort.dx
+++ b/lib/sort.dx
@@ -18,20 +18,20 @@ def concat_table(leftin: a=>v, rightin: b=>v) -> (Either a b=>v) given (a|Ix, b|
     Right i -> rightin[i]
 
 def merge_sorted_tables(xs:m=>a, ys:n=>a) -> (Either m n=>a) given (a|Ord, m|Ix, n|Ix) =
-  -- Possible improvements:
-  -- 1) Using a SortedTable type.
-  -- 2) Avoid needlessly initializing the return array.
-  init = concat_table xs ys -- Initialize array of correct size.
+  # Possible improvements:
+  # 1) Using a SortedTable type.
+  # 2) Avoid needlessly initializing the return array.
+  init = concat_table xs ys # Initialize array of correct size.
   yield_state init \buf.
     with_state (0, 0) \countrefs.
       for i:(Either m n).
         (cur_x, cur_y) = get countrefs
-        if cur_y >= size n -- no ys left
+        if cur_y >= size n # no ys left
           then
             countrefs := (cur_x + 1, cur_y)
             buf!i := xs[unsafe_from_ordinal cur_x]
          else
-           if cur_x < size m -- still xs left
+           if cur_x < size m # still xs left
            then
              if xs[unsafe_from_ordinal cur_x] <= ys[unsafe_from_ordinal cur_y]
                then
@@ -42,8 +42,8 @@ def merge_sorted_tables(xs:m=>a, ys:n=>a) -> (Either m n=>a) given (a|Ord, m|Ix,
                 buf!i := ys[unsafe_from_ordinal cur_y]
 
 def merge_sorted_lists(lx: List a, ly: List a) -> List a given (a|Ord) =
-  -- Need this wrapper because Dex can't automatically weaken
-  -- (a | b)=>c to ((Fin d)=>c)
+  # Need this wrapper because Dex can't automatically weaken
+  # (a | b)=>c to ((Fin d)=>c)
   AsList(nx, xs) = lx
   AsList(ny, ys) = ly
   sorted = merge_sorted_tables xs ys
@@ -51,15 +51,15 @@ def merge_sorted_lists(lx: List a, ly: List a) -> List a given (a|Ord) =
   AsList _ $ unsafe_cast_table(to=Fin newsize, sorted)
 
 def sort(xs: n=>a) -> n=>a given (n|Ix, a|Ord) =
-  -- Warning: Has quadratic runtime cost for now.
+  # Warning: Has quadratic runtime cost for now.
   xlists = for i:n. (AsList 1 [xs[i]])
 
-  -- Merge sort monoid:
+  # Merge sort monoid:
   mempty = AsList 0 []
   mcombine = merge_sorted_lists
 
-  -- reduce might someday recursively subdivide the problem.
+  # reduce might someday recursively subdivide the problem.
   AsList(_, r) = reduce mempty mcombine xlists
   unsafe_cast_table(to=n, r)
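The merge-based API above forms a monoid: `AsList 0 []` is the identity and `merge_sorted_lists` the associative combine, which is what lets `sort` hand the work to `reduce`. A minimal sketch (illustrative values only, not part of the patch):

l1 = AsList 2 [1, 3]
l2 = AsList 2 [2, 4]
merged = merge_sorted_lists l1 l2   # AsList 4 [1, 2, 3, 4]
sorted = sort [3, 1, 2]             # [1, 2, 3]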
diff --git a/lib/stats.dx b/lib/stats.dx
index e4e688ccb..3ee8cc177 100644
--- a/lib/stats.dx
+++ b/lib/stats.dx
@@ -36,8 +36,8 @@ instance Arbitrary(LogSpace Float)
   def arb(k) = Exp (randn k)
 
 def is_infinite(x:f) -> Bool given (f|Fractional|Sub|Mul|Ord) =
-  -- Note: According to this function, nan is not infinite.
-  -- Todo: Make polymorphic versions of these and put them in the prelude.
+  # Note: According to this function, nan is not infinite.
+  # Todo: Make polymorphic versions of these and put them in the prelude.
   infinity = divide one zero
   neg_infinity = zero - infinity
   x == infinity || x == neg_infinity
@@ -81,15 +81,15 @@ def ls_sum(x:n=>LogSpace f) -> LogSpace f
 
 Simulation and evaluation of [probability distributions](https://en.wikipedia.org/wiki/Probability_distribution). Probability distributions which can be sampled should implement `Random`, and those which can be evaluated should implement the `Dist` interface. Distributions over an ordered space (such as typical *univariate* distributions) should also implement `OrderedDist`.
 
 interface Random(d, a)
-  draw : (d, Key) -> a -- function for random draws
+  draw : (d, Key) -> a # function for random draws
 
 interface Dist(d, a, f)
-  density : (d, a) -> LogSpace f -- either the density function or mass function
+  density : (d, a) -> LogSpace f # either the density function or mass function
 
 interface OrderedDist(d, a, f, given () (Dist d a f))
-  cumulative : (d, a) -> LogSpace f -- the cumulative distribution function (CDF)
-  survivor : (d, a) -> LogSpace f -- the survivor function (complement of CDF)
-  quantile : (d, f) -> a -- the quantile function (inverse CDF)
+  cumulative : (d, a) -> LogSpace f # the cumulative distribution function (CDF)
+  survivor : (d, a) -> LogSpace f # the survivor function (complement of CDF)
+  quantile : (d, f) -> a # the quantile function (inverse CDF)
 
 '## Univariate probability distributions
 
 Some commonly encountered univariate distributions.
 
@@ -245,7 +245,7 @@ instance OrderedDist(Normal, Float, Float)
     Exp $ log (0.5 * erfc ((d.loc - x) / (d.scale * sqrt(2.0))))
   def survivor(d, x) =
     Exp $ log (0.5 * erfc ((x - d.loc) / (d.scale * sqrt(2.0))))
-  def quantile(_, _) = todo -- Add `erfinv`.
+  def quantile(_, _) = todo # Add `erfinv`.
 
 '### Poisson distribution
 
diff --git a/static/style.css b/static/style.css
index 450f70bfd..38ee9e86b 100644
--- a/static/style.css
+++ b/static/style.css
@@ -34,7 +34,7 @@ body {
 }
 
 .code-block, .cell-results, .err-block, .result-block {
-    margin: 0em 0em 0em 4em;
+    margin: 0em 0em 0em 2em;
     padding: 0em 0em 0em 2em;
     display: block;
     font-family: monospace;
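To see how the three `lib/stats.dx` interfaces touched above fit together, here is a hedged sketch of drawing from and evaluating a normal distribution. The `Normal(loc, scale)` constructor shape is an assumption about the surrounding library (only the `d.loc` and `d.scale` field accesses are visible in this patch); `new_key` comes from the Dex prelude.

k = new_key 0
d = Normal(0.0, 1.0)   # assumed constructor for the struct with fields loc, scale
x:Float = draw d k     # Random: one sample from d
p = density d x        # Dist: density at x, returned as a LogSpace Float
c = cumulative d x     # OrderedDist: CDF at x, also in log space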