From 65b5311fa5e9f884bf84a456ad5bf87361d349da Mon Sep 17 00:00:00 2001
From: Zhanlue Yang
Date: Thu, 20 Oct 2022 11:26:42 +0800
Subject: [PATCH] [misc] Rc v1.2.0 cherry-pick PR number 2 (#6384)

Issue: #

### Brief Summary

Co-authored-by: Mingrui Zhang <33411325+erizmr@users.noreply.github.com>
Co-authored-by: Ailing
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 docs/lang/articles/debug/debugging.md         |   8 +
 python/taichi/ad/_ad.py                       |   6 +-
 python/taichi/lang/kernel_impl.py             |   8 +-
 taichi/program/kernel.cpp                     |   5 +-
 tests/python/test_ad_gdar_diffmpm.py          | 185 ++++++++++++++++++
 ...test_ad_global_data_access_rule_checker.py |  32 +++
 6 files changed, 240 insertions(+), 4 deletions(-)
 create mode 100644 tests/python/test_ad_gdar_diffmpm.py

diff --git a/docs/lang/articles/debug/debugging.md b/docs/lang/articles/debug/debugging.md
index bb2c55ba7b71b..9616d8da17ce7 100644
--- a/docs/lang/articles/debug/debugging.md
+++ b/docs/lang/articles/debug/debugging.md
@@ -68,6 +68,14 @@ def inside_taichi_scope():
 
 `print` in the Taichi scope is supported on the CPU, CUDA, and Vulkan backends only.
 
+:::note
+To enable printing on the Vulkan backend:
+- Make sure the validation layer is installed via the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home).
+- Turn on debug mode with `ti.init(debug=True)`.
+
+Printing is not supported on the macOS Vulkan backend.
+:::
+
 :::note
 `print` does not work in Graphical Python Shells, such as IDLE and Jupyter Notebook. This is because these backends print outputs to the console, not to the GUI.
 :::

diff --git a/python/taichi/ad/_ad.py b/python/taichi/ad/_ad.py
index f384dc1fe93fb..3e99da1681d3c 100644
--- a/python/taichi/ad/_ad.py
+++ b/python/taichi/ad/_ad.py
@@ -216,7 +216,11 @@ def __exit__(self, _type, value, tb):
             calls[0].autodiff_mode = mode
 
     def insert(self, func, args):
-        assert func.autodiff_mode == AutodiffMode.NONE, "Inserted funcs should be forward kernels."
+        # Kernels in `AutodiffMode.NONE` and `AutodiffMode.VALIDATION` mode are both forward kernels.
+        # The difference is that VALIDATION kernels contain `assert` statements for the global data access rule check.
+        assert func.autodiff_mode in (
+            AutodiffMode.NONE, AutodiffMode.VALIDATION
+        ), "Inserted funcs should be forward kernels."
         self.modes.append(func.autodiff_mode)
         if self.validation:
             func.autodiff_mode = AutodiffMode.VALIDATION

diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py
index cb479b38bf174..3a705d5d119e0 100644
--- a/python/taichi/lang/kernel_impl.py
+++ b/python/taichi/lang/kernel_impl.py
@@ -522,6 +522,8 @@ def materialize(self, key=None, args=None, arg_features=None):
             grad_suffix = "_forward_grad"
         elif self.autodiff_mode == AutodiffMode.REVERSE:
             grad_suffix = "_reverse_grad"
+        elif self.autodiff_mode == AutodiffMode.VALIDATION:
+            grad_suffix = "_validate_grad"
         kernel_name = f"{self.func.__name__}_c{self.kernel_counter}_{key[1]}{grad_suffix}"
         _logging.trace(f"Compiling kernel {kernel_name}...")
 
@@ -850,7 +852,11 @@ def __call__(self, *args, **kwargs):
         # Both the class kernels and the plain-function kernels are unified now.
         # In both cases, |self.grad| is another Kernel instance that computes the
         # gradient. For class kernels, args[0] is always the kernel owner.
-        if self.autodiff_mode == AutodiffMode.NONE and self.runtime.target_tape and not self.runtime.grad_replaced:
+
+        # No need to capture grad kernels because they are already bound to their primal kernels.
+        if self.autodiff_mode in (
+                AutodiffMode.NONE, AutodiffMode.VALIDATION
+        ) and self.runtime.target_tape and not self.runtime.grad_replaced:
             self.runtime.target_tape.insert(self, args)
 
         if self.autodiff_mode != AutodiffMode.NONE and impl.current_cfg(

diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp
index fc54d84204de2..a5efe6a096117 100644
--- a/taichi/program/kernel.cpp
+++ b/taichi/program/kernel.cpp
@@ -420,9 +420,10 @@ void Kernel::init(Program &program,
 
   this->arch = program.this_thread_config().arch;
 
-  if (autodiff_mode == AutodiffMode::kNone ||
-      autodiff_mode == AutodiffMode::kCheckAutodiffValid) {
+  if (autodiff_mode == AutodiffMode::kNone) {
     name = primal_name;
+  } else if (autodiff_mode == AutodiffMode::kCheckAutodiffValid) {
+    name = primal_name + "_validate_grad";
   } else if (autodiff_mode == AutodiffMode::kForward) {
     name = primal_name + "_forward_grad";
   } else if (autodiff_mode == AutodiffMode::kReverse) {

diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
new file mode 100644
index 0000000000000..349a16b3c3744
--- /dev/null
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -0,0 +1,185 @@
+import pytest
+
+import taichi as ti
+from tests import test_utils
+
+
+@test_utils.test(require=ti.extension.assertion, debug=True, exclude=[ti.cc])
+def test_gdar_mpm():
+    real = ti.f32
+
+    dim = 2
+    N = 30  # kept small to avoid running out of GPU memory
+    n_particles = N * N
+    n_grid = 120
+    dx = 1 / n_grid
+    inv_dx = 1 / dx
+    dt = 3e-4
+    p_mass = 1
+    p_vol = 1
+    E = 100
+    mu = E
+    la = E
+    max_steps = 32
+    steps = 32
+    gravity = 9.8
+    target = [0.3, 0.6]
+
+    scalar = lambda: ti.field(dtype=real)
+    vec = lambda: ti.Vector.field(dim, dtype=real)
+    mat = lambda: ti.Matrix.field(dim, dim, dtype=real)
+
+    x = ti.Vector.field(dim,
+                        dtype=real,
+                        shape=(max_steps, n_particles),
+                        needs_grad=True)
+    x_avg = ti.Vector.field(dim, dtype=real, shape=(), needs_grad=True)
+    v = ti.Vector.field(dim,
+                        dtype=real,
+                        shape=(max_steps, n_particles),
+                        needs_grad=True)
+    grid_v_in = ti.Vector.field(dim,
+                                dtype=real,
+                                shape=(max_steps, n_grid, n_grid),
+                                needs_grad=True)
+    grid_v_out = ti.Vector.field(dim,
+                                 dtype=real,
+                                 shape=(max_steps, n_grid, n_grid),
+                                 needs_grad=True)
+    grid_m_in = ti.field(dtype=real,
+                         shape=(max_steps, n_grid, n_grid),
+                         needs_grad=True)
+    C = ti.Matrix.field(dim,
+                        dim,
+                        dtype=real,
+                        shape=(max_steps, n_particles),
+                        needs_grad=True)
+    F = ti.Matrix.field(dim,
+                        dim,
+                        dtype=real,
+                        shape=(max_steps, n_particles),
+                        needs_grad=True)
+    init_v = ti.Vector.field(dim, dtype=real, shape=(), needs_grad=True)
+    loss = ti.field(dtype=real, shape=(), needs_grad=True)
+
+    @ti.kernel
+    def set_v():
+        for i in range(n_particles):
+            v[0, i] = init_v[None]
+
+    @ti.kernel
+    def p2g(f: ti.i32):
+        for p in range(n_particles):
+            base = ti.cast(x[f, p] * inv_dx - 0.5, ti.i32)
+            fx = x[f, p] * inv_dx - ti.cast(base, ti.i32)
+            w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+            new_F = (ti.Matrix.diag(dim=2, val=1) + dt * C[f, p]) @ F[f, p]
+            F[f + 1, p] = new_F
+            J = (new_F).determinant()
+            r, s = ti.polar_decompose(new_F)
+            cauchy = 2 * mu * (new_F - r) @ new_F.transpose() + \
+                     ti.Matrix.diag(2, la * (J - 1) * J)
+            stress = -(dt * p_vol * 4 * inv_dx * inv_dx) * cauchy
+            affine = stress + p_mass * C[f, p]
+            for i in ti.static(range(3)):
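+                # Scatter this particle's mass and momentum to its 3x3 grid
+                # neighborhood, weighted by the quadratic B-spline weights in `w`.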
+                for j in ti.static(range(3)):
+                    offset = ti.Vector([i, j])
+                    dpos = (ti.cast(ti.Vector([i, j]), real) - fx) * dx
+                    weight = w[i](0) * w[j](1)
+                    grid_v_in[f, base + offset] += weight * (p_mass * v[f, p] +
+                                                             affine @ dpos)
+                    grid_m_in[f, base + offset] += weight * p_mass
+
+    bound = 3
+
+    @ti.kernel
+    def grid_op(f: ti.i32):
+        for i, j in ti.ndrange(n_grid, n_grid):
+            inv_m = 1 / (grid_m_in[f, i, j] + 1e-10)
+            v_out = inv_m * grid_v_in[f, i, j]
+            v_out[1] -= dt * gravity
+            if i < bound and v_out[0] < 0:
+                v_out[0] = 0
+            if i > n_grid - bound and v_out[0] > 0:
+                v_out[0] = 0
+            if j < bound and v_out[1] < 0:
+                v_out[1] = 0
+            if j > n_grid - bound and v_out[1] > 0:
+                v_out[1] = 0
+            grid_v_out[f, i, j] = v_out
+
+    @ti.kernel
+    def g2p(f: ti.i32):
+        for p in range(n_particles):
+            base = ti.cast(x[f, p] * inv_dx - 0.5, ti.i32)
+            fx = x[f, p] * inv_dx - ti.cast(base, real)
+            w = [
+                0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2
+            ]
+            new_v = ti.Vector([0.0, 0.0])
+            new_C = ti.Matrix([[0.0, 0.0], [0.0, 0.0]])
+
+            for i in ti.static(range(3)):
+                for j in ti.static(range(3)):
+                    dpos = ti.cast(ti.Vector([i, j]), real) - fx
+                    g_v = grid_v_out[f, base(0) + i, base(1) + j]
+                    weight = w[i](0) * w[j](1)
+                    new_v += weight * g_v
+                    new_C += 4 * weight * g_v.outer_product(dpos) * inv_dx
+
+            # This write violates the global data access rule (v[f, p] has already
+            # been read by p2g) and should be caught by the checker.
+            v[f, p] = new_v
+            x[f + 1, p] = x[f, p] + dt * v[f, p]
+            C[f + 1, p] = new_C
+
+    @ti.kernel
+    def compute_x_avg():
+        for i in range(n_particles):
+            x_avg[None] += (1 / n_particles) * x[steps - 1, i]
+
+    @ti.kernel
+    def compute_loss():
+        dist = (x_avg[None] - ti.Vector(target))**2
+        loss[None] = 0.5 * (dist(0) + dist(1))
+
+    def substep(s):
+        p2g(s)
+        grid_op(s)
+        g2p(s)
+
+    # initialization
+    init_v[None] = [0, 0]
+
+    for i in range(n_particles):
+        F[0, i] = [[1, 0], [0, 1]]
+
+    for i in range(N):
+        for j in range(N):
+            x[0, i * N + j] = [dx * (i * 0.7 + 10), dx * (j * 0.7 + 25)]
+
+    set_v()
+
+    losses = []
+
+    for i in range(2):
+        grid_v_in.fill(0)
+        grid_m_in.fill(0)
+
+        x_avg[None] = [0, 0]
+
+        with pytest.raises(ti.TaichiAssertionError):
+            with ti.ad.Tape(loss=loss, validation=True):
+                set_v()
+                for s in range(steps - 1):
+                    substep(s)
+
+                compute_x_avg()
+                compute_loss()
+
+        l = loss[None]
+        losses.append(l)
+        grad = init_v.grad[None]
+        print('loss=', l, ' grad=', (grad[0], grad[1]))
+        learning_rate = 10
+        init_v[None][0] -= learning_rate * grad[0]
+        init_v[None][1] -= learning_rate * grad[1]

diff --git a/tests/python/test_ad_global_data_access_rule_checker.py b/tests/python/test_ad_global_data_access_rule_checker.py
index db85fda79165a..70c0c630cea8a 100644
--- a/tests/python/test_ad_global_data_access_rule_checker.py
+++ b/tests/python/test_ad_global_data_access_rule_checker.py
@@ -161,3 +161,35 @@ def kernel_2():
     func_calls = t.calls
     for f, _ in func_calls:
         assert f.autodiff_mode == AutodiffMode.NONE
+
+
+@test_utils.test(require=ti.extension.assertion, exclude=[ti.cc], debug=True)
+def test_validation_kernel_capture():
+    N = 16
+    T = 8
+    x = ti.field(dtype=ti.f32, shape=N, needs_grad=True)
+    loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+    b = ti.field(dtype=ti.f32, shape=(), needs_grad=True)
+
+    @ti.kernel
+    def kernel_1():
+        loss[None] = x[1] * b[None]
+
+    @ti.kernel
+    def kernel_2():
+        loss[None] = x[1] * b[None]
+
+    def forward(T):
+        for t in range(T):
+            kernel_1()
+            kernel_2()
+
+    for i in range(N):
+        x[i] = i
+
+    b[None] = 10
+    loss.grad[None] = 1
+
+    with ti.ad.Tape(loss=loss, validation=True) as t:
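+        # With validation=True, the kernels recorded on the tape are compiled as
+        # VALIDATION kernels (forward kernels that carry the global data access
+        # rule asserts), and the tape should still capture every launch below.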
+        forward(T)
+    assert len(t.calls) == 2 * T and len(t.modes) == 2 * T
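
For readers of this patch, below is a minimal, self-contained sketch (not part of the diff) of the workflow the new tests exercise: running a forward pass under `ti.ad.Tape(loss=..., validation=True)` so that the recorded kernels become VALIDATION kernels and a violation of the global data access rule raises `ti.TaichiAssertionError`. The kernel names, field sizes, and the choice of the CPU backend are illustrative assumptions, not taken from the patch; it assumes a backend with the assertion extension and `debug=True`, as the tests above require.

```python
import pytest
import taichi as ti

# Assumption: CPU backend with the assertion extension; debug mode is required
# for the inserted asserts to fire, mirroring @test_utils.test(..., debug=True).
ti.init(arch=ti.cpu, debug=True)

n = 8
x = ti.field(dtype=ti.f32, shape=n, needs_grad=True)
loss = ti.field(dtype=ti.f32, shape=(), needs_grad=True)


@ti.kernel
def compute_loss():
    for i in range(n):
        loss[None] += x[i] ** 2  # reads x[i]


@ti.kernel
def overwrite_x():
    for i in range(n):
        x[i] = 0.0  # overwrites values that compute_loss() already read


for i in range(n):
    x[i] = i

# validation=True makes the tape record VALIDATION kernels, which carry the
# global data access rule asserts; the non-atomic overwrite after the read
# should trip one of them, just like g2p() does in test_gdar_mpm above.
with pytest.raises(ti.TaichiAssertionError):
    with ti.ad.Tape(loss=loss, validation=True):
        compute_loss()
        overwrite_x()
```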