+ /**
+  * Copyright (c) 2023-2024 The ggml authors
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a copy
+  * of this software and associated documentation files (the "Software"), to
+  * deal in the Software without restriction, including without limitation the
+  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+  * sell copies of the Software, and to permit persons to whom the Software is
+  * furnished to do so, subject to the following conditions:
+  *
+  * The above copyright notice and this permission notice shall be included in
+  * all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+  * IN THE SOFTWARE.
+  */
+
#include "acl_tensor.h"

#include <algorithm>
#include <cstring>

- /**
-  * Mapping ggml_tensor type to acl_tensor type.
-  */
aclDataType type_mapping(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:
@@ -24,50 +43,51 @@ aclDataType type_mapping(ggml_type type) {
    return ACL_DT_UNDEFINED;
}

-
- /**
-  * Transform ggml_tensor to acl_tensor. Note that ggml_tensor dimension order
-  * is reversed compared to acl_tensor.
-  *
-  * If bcast_ne and bcast_nb is nullptr, use ggml_tensor's ne and nb.
-  * otherwise, use bcast_ne bcast_nb, which means tensor dims should be
-  * changed to satisfy the broadcast. @sa: get_bcast_shape.
-  */
- aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
-                              size_t* bcast_nb, int64_t bcast_dims,
-                              aclFormat format, size_t offset) {
+ aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* ne, size_t* nb,
+                              int64_t dims, aclFormat format, size_t offset) {
    // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be
    // added.
    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
-     int64_t acl_storage_ne = 0;
-     if (bcast_ne == nullptr) {
-         acl_storage_ne = ggml_nbytes(tensor);
+
+     int64_t acl_storage_len = 0;
+     if (ne == nullptr) {
+         acl_storage_len = ggml_nbytes(tensor);
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            acl_ne[i] = tensor->ne[i];
            // The step size of acl is in elements.
            acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
        }
    } else {
        // With bcast
-         for (int i = 0; i < bcast_dims; i++) {
-             acl_storage_ne += (bcast_ne[i] - 1) * bcast_nb[i];
-             acl_ne[i] = bcast_ne[i];
-             acl_stride[i] = bcast_nb[i] / ggml_element_size(tensor);
+         for (int i = 0; i < dims; i++) {
+             acl_storage_len += (ne[i] - 1) * nb[i];
+             acl_ne[i] = ne[i];
+             acl_stride[i] = nb[i] / ggml_element_size(tensor);
        }
    }

-     int64_t dims = (bcast_dims == 0 ? GGML_MAX_DIMS : bcast_dims);
-     std::reverse(acl_ne, acl_ne + dims);
-     std::reverse(acl_stride, acl_stride + dims);
+     // Reverse ne and stride.
+     int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+     std::reverse(acl_ne, acl_ne + final_dims);
+     std::reverse(acl_stride, acl_stride + final_dims);

-     aclTensor* acl_tensor = aclCreateTensor(
-         acl_ne, dims, type_mapping(tensor->type), acl_stride,
-         offset / ggml_element_size(tensor), format, &acl_storage_ne, 1,
-         tensor->data);
+     aclTensor* acl_tensor =
+         aclCreateTensor(acl_ne, final_dims, type_mapping(tensor->type),
+                         acl_stride, offset / ggml_element_size(tensor), format,
+                         &acl_storage_len, 1, tensor->data);

    return acl_tensor;
}

+ bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
+     for (int i = 0; i < GGML_MAX_DIMS; i++) {
+         if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
+             return true;
+         }
+     }
+     return false;
+ }
+
aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,
                             size_t type_size, int64_t* ne, size_t* nb,
                             int64_t dims, aclFormat format, size_t offset) {
@@ -82,126 +102,95 @@ aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype,
    std::reverse(tmp_ne, tmp_ne + dims);
    std::reverse(tmp_stride, tmp_stride + dims);

-     int64_t acl_storage_ne = 0;
+     int64_t acl_storage_len = 0;
    for (int i = 0; i < dims; i++) {
-         acl_storage_ne += (ne[i] - 1) * nb[i];
+         acl_storage_len += (ne[i] - 1) * nb[i];
    }

-     aclTensor* acl_tensor = aclCreateTensor(tmp_ne, dims, dtype, tmp_stride,
-                                             offset / type_size, format, &acl_storage_ne,
-                                             1, data_ptr);
+     aclTensor* acl_tensor =
+         aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
+                         format, &acl_storage_len, 1, data_ptr);

    return acl_tensor;
}

- /**
-  * Add extra dims to satisfy acl kernel's broadcast rules (same as numpy).
-  * ggml_tensor dimension order is reversed compared to Python.
-  * bcast src1 with src0 though adding a extra dim.
-  * for example:
-  * src0 -> (32,10,10,10)
-  * src1 -> (16,10,10,10)
-  * bcast_ne_src0 -> (16,2,10,10,10)
-  * bcast_ne_src1 -> (16,1,10,10,10)
-  *
-  * if dim0 has padding.
-  * a -> (2, 2) padding = 2
-  * a: [[1, 2, *, *]
-  *     [2, 3, *, *]]
-  * nb = (8, 4, 2)
-  *
-  * if a should bcast with b -> (2, 4)
-  * b' -> (2, 2, 2)
-  * b : [[1, 2, 3, 4, *, *]
-  *      [5, 6, 7, 8, *, *]]
-  * nb = (12, 6, 1)
-  *
-  * after bcast:
-  * a' -> (2, 1, 2)
-  * a': [[[1, 2], *, *]
-  *      [[2, 3], *, *]]
-  * nb = (8, 4, 2, 1)
-  *
-  * b' : [[[1, 2], [3, 4], *, *]
-  *       [[5, 6], [7, 8], *, *]]
-  * nb = (12, 6, 2, 1)
-  *
-  * because dim1 in a inserted dim, should add nb for dim1,
-  * and all other nb moves to next in order.
-  */
int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
-                         int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                         size_t* bcast_nb_src0, size_t* bcast_nb_src1) {
+                         int64_t* bcast_src0_ne, int64_t* bcast_src1_ne,
+                         size_t* bcast_src0_nb, size_t* bcast_src1_nb) {
    GGML_ASSERT(ggml_can_repeat(src1, src0));
    int bcast_dim_cnt = 0;
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        int64_t nr = src0->ne[i] / src1->ne[i];
-         bcast_ne_src0[bcast_dim_cnt] = src0->ne[i] / nr;
-         bcast_ne_src1[bcast_dim_cnt] = src1->ne[i];
-         bcast_nb_src0[bcast_dim_cnt] = src0->nb[i];
-         bcast_nb_src1[bcast_dim_cnt] = src1->nb[i];
+         bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
+         bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
+         bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
+         bcast_src1_nb[bcast_dim_cnt] = src1->nb[i];
        bcast_dim_cnt++;
        if (nr != 1) {
            // Need to add an extra dim.
-             bcast_ne_src0[bcast_dim_cnt] = nr;
-             bcast_ne_src1[bcast_dim_cnt] = 1;
-             bcast_nb_src0[bcast_dim_cnt] = bcast_nb_src0[bcast_dim_cnt - 1] *
-                                            bcast_ne_src0[bcast_dim_cnt - 1];
-             bcast_nb_src1[bcast_dim_cnt] = bcast_nb_src1[bcast_dim_cnt - 1] *
-                                            bcast_ne_src1[bcast_dim_cnt - 1];
+             bcast_src0_ne[bcast_dim_cnt] = nr;
+             bcast_src1_ne[bcast_dim_cnt] = 1;
+             bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
+                                            bcast_src0_ne[bcast_dim_cnt - 1];
+             bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
+                                            bcast_src1_ne[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
        }
    }
    return bcast_dim_cnt;
}

- int64_t get_bcast_shape(const int64_t* src0_ne, const int64_t* src1_ne, const size_t* src0_nb, const size_t* src1_nb,
-                         int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                         size_t* bcast_nb_src0, size_t* bcast_nb_src1, int32_t start_dim) {
+ int64_t get_mul_mat_bcast_shape(const int64_t* input_ne,
+                                 const int64_t* weight_ne, const int64_t* dst_ne,
+                                 const size_t* input_nb, const size_t* weight_nb,
+                                 const size_t* dst_nb, int64_t* bcast_input_ne,
+                                 int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
+                                 size_t* bcast_input_nb, size_t* bcast_weight_nb,
+                                 size_t* bcast_dst_nb) {
+     // input and dst should have the same shape, except for the first two dims.
+     GGML_ASSERT(input_ne[2] == dst_ne[2]);
+     GGML_ASSERT(input_ne[3] == dst_ne[3]);
+
    int bcast_dim_cnt = 0;
-     int i = 0;
-     for (;i<start_dim;i++) {
-         bcast_ne_src0[bcast_dim_cnt] = src0_ne[i];
-         bcast_ne_src1[bcast_dim_cnt] = src1_ne[i];
-         bcast_nb_src0[bcast_dim_cnt] = src0_nb[i];
-         bcast_nb_src1[bcast_dim_cnt] = src1_nb[i];
-         bcast_dim_cnt++;
-     }
-     for (; i < GGML_MAX_DIMS; i++) {
-         int64_t nr = src0_ne[i] / src1_ne[i];
-         if (nr != 1) {
-             // Need to add an extra dim.
-             bcast_ne_src0[bcast_dim_cnt] = nr;
-             bcast_ne_src1[bcast_dim_cnt] = 1;
-             bcast_nb_src0[bcast_dim_cnt] = src0_nb[i];
-             bcast_nb_src1[bcast_dim_cnt] = src1_nb[i];
+
+     // For mul_mat, a dimension needs to be added before the dimension that
+     // weight needs to be expanded to satisfy the bcast rule of matrix
+     // multiplication.
+     for (int i = 0; i < GGML_MAX_DIMS; i++) {
+         int64_t nr = input_ne[i] / weight_ne[i];
+         // Do not use bcast in the first two dimensions because we only support
+         // the bcast batch dimension. Just copy them.
+         if (i < 2 || nr == 1) {
+             bcast_input_ne[bcast_dim_cnt] = input_ne[i];
+             bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
+             bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
+
+             bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+             bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
+             bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
            bcast_dim_cnt++;
-             bcast_ne_src0[bcast_dim_cnt] = src0_ne[i] / nr;
-             bcast_ne_src1[bcast_dim_cnt] = src1_ne[i];
-             bcast_nb_src0[bcast_dim_cnt] = bcast_nb_src0[bcast_dim_cnt - 1] * bcast_ne_src0[bcast_dim_cnt - 1];
-             bcast_nb_src1[bcast_dim_cnt] = bcast_nb_src1[bcast_dim_cnt - 1] * bcast_ne_src1[bcast_dim_cnt - 1];
+         } else {
+             // Need to add an extra dim.
+             bcast_input_ne[bcast_dim_cnt] = nr;
+             bcast_dst_ne[bcast_dim_cnt] = nr;
+             bcast_weight_ne[bcast_dim_cnt] = 1;
+             bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+             bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+             bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
            bcast_dim_cnt++;
-         }
-         else {
-             bcast_ne_src0[bcast_dim_cnt] = src0_ne[i];
-             bcast_ne_src1[bcast_dim_cnt] = src1_ne[i];
-             bcast_nb_src0[bcast_dim_cnt] = src0_nb[i];
-             bcast_nb_src1[bcast_dim_cnt] = src1_nb[i];
+
+             bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
+             bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
+             bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
+             bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
+                                             bcast_input_ne[bcast_dim_cnt - 1];
+             bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
+                                           bcast_dst_ne[bcast_dim_cnt - 1];
+             bcast_weight_nb[bcast_dim_cnt] =
+                 bcast_weight_nb[bcast_dim_cnt - 1] *
+                 bcast_weight_ne[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
        }
    }
    return bcast_dim_cnt;
}
-
- /**
-  * Check if shape are not same, and no dim equals 1.
-  * if any dim equals 1, acl kernel will do the broadcast.
-  */
- bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
-     for (int i = 0; i < GGML_MAX_DIMS; i++) {
-         if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
-             return true;
-         }
-     }
-     return false;
- }
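For reference, a minimal usage sketch (not taken from this commit) of how an elementwise op could combine need_bcast, get_bcast_shape and create_acl_tensor. The wrapper name acl_op_inputs is hypothetical; GGML_MAX_DIMS, ggml_tensor and ACL_FORMAT_ND come from the ggml and ACL headers.

#include "acl_tensor.h"  // declares the helpers shown in this diff

// Hypothetical helper (illustration only): build broadcast-aware aclTensor
// handles for the two inputs of a binary elementwise op.
static void acl_op_inputs(const ggml_tensor* src0, const ggml_tensor* src1,
                          aclTensor** acl_src0, aclTensor** acl_src1) {
    if (need_bcast(src0, src1)) {
        // Each of the GGML_MAX_DIMS dims may gain one extra dim, hence the * 2.
        int64_t bcast_src0_ne[GGML_MAX_DIMS * 2], bcast_src1_ne[GGML_MAX_DIMS * 2];
        size_t  bcast_src0_nb[GGML_MAX_DIMS * 2], bcast_src1_nb[GGML_MAX_DIMS * 2];
        int64_t bcast_dims = get_bcast_shape(src0, src1, bcast_src0_ne, bcast_src1_ne,
                                             bcast_src0_nb, bcast_src1_nb);
        *acl_src0 = create_acl_tensor(src0, bcast_src0_ne, bcast_src0_nb,
                                      bcast_dims, ACL_FORMAT_ND, 0);
        *acl_src1 = create_acl_tensor(src1, bcast_src1_ne, bcast_src1_nb,
                                      bcast_dims, ACL_FORMAT_ND, 0);
    } else {
        // Passing nullptr for ne/nb makes create_acl_tensor use the tensor's
        // own shape and strides.
        *acl_src0 = create_acl_tensor(src0, nullptr, nullptr, 0, ACL_FORMAT_ND, 0);
        *acl_src1 = create_acl_tensor(src1, nullptr, nullptr, 0, ACL_FORMAT_ND, 0);
    }
}

get_mul_mat_bcast_shape follows the same pattern but operates on raw ne/nb arrays for input, weight and dst, and only inserts the extra dimension for the batch dims (i >= 2); the first two dims are copied unchanged.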