diff --git a/docs/Dialects/krnl.md b/docs/Dialects/krnl.md
index 0e6c8a065c..4a9a3da08d 100644
--- a/docs/Dialects/krnl.md
+++ b/docs/Dialects/krnl.md
@@ -453,7 +453,7 @@ in the `value` dense element attribute.

 Traits: `AlwaysSpeculatableImplTrait`, `MemRefsNormalizable`

-Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`
+Interfaces: `ConditionallySpeculatable`, `KrnlGlobalOpInterface`, `NoMemoryEffect (MemoryEffectOpInterface)`

 Effects: `MemoryEffects::Effect{}`

diff --git a/docs/Dialects/onnx.md b/docs/Dialects/onnx.md
index 38d6eac50e..3996ad35d6 100644
--- a/docs/Dialects/onnx.md
+++ b/docs/Dialects/onnx.md
@@ -529,7 +529,7 @@ AveragePool consumes an input tensor X and applies average pooling across
 ```
 output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1)
 ```
-if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored.
+if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled:

 ```
@@ -1701,15 +1701,15 @@ Effects: `MemoryEffects::Effect{}`

 | Operand | Description |
 | :-----: | ----------- |
-| `X` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values
-| `W` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values
-| `B` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or none type
+| `X` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values
+| `W` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values
+| `B` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or none type

 #### Results:

 | Result | Description |
 | :----: | ----------- |
-| `Y` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values
+| `Y` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values

 ### `onnx.ConvTranspose` (ONNXConvTransposeOp)

@@ -2610,13 +2610,13 @@ Effects: `MemoryEffects::Effect{}`

 | Operand | Description |
 | :-----: | ----------- |
-| `input` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or tensor of bfloat16 type values
+| `input` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values

 #### Results:

 | Result | Description |
 | :----: | ----------- |
-| `output` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or tensor of bfloat16 type values
+| `output` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values

 ### `onnx.Expand` (ONNXExpandOp)

@@ -3282,13 +3282,13 @@ Effects: `MemoryEffects::Effect{}`

 | Operand | Description |
 | :-----: | ----------- |
-| `X` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values
+| `X` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values

 #### Results:

 | Result | Description |
 | :----: | ----------- |
-| `Y` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values
+| `Y` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values

 ### `onnx.GlobalMaxPool` (ONNXGlobalMaxPoolOp)

@@ -4817,7 +4817,7 @@ Effects: `MemoryEffects::Effect{}`

 _ONNX MatMulInteger operation_

-Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+Matrix product that behaves like [numpy.matmul](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html).
 The production MUST never overflow. The accumulation may overflow if and only if in 32 bits.

 Traits: `AlwaysSpeculatableImplTrait`

@@ -4845,7 +4845,7 @@ Effects: `MemoryEffects::Effect{}`

 _ONNX MatMul operation_

-Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
+Matrix product that behaves like [numpy.matmul](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html).

 Traits: `AlwaysSpeculatableImplTrait`

@@ -4910,7 +4910,7 @@ MaxPool consumes an input tensor X and applies max pooling across
 ```
 output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1)
 ```
-if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored.
+if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled:

 ```
@@ -6611,7 +6611,7 @@ Effects: `MemoryEffects::Effect{}`

 _ONNX QLinearMatMul operation_

-Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html.
+Matrix product that behaves like [numpy.matmul](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html).
 It consumes two quantized input tensors, their scales and zero points, scale and zero point of output,
 and computes the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point).
 For (x / y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details.
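The `output_spatial_shape` formula quoted in the AveragePool and MaxPool hunks above can be checked numerically. The sketch below is plain Python, not onnx-mlir or ONNX API; the helper name `pool_output_shape` is ours, and `pads[i]` stands for `pad_shape[i]`, the sum of the pads along axis `i`:

```python
import math

def pool_output_shape(input_spatial, kernel, strides, pads, dilations,
                      ceil_mode=True):
    # Mirrors the quoted formula; with ceil_mode disabled, the ONNX spec
    # uses floor in place of ceil.
    out = []
    for i, x in enumerate(input_spatial):
        n = (x + pads[i] - dilations[i] * (kernel[i] - 1) - 1) / strides[i] + 1
        out.append(math.ceil(n) if ceil_mode else math.floor(n))
    return out

# A 1-D input of size 7, kernel 2, stride 2, no padding, dilation 1:
print(pool_output_shape([7], [2], [2], [0], [1], ceil_mode=True))   # [4]
print(pool_output_shape([7], [2], [2], [0], [1], ceil_mode=False))  # [3]
```

Relatedly, the "rounding to nearest ties to even" behavior mentioned for QLinearMatMul is the same convention implemented by Python's built-in `round()`.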
@@ -10215,13 +10215,13 @@ Effects: `MemoryEffects::Effect{}`

 | Operand | Description |
 | :-----: | ----------- |
-| `input` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or tensor of bfloat16 type values
+| `input` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values

 #### Results:

 | Result | Description |
 | :----: | ----------- |
-| `output` | tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values or tensor of bfloat16 type values
+| `output` | tensor of bfloat16 type values or tensor of 16-bit float values or tensor of 32-bit float values or tensor of 64-bit float values

 ### `onnx.TfIdfVectorizer` (ONNXTfIdfVectorizerOp)

diff --git a/docs/Dialects/zhigh.md b/docs/Dialects/zhigh.md
index 4780cbe551..0ce869bc1c 100644
--- a/docs/Dialects/zhigh.md
+++ b/docs/Dialects/zhigh.md
@@ -793,6 +793,8 @@ Effects: `MemoryEffects::Effect{}`
 _ZHigh Stickified Constant operation_

 This operator produces a constant tensor to store stickified data.
+The `value` attribute holds either the original constant or the stickified constant.
+The `stickified` attribute indicates whether the `value` has already been stickified.
 Stickified data is opaque and must be 4K-aligned. One who produces
 the stickified data must make sure its size in bytes consistent with
 the output tensor's size.
@@ -807,6 +809,7 @@ Effects: `MemoryEffects::Effect{}`

 <table>
 <tr><th>Attribute</th><th>MLIR Type</th><th>Description</th></tr>
+<tr><td><code>stickified</code></td><td>::mlir::BoolAttr</td><td>bool attribute</td></tr>
 <tr><td><code>value</code></td><td>::mlir::Attribute</td><td>any attribute</td></tr>
 <tr><td><code>alignment</code></td><td>::mlir::IntegerAttr</td><td>64-bit signless integer attribute</td></tr>
 </table>
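To make the constraints above concrete (4K alignment, byte size consistent with the output tensor), here is a minimal NumPy sketch of producing such a buffer. It assumes dlfloat16 elements occupy 2 bytes; the helper is hypothetical and is not onnx-mlir's allocator:

```python
import numpy as np

ALIGN = 4096  # stickified data must be 4K-aligned

def aligned_stickified_buffer(num_elements, elem_bytes=2):
    # Over-allocate, then slice so the data pointer lands on a 4K boundary
    # while the byte size still matches the output tensor's size.
    nbytes = num_elements * elem_bytes
    raw = np.zeros(nbytes + ALIGN, dtype=np.uint8)
    offset = (-raw.ctypes.data) % ALIGN
    buf = raw[offset:offset + nbytes]
    assert buf.ctypes.data % ALIGN == 0 and buf.nbytes == nbytes
    return buf
```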
diff --git a/docs/Dialects/zlow.md b/docs/Dialects/zlow.md
index ba6907fced..4b1c3c3b81 100644
--- a/docs/Dialects/zlow.md
+++ b/docs/Dialects/zlow.md
@@ -752,6 +752,34 @@ Interfaces: `MemoryEffectOpInterface`
 | `X` | memref of 16-bit float or 32-bit float values
 | `Out` | memref of dlfloat16 type values

+### `zlow.stickifiedConstant` (::onnx_mlir::zlow::ZLowStickifiedConstantOp)
+
+_ZLow Stickified Constant operation._
+
+
+Traits: `MemRefsNormalizable`
+
+Interfaces: `KrnlGlobalOpInterface`
+
+#### Attributes:
+
+<table>
+<tr><th>Attribute</th><th>MLIR Type</th><th>Description</th></tr>
+<tr><td><code>shape</code></td><td>::mlir::Attribute</td><td>any attribute</td></tr>
+<tr><td><code>name</code></td><td>::mlir::StringAttr</td><td>string attribute</td></tr>
+<tr><td><code>stickified</code></td><td>::mlir::BoolAttr</td><td>bool attribute</td></tr>
+<tr><td><code>value</code></td><td>::mlir::Attribute</td><td>any attribute</td></tr>
+<tr><td><code>layout</code></td><td>::mlir::StringAttr</td><td>string attribute</td></tr>
+<tr><td><code>offset</code></td><td>::mlir::IntegerAttr</td><td>64-bit signless integer attribute</td></tr>
+<tr><td><code>alignment</code></td><td>::mlir::IntegerAttr</td><td>64-bit signless integer attribute</td></tr>
+</table>
+ +#### Results: + +| Result | Description | +| :----: | ----------- | +| `output` | memref of dlfloat16 type values + ### `zlow.sub` (::onnx_mlir::zlow::ZLowSubOp) _ZLow sub operation_ diff --git a/src/Dialect/ONNX/ONNXOps.td.inc b/src/Dialect/ONNX/ONNXOps.td.inc index cfd3883ed8..0516cd5f3e 100644 --- a/src/Dialect/ONNX/ONNXOps.td.inc +++ b/src/Dialect/ONNX/ONNXOps.td.inc @@ -426,7 +426,7 @@ def ONNXAveragePoolOp:ONNX_Op<"AveragePool", ``` output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: ``` @@ -2283,8 +2283,8 @@ def ONNXExpOp:ONNX_Op<"Exp", let description = [{ Calculates the exponential of the given input tensor, element-wise. }]; - let arguments = (ins AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$input); - let results = (outs AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$output); + let arguments = (ins AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$input); + let results = (outs AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$output); let builders = [ OpBuilder<(ins "Value":$input), [{ auto resultType = UnrankedTensorType::get(mlir::cast(input.getType()).getElementType()); @@ -2947,9 +2947,9 @@ def ONNXGlobalLpPoolOp:ONNX_Op<"GlobalLpPool", the values in the same channel. This is equivalent to LpPool with kernel size equal to the spatial dimension of input tensor. }]; - let arguments = (ins AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$X, + let arguments = (ins AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$X, DefaultValuedAttr:$p); - let results = (outs AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$Y); + let results = (outs AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$Y); let extraClassDeclaration = [{ static int getNumberOfOperands() { return 1; @@ -3178,7 +3178,6 @@ def ONNXGroupNormalizationOp:ONNX_Op<"GroupNormalization", groups `num_groups` should be divisible by the number of channels so that there are an equal number of channels per group. -<<<<<<< HEAD The overall computation has two stages: the first stage normalizes the elements to have zero mean and unit variance for each instance in each group, and the second stage scales and shifts the results of the first stage. The floating-point precision @@ -3235,8 +3234,6 @@ def ONNXGroupNormalizationV18Op:ONNX_Op<"GroupNormalizationV18", groups `num_groups` should be divisible by the number of channels so that there are an equal number of channels per group. -======= ->>>>>>> 20c926ee (fix: now use gen_onnx_mlir.py and change few details) When the number of groups is the same as the number of channels, this operator is equivalent to InstanceNormalization. When there is only one group, this operator is equivalent to LayerNormalization. 
@@ -4302,7 +4299,7 @@ def ONNXMatMulOp:ONNX_Op<"MatMul", [Pure, OpVersionTrait<13>, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "ONNX MatMul operation"; let description = [{ - Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html + Matrix product that behaves like [numpy.matmul](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html). }]; let arguments = (ins AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[BF16]>]>:$A, AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[UI32]>, TensorOf<[UI64]>, TensorOf<[I32]>, TensorOf<[I64]>, TensorOf<[BF16]>]>:$B); @@ -4332,7 +4329,7 @@ def ONNXMatMulIntegerOp:ONNX_Op<"MatMulInteger", [Pure, OpVersionTrait<10>, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "ONNX MatMulInteger operation"; let description = [{ - Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html. + Matrix product that behaves like [numpy.matmul](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html). The production MUST never overflow. The accumulation may overflow if and only if in 32 bits. }]; let arguments = (ins AnyTypeOf<[TensorOf<[I8]>, TensorOf<[UI8]>]>:$A, @@ -4412,7 +4409,7 @@ def ONNXMaxPoolOp:ONNX_Op<"MaxPool", ``` output_spatial_shape[i] = ceil((input_spatial_shape[i] + pad_shape[i] - dilation[i] * (kernel_shape[i] - 1) - 1) / strides_spatial_shape[i] + 1) ``` - if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. Sliding windows that would start in the right padded region are ignored. + if ceil_mode is enabled. `pad_shape[i]` is the sum of pads along axis `i`. `auto_pad` is a DEPRECATED attribute. If you are using them currently, the output spatial shape will be following when ceil_mode is enabled: ``` @@ -5982,7 +5979,7 @@ def ONNXQLinearMatMulOp:ONNX_Op<"QLinearMatMul", [Pure, OpVersionTrait<10>, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "ONNX QLinearMatMul operation"; let description = [{ - Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html. + Matrix product that behaves like [numpy.matmul](https://numpy.org/doc/stable/reference/generated/numpy.matmul.html). It consumes two quantized input tensors, their scales and zero points, scale and zero point of output, and computes the quantized output. The quantization formula is y = saturate((x / y_scale) + y_zero_point). For (x / y_scale), it is rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details. @@ -9577,8 +9574,8 @@ def ONNXTanhOp:ONNX_Op<"Tanh", let description = [{ Calculates the hyperbolic tangent of the given input tensor element-wise. 
}]; - let arguments = (ins AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$input); - let results = (outs AnyTypeOf<[TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>, TensorOf<[BF16]>]>:$output); + let arguments = (ins AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$input); + let results = (outs AnyTypeOf<[TensorOf<[BF16]>, TensorOf<[F16]>, TensorOf<[F32]>, TensorOf<[F64]>]>:$output); let extraClassDeclaration = [{ static int getNumberOfOperands() { return 1; diff --git a/test/backend/all_test_names.txt b/test/backend/all_test_names.txt index 5430789565..0daf5403ea 100644 --- a/test/backend/all_test_names.txt +++ b/test/backend/all_test_names.txt @@ -1,5 +1,5 @@ # This file is automatically generated by "make check-onnx-backend-case" -# From onnx 1.16.2 +# From onnx 1.17.0 # All test cases for cpu target test_bvlc_alexnet_cpu test_densenet121_cpu @@ -303,6 +303,8 @@ test_convtranspose_3d_cpu test_convtranspose_autopad_same_cpu test_convtranspose_cpu test_convtranspose_dilations_cpu +test_convtranspose_group_2_cpu +test_convtranspose_group_2_image_3_cpu test_convtranspose_kernel_shape_cpu test_convtranspose_output_shape_cpu test_convtranspose_pad_cpu @@ -880,6 +882,7 @@ test_reduce_max_default_axes_keepdim_example_cpu test_reduce_max_default_axes_keepdims_random_cpu test_reduce_max_do_not_keepdims_example_cpu test_reduce_max_do_not_keepdims_random_cpu +test_reduce_max_empty_set_cpu test_reduce_max_keepdims_example_cpu test_reduce_max_keepdims_random_cpu test_reduce_max_negative_axes_keepdims_example_cpu @@ -915,6 +918,7 @@ test_reduce_sum_default_axes_keepdims_example_cpu test_reduce_sum_default_axes_keepdims_random_cpu test_reduce_sum_do_not_keepdims_example_cpu test_reduce_sum_do_not_keepdims_random_cpu +test_reduce_sum_empty_axes_input_noop_cpu test_reduce_sum_empty_axes_input_noop_example_cpu test_reduce_sum_empty_set_cpu test_reduce_sum_empty_set_non_reduced_axis_zero_cpu @@ -975,6 +979,7 @@ test_resize_downsample_sizes_nearest_not_smaller_cpu test_resize_tf_crop_and_resize_axes_2_3_cpu test_resize_tf_crop_and_resize_axes_3_2_cpu test_resize_tf_crop_and_resize_cpu +test_resize_tf_crop_and_resize_extrapolation_value_cpu test_resize_upsample_scales_cubic_A_n0p5_exclude_outside_cpu test_resize_upsample_scales_cubic_align_corners_cpu test_resize_upsample_scales_cubic_asymmetric_cpu @@ -992,6 +997,7 @@ test_resize_upsample_sizes_nearest_ceil_half_pixel_cpu test_resize_upsample_sizes_nearest_cpu test_resize_upsample_sizes_nearest_floor_align_corners_cpu test_resize_upsample_sizes_nearest_not_larger_cpu +test_resize_upsample_sizes_nearest_not_smaller_cpu test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric_cpu test_reversesequence_batch_cpu test_reversesequence_time_cpu diff --git a/third_party/onnx b/third_party/onnx index 3bf92c03a9..b8baa84466 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 +1 @@ -Subproject commit 3bf92c03a9f27eba3bda1e5b9e63ea20ec213557 +Subproject commit b8baa8446686496da4cc8fda09f2b6fe65c2a02c diff --git a/utils/gen_onnx_mlir.py b/utils/gen_onnx_mlir.py index 09197932a3..45e879b0e1 100755 --- a/utils/gen_onnx_mlir.py +++ b/utils/gen_onnx_mlir.py @@ -66,7 +66,7 @@ # ==UPDATE_ONNX_VERSION_OPSET== # Look for tag above and update all references when upgrading the ONNX support within ONNX-MLIR. -current_onnx_version = "1.16.2" +current_onnx_version = "1.17.0" # Check the version of onnx package being used. if (
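The `gen_onnx_mlir.py` hunk above is truncated right at the version check (`if (`). As context for the `current_onnx_version` bump, a guard of this kind can be sketched as follows; this is a hedged approximation assuming only that the installed `onnx` package exposes `__version__`, not the script's exact logic:

```python
import onnx

current_onnx_version = "1.17.0"

# Warn when the installed onnx package differs from the version this
# script was written against (sketch only; the real check is truncated
# in the hunk above).
if onnx.__version__ != current_onnx_version:
    print(
        f"Warning: onnx {onnx.__version__} is installed, but this script "
        f"was updated for onnx {current_onnx_version}."
    )
```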