Skip to content

Conversation

@noemotiovon
Copy link
Contributor

Refactor the backward kernel to compute base pointers within the loop instead of incrementing pointers. This improves code clarity and maintainability with no change in performance.

Hardware Type: NVIDIA A100-SXM4-80GB

  • run `make test` to ensure correctness
  • run `make checkstyle` to ensure code style
  • run `make test-convergence` to ensure convergence

Refactor the backward kernel to compute base pointers within the loop
instead of incrementing pointers. This improves code clarity and
maintainability with no change in performance.
@noemotiovon
Copy link
Contributor Author

Accuracy test:
image

@noemotiovon
Copy link
Contributor Author

Benchmark:

**************************************
     BENCHMARKING SPEED for POLY_NORM
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      0.017023999243974686,
      0.02127999998629093,
      0.032255999743938446,
      0.05484800040721893,
      0.11215999722480774,
      1.7323839664459229
    ],
    "y_values_20": [
      0.0161920003592968,
      0.02006400004029274,
      0.03001599945127964,
      0.05379199981689453,
      0.11132799834012985,
      1.7294399738311768
    ],
    "y_values_80": [
      0.017535999417304993,
      0.024345600977540016,
      0.03286400064826012,
      0.05593600124120712,
      0.11299200356006622,
      1.7360320091247559
    ],
    "timestamp": "2026-01-15 08:41:42",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      0.2223680019378662,
      0.44710400700569153,
      0.8895360231399536,
      1.6623680591583252,
      3.1829280853271484,
      6.223135948181152
    ],
    "y_values_20": [
      0.22153599560260773,
      0.44441598653793335,
      0.8869311809539795,
      1.6592191457748413,
      3.1788480281829834,
      6.217222213745117
    ],
    "y_values_80": [
      0.22351999580860138,
      0.44944000244140625,
      0.8921216130256653,
      1.6655488014221191,
      3.1873600482940674,
      6.22899866104126
    ],
    "timestamp": "2026-01-15 08:41:45",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      0.8752639889717102,
      1.1498240232467651,
      1.160159945487976,
      0.841376006603241,
      0.3607040047645569,
      2.968832015991211
    ],
    "y_values_20": [
      0.5851520299911499,
      1.1405376195907593,
      1.1447999477386475,
      0.5370112061500549,
      0.3078528046607971,
      2.9647295475006104
    ],
    "y_values_80": [
      1.151968002319336,
      1.1681023836135864,
      1.1855039596557617,
      1.1590464115142822,
      0.361952006816864,
      2.973836898803711
    ],
    "timestamp": "2026-01-15 08:41:49",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      3.6947999000549316,
      3.7023680210113525,
      3.178015947341919,
      5.867248058319092,
      11.22704029083252,
      22.01611328125
    ],
    "y_values_20": [
      3.675065517425537,
      3.6949760913848877,
      3.1704959869384766,
      5.86181116104126,
      11.218175888061523,
      22.009716033935547
    ],
    "y_values_80": [
      3.721222400665283,
      3.7145793437957764,
      3.7207999229431152,
      5.8725762367248535,
      11.232134819030762,
      22.030067443847656
    ],
    "timestamp": "2026-01-15 08:41:52",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      0.3816640079021454,
      0.22230400145053864,
      0.33475199341773987,
      0.6523839831352234,
      0.6497119665145874,
      1.243008017539978
    ],
    "y_values_20": [
      0.31400319933891296,
      0.21030400693416595,
      0.179123193025589,
      0.6463296413421631,
      0.6448000073432922,
      1.239020824432373
    ],
    "y_values_80": [
      0.5448256134986877,
      0.2444159984588623,
      0.4518783986568451,
      0.669222354888916,
      0.6656448245048523,
      1.2467328310012817
    ],
    "timestamp": "2026-01-15 08:41:56",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      1.5502079725265503,
      1.1302399635314941,
      2.2904160022735596,
      4.208992004394531,
      8.048831939697266,
      15.803600311279297
    ],
    "y_values_20": [
      1.2888895273208618,
      1.126528024673462,
      2.2868800163269043,
      4.204345703125,
      8.044608116149902,
      15.798144340515137
    ],
    "y_values_80": [
      1.9874688386917114,
      1.1338560581207275,
      2.294905662536621,
      4.212857723236084,
      8.055641174316406,
      15.809037208557129
    ],
    "timestamp": "2026-01-15 08:41:59",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  }
]
**************************************
     BENCHMARKING MEMORY for POLY_NORM
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      12.029296875,
      24.029296875,
      48.029296875,
      96.029296875,
      192.029296875,
      384.029296875
    ],
    "y_values_20": [
      12.029296875,
      24.029296875,
      48.029296875,
      96.029296875,
      192.029296875,
      384.029296875
    ],
    "y_values_80": [
      12.029296875,
      24.029296875,
      48.029296875,
      96.029296875,
      192.029296875,
      384.029296875
    ],
    "timestamp": "2026-01-15 08:41:59",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  },
  {
    "kernel_name": "poly_norm",
    "kernel_provider": "huggingface",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "NVIDIA A100-SXM4-80GB",
    "x_name": "H",
    "x_label": "hidden size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      128.0263671875,
      256.0263671875,
      512.0263671875,
      1024.0263671875,
      2048.0263671875,
      4096.0263671875
    ],
    "y_values_20": [
      128.0263671875,
      256.0263671875,
      512.0263671875,
      1024.0263671875,
      2048.0263671875,
      4096.0263671875
    ],
    "y_values_80": [
      128.0263671875,
      256.0263671875,
      512.0263671875,
      1024.0263671875,
      2048.0263671875,
      4096.0263671875
    ],
    "timestamp": "2026-01-15 08:41:59",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"M\": 2048, \"dtype\": \"torch.bfloat16\", \"eps\": 1e-06}",
    "liger_version": "0.6.4"
  }
]

dy_base = dY_ptr + row_idx * dY_row_stride
x_base = X_ptr + row_idx * X_row_stride
dx_base = dX_ptr + row_idx * dX_row_stride
r_base = RSTD_ptr + row_idx * RSTD_row_stride
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: name it `rstd_base` for consistency

Suggested change
- r_base = RSTD_ptr + row_idx * RSTD_row_stride
+ rstd_base = RSTD_ptr + row_idx * RSTD_row_stride

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing this out — that was my mistake.

Copy link
Collaborator

@Tcc0403 Tcc0403 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you!

@Tcc0403 Tcc0403 merged commit 48fb31f into linkedin:main Jan 16, 2026
3 of 7 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants