TrustLLMeu
diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py
+22 -42
diff --git a/megatron/fused_kernels/scaled_masked_softmax.cpp b/megatron/fused_kernels/scaled_masked_softmax.cpp
-83
@@ -19,17 +19,18 @@ def load(args):
     # Check if cuda 11 is installed for compute capability 8.0
     cc_flag = []
     _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(
-        cpp_extension.CUDA_HOME)
+        cpp_extension.CUDA_HOME
+    )
     if int(bare_metal_major) >= 11:
-        cc_flag.append('-gencode')
-        cc_flag.append('arch=compute_80,code=sm_80')
+        cc_flag.append("-gencode")
+        cc_flag.append("arch=compute_80,code=sm_80")
         if int(bare_metal_minor) >= 7:
-            cc_flag.append('-gencode')
-            cc_flag.append('arch=compute_90,code=sm_90')
+            cc_flag.append("-gencode")
+            cc_flag.append("arch=compute_90,code=sm_90")
 
     # Build path
     srcpath = pathlib.Path(__file__).parent.absolute()
-    buildpath = srcpath / 'build'
+    buildpath = srcpath / "build"
     _create_build_dir(buildpath)
 
     # Helper function to build the kernels.
@@ -38,46 +39,25 @@ def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
             name=name,
             sources=sources,
             build_directory=buildpath,
-            extra_cflags=['-O3',],
-            extra_cuda_cflags=['-O3',
-                               '-gencode', 'arch=compute_70,code=sm_70',
-                               '--use_fast_math'] + extra_cuda_flags + cc_flag,
-            verbose=(args.rank == 0)
+            extra_cflags=[
+                "-O3",
+            ],
+            extra_cuda_cflags=[
+                "-O3",
+                "-gencode",
+                "arch=compute_70,code=sm_70",
+                "--use_fast_math",
+            ]
+            + extra_cuda_flags
+            + cc_flag,
+            verbose=(args.rank == 0),
         )
 
-    # ==============
-    # Fused softmax.
-    # ==============
-
-    if args.masked_softmax_fusion:
-        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
-                            '-U__CUDA_NO_HALF_CONVERSIONS__',
-                            '--expt-relaxed-constexpr',
-                            '--expt-extended-lambda']
-
-        # Upper triangular softmax.
-        sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp',
-                 srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
-        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_upper_triang_masked_softmax_cuda",
-            sources, extra_cuda_flags)
-
-        # Masked softmax.
-        sources=[srcpath / 'scaled_masked_softmax.cpp',
-                 srcpath / 'scaled_masked_softmax_cuda.cu']
-        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_masked_softmax_cuda", sources, extra_cuda_flags)
-
-        # Softmax
-        sources=[srcpath / 'scaled_softmax.cpp',
-                 srcpath / 'scaled_softmax_cuda.cu']
-        scaled_softmax_cuda = _cpp_extention_load_helper(
-            "scaled_softmax_cuda", sources, extra_cuda_flags)
-
 
 def _get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
-                                         universal_newlines=True)
+    raw_output = subprocess.check_output(
+        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+    )
     output = raw_output.split()
     release_idx = output.index("release") + 1
     release = output[release_idx].split(".")