add huawei ascend backend #5273

Open
wants to merge 2 commits into master
13 changes: 13 additions & 0 deletions README.md
@@ -155,6 +155,19 @@ This is the command to install pytorch nightly instead which might have performance improvements:

```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124```

### ASCEND

Ascend users should install ```cann>=7.0.0```, ```torch+cpu>=2.1.0``` and ```torch_npu```. Below are the installation reference documents:

https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha001/softwareinst/instg/instg_0001.html

https://gitee.com/ascend/pytorch
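
For example, a CPU build of torch and the matching ```torch_npu``` package can typically be installed with pip; the version pin below is only illustrative, so follow the documents above for the pairing that matches your CANN release:

```pip install torch==2.1.0 --index-url https://download.pytorch.org/whl/cpu```

```pip install torch-npu```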

This is the command to launch ComfyUI using the Ascend backend:

```python main.py --use-npu```
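
To quickly confirm that the NPU is visible to PyTorch before launching (a minimal check, assuming ```torch_npu``` is installed), you can run:

```python -c "import torch, torch_npu; print(torch.npu.is_available())"```

It should print ```True``` on a working Ascend setup.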


#### Troubleshooting

If you get the "Torch not compiled with CUDA enabled" error, uninstall torch with:
4 changes: 4 additions & 0 deletions comfy/cli_args.py
@@ -84,6 +84,10 @@ def __call__(self, parser, namespace, values, option_string=None):

parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.")

parser.add_argument("--disable-torchair-optimize", action="store_true", help="Disables torchair graph modee optimize when loading models with Ascend NPUs.")

parser.add_argument("--use-npu", action="store_true", help="use Huawei Ascend NPUs backend.")

class LatentPreviewMethod(enum.Enum):
NoPreviews = "none"
Auto = "auto"
40 changes: 40 additions & 0 deletions comfy/model_management.py
@@ -21,6 +21,8 @@
from enum import Enum
from comfy.cli_args import args
import torch
if args.use_npu:
    import torch_npu
import sys
import platform

@@ -95,6 +97,12 @@ def is_intel_xpu():
            return True
    return False

def is_ascend_npu():
    if args.use_npu and torch.npu.is_available():
        return True
    return False


def get_torch_device():
    global directml_enabled
    global cpu_state
@@ -108,6 +116,8 @@ def get_torch_device():
    else:
        if is_intel_xpu():
            return torch.device("xpu", torch.xpu.current_device())
        elif is_ascend_npu():
            return torch.device("npu", torch.npu.current_device())
        else:
            return torch.device(torch.cuda.current_device())

@@ -128,6 +138,11 @@ def get_total_memory(dev=None, torch_total_too=False):
            mem_reserved = stats['reserved_bytes.all.current']
            mem_total_torch = mem_reserved
            mem_total = torch.xpu.get_device_properties(dev).total_memory
        elif is_ascend_npu():
            stats = torch.npu.memory_stats(dev)
            mem_reserved = stats['reserved_bytes.all.current']
            mem_total = torch.npu.get_device_properties(dev).total_memory
            mem_total_torch = mem_reserved
        else:
            stats = torch.cuda.memory_stats(dev)
            mem_reserved = stats['reserved_bytes.all.current']
@@ -203,6 +218,10 @@ def is_nvidia():
    if is_intel_xpu():
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
            ENABLE_PYTORCH_ATTENTION = True
    if is_ascend_npu():
        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
            ENABLE_PYTORCH_ATTENTION = True

except:
    pass

@@ -266,6 +285,8 @@ def get_torch_device_name(device):
            return "{}".format(device.type)
    elif is_intel_xpu():
        return "{} {}".format(device, torch.xpu.get_device_name(device))
    elif is_ascend_npu():
        return "{} {}".format(device, torch.npu.get_device_name(device))
    else:
        return "CUDA {}: {}".format(device, torch.cuda.get_device_name(device))

@@ -330,6 +351,11 @@ def model_load(self, lowvram_model_memory=0, force_patch_weights=False):
            with torch.no_grad():
                self.real_model = ipex.optimize(self.real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)

        if is_ascend_npu() and not args.disable_torchair_optimize:  # compile with the torchair graph-mode backend
            import torchair as tng
            npu_backend = tng.get_npu_backend()
            self.real_model = torch.compile(self.real_model.eval(), backend=npu_backend, dynamic=False)

        self.weights_loaded = True
        return self.real_model

@@ -865,6 +891,8 @@ def xformers_enabled():
        return False
    if directml_enabled:
        return False
    if is_ascend_npu():
        return False
    return XFORMERS_IS_AVAILABLE


@@ -921,6 +949,13 @@ def get_free_memory(dev=None, torch_free_too=False):
            mem_free_torch = mem_reserved - mem_active
            mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved
            mem_free_total = mem_free_xpu + mem_free_torch
        elif is_ascend_npu():
            stats = torch.npu.memory_stats(dev)
            mem_active = stats['active_bytes.all.current']
            mem_reserved = stats['reserved_bytes.all.current']
            mem_free_npu, _ = torch.npu.mem_get_info(dev)
            mem_free_torch = mem_reserved - mem_active
            mem_free_total = mem_free_npu + mem_free_torch
        else:
            stats = torch.cuda.memory_stats(dev)
            mem_active = stats['active_bytes.all.current']
@@ -986,6 +1021,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
    if is_intel_xpu():
        return True

    if is_ascend_npu():
        return True

    if torch.version.hip:
        return True

@@ -1085,6 +1123,8 @@ def soft_empty_cache(force=False):
        torch.mps.empty_cache()
    elif is_intel_xpu():
        torch.xpu.empty_cache()
    elif is_ascend_npu():
        torch.npu.empty_cache()
    elif torch.cuda.is_available():
        if force or is_nvidia(): #This seems to make things worse on ROCm so I only do it for cuda
            torch.cuda.empty_cache()