diff --git a/cuda_malloc.py b/cuda_malloc.py
new file mode 100644
index 000000000..fae5d73cf
--- /dev/null
+++ b/cuda_malloc.py
@@ -0,0 +1,119 @@
+# from comfyui with minor modification
+import os
+import importlib.util
+import subprocess
+
+from modules import cmd_args
+
+#Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import.
+def get_gpu_names():
+    """Return the set of display adapter / GPU name strings found on this machine.
+
+    On Windows the names come from user32 EnumDisplayDevicesA; on every other
+    platform they come from the output of ``nvidia-smi -L``. Errors (for example
+    a missing ``nvidia-smi`` executable) propagate to the caller.
+    """
+    if os.name == 'nt':
+        import ctypes
+
+        # Define necessary C structures and types
+        class DISPLAY_DEVICEA(ctypes.Structure):
+            _fields_ = [
+                ('cb', ctypes.c_ulong),
+                ('DeviceName', ctypes.c_char * 32),
+                ('DeviceString', ctypes.c_char * 128),
+                ('StateFlags', ctypes.c_ulong),
+                ('DeviceID', ctypes.c_char * 128),
+                ('DeviceKey', ctypes.c_char * 128)
+            ]
+
+        # Load user32.dll
+        user32 = ctypes.windll.user32
+
+        # Call EnumDisplayDevicesA
+        def enum_display_devices():
+            device_info = DISPLAY_DEVICEA()
+            device_info.cb = ctypes.sizeof(device_info)
+            device_index = 0
+            gpu_names = set()
+
+            while user32.EnumDisplayDevicesA(None, device_index, ctypes.byref(device_info), 0):
+                device_index += 1
+                # DeviceString is a raw byte buffer that is not guaranteed to be valid UTF-8;
+                # replace undecodable bytes instead of aborting the whole enumeration.
+                gpu_names.add(device_info.DeviceString.decode('utf-8', errors='replace'))
+            return gpu_names
+        return enum_display_devices()
+    else:
+        gpu_names = set()
+        out = subprocess.check_output(['nvidia-smi', '-L'])
+        for line in out.split(b'\n'):
+            if len(line) > 0:
+                gpu_names.add(line.decode('utf-8').split(' (UUID')[0])
+        return gpu_names
+
+blacklist = {"GeForce GTX TITAN X", "GeForce GTX 980", "GeForce GTX 970", "GeForce GTX 960", "GeForce GTX 950", "GeForce 945M",
+             "GeForce 940M", "GeForce 930M", "GeForce 920M", "GeForce 910M", "GeForce GTX 750", "GeForce GTX 745", "Quadro K620",
+             "Quadro K1200", "Quadro K2200", "Quadro M500", "Quadro M520", "Quadro M600", "Quadro M620", "Quadro M1000",
+             "Quadro M1200", "Quadro M2000", "Quadro M2200", "Quadro M3000", "Quadro M4000", "Quadro M5000", "Quadro M5500", "Quadro M6000",
+             "GeForce MX110", "GeForce MX130", "GeForce 830M", "GeForce 840M", "GeForce GTX 850M", "GeForce GTX 860M",
+             "GeForce GTX 1650", "GeForce GTX 1630", "Tesla M4", "Tesla M6", "Tesla M10", "Tesla M40", "Tesla M60"
+             }
+
+def cuda_malloc_supported():
+    """Return False if a detected NVIDIA GPU name contains a blacklisted model name, else True.
+
+    If the GPU names cannot be determined, no GPU is treated as blacklisted and True is returned.
+    """
+    try:
+        names = get_gpu_names()
+    except Exception:
+        names = set()
+    for x in names:
+        if "NVIDIA" in x:
+            for b in blacklist:
+                if b in x:
+                    return False
+    return True
+
+
+parser = cmd_args.parser
+args, _ = parser.parse_known_args()
+
+
+# Auto-detection is skipped when the user chose explicitly (--cuda-malloc / --disable-cuda-malloc).
+if not args.cuda_malloc and not args.disable_cuda_malloc:
+    try:
+        version = ""
+        torch_spec = importlib.util.find_spec("torch")
+        for folder in torch_spec.submodule_search_locations:
+            ver_file = os.path.join(folder, "version.py")
+            if os.path.isfile(ver_file):
+                spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
+                module = importlib.util.module_from_spec(spec)
+                spec.loader.exec_module(module)
+                version = module.__version__
+        # Parse the whole major component; version[0] alone would read "10.0" as 1.
+        if int(version.split(".")[0]) >= 2: #enable by default for torch version 2.0 and up
+            args.cuda_malloc = cuda_malloc_supported()
+    except Exception:
+        # Best effort: if torch or its version cannot be found, leave cuda malloc off.
+        pass
+
+def init_cuda_malloc():
+    """Add ``backend:cudaMallocAsync`` to PYTORCH_CUDA_ALLOC_CONF when cuda malloc is enabled.
+
+    Does nothing if cuda malloc is not enabled or --disable-cuda-malloc was given.
+    Any existing value of the variable is kept and the backend setting is appended to it.
+    """
+    if args.cuda_malloc and not args.disable_cuda_malloc:
+        backend = "backend:cudaMallocAsync"
+        env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None)
+        if env_var is None:
+            env_var = backend
+        elif backend not in env_var.split(","):
+            # Append only once so repeated calls (or a pre-set value) do not duplicate the entry.
+            env_var += "," + backend
+
+        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var
+        print(f"Setup environment PYTORCH_CUDA_ALLOC_CONF={env_var}")
diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index 2ebe267cc..8b9a7b6e6 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -9,6 +9,9 @@ parser.add_argument("-f", action='store_true', help=argparse.SUPPRESS) # allows
 parser.add_argument("--update-all-extensions", action='store_true', help="launch.py argument: download updates for all extensions when starting the program")
 parser.add_argument("--skip-python-version-check", action='store_true', help="launch.py argument: do not check python version")
 parser.add_argument("--skip-torch-cuda-test", action='store_true', help="launch.py argument: do not check if CUDA is able to work properly")
+cm_group = parser.add_mutually_exclusive_group()
+cm_group.add_argument("--cuda-malloc", action='store_true', help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
+cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.")
 parser.add_argument("--reinstall-xformers", action='store_true', help="launch.py argument: install the appropriate version of xformers even if you have some version already installed")
 parser.add_argument("--reinstall-torch", action='store_true', help="launch.py argument: install the appropriate version of torch even if you have some version already installed")
 parser.add_argument("--update-check", action='store_true', help="launch.py argument: check for updates at startup")
diff --git a/webui.py b/webui.py
index 421e3b833..3713750c3 100644
--- a/webui.py
+++ b/webui.py
@@ -10,6 +10,10 @@ from modules import initialize
 startup_timer = timer.startup_timer
 startup_timer.record("launcher")
 
+from cuda_malloc import init_cuda_malloc
+init_cuda_malloc()
+startup_timer.record("cuda_malloc")
+
 initialize.imports()
 
 initialize.check_versions()