From 7aa27b000a3087dcb5cc7254600064bf70cacd3e Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Mon, 25 Dec 2023 14:44:15 +0200
Subject: [PATCH 1/7] Add types to split_grid

---
 modules/images.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/modules/images.py b/modules/images.py
index 16f9ae7cc..d30e8865d 100644
--- a/modules/images.py
+++ b/modules/images.py
@@ -64,9 +64,8 @@ def image_grid(imgs, batch_size=1, rows=None):
 Grid = namedtuple("Grid", ["tiles", "tile_w", "tile_h", "image_w", "image_h", "overlap"])
 
 
-def split_grid(image, tile_w=512, tile_h=512, overlap=64):
-    w = image.width
-    h = image.height
+def split_grid(image: Image.Image, tile_w: int = 512, tile_h: int = 512, overlap: int = 64) -> Grid:
+    w, h = image.size
 
     non_overlap_width = tile_w - overlap
     non_overlap_height = tile_h - overlap

From 12c6f37f8e4b1d1d643c9d8d5dfc763c3203c728 Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Wed, 27 Dec 2023 11:01:45 +0200
Subject: [PATCH 2/7] Add tile_count property to Grid

---
 modules/images.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/modules/images.py b/modules/images.py
index d30e8865d..87a7bf221 100644
--- a/modules/images.py
+++ b/modules/images.py
@@ -61,7 +61,13 @@ def image_grid(imgs, batch_size=1, rows=None):
     return grid
 
 
-Grid = namedtuple("Grid", ["tiles", "tile_w", "tile_h", "image_w", "image_h", "overlap"])
+class Grid(namedtuple("_Grid", ["tiles", "tile_w", "tile_h", "image_w", "image_h", "overlap"])):
+    @property
+    def tile_count(self) -> int:
+        """
+        The total number of tiles in the grid.
+        """
+        return sum(len(row[2]) for row in self.tiles)
 
 
 def split_grid(image: Image.Image, tile_w: int = 512, tile_h: int = 512, overlap: int = 64) -> Grid:

From e472383acbb9e07dca311abe5fb16ee2675e410a Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Wed, 27 Dec 2023 11:04:33 +0200
Subject: [PATCH 3/7] Refactor esrgan_upscale to more generic
 upscale_with_model

---
 modules/esrgan_model.py   | 47 +++++-----------------------
 modules/upscaler_utils.py | 66 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 39 deletions(-)
 create mode 100644 modules/upscaler_utils.py

diff --git a/modules/esrgan_model.py b/modules/esrgan_model.py
index 02a1727d2..c0d22a992 100644
--- a/modules/esrgan_model.py
+++ b/modules/esrgan_model.py
@@ -1,13 +1,12 @@
 import sys
 
-import numpy as np
 import torch
-from PIL import Image
 
 import modules.esrgan_model_arch as arch
-from modules import modelloader, images, devices
+from modules import modelloader, devices
 from modules.shared import opts
 from modules.upscaler import Upscaler, UpscalerData
+from modules.upscaler_utils import upscale_with_model
 
 
 def mod2normal(state_dict):
@@ -190,40 +189,10 @@ class UpscalerESRGAN(Upscaler):
         return model
 
 
-def upscale_without_tiling(model, img):
-    img = np.array(img)
-    img = img[:, :, ::-1]
-    img = np.ascontiguousarray(np.transpose(img, (2, 0, 1))) / 255
-    img = torch.from_numpy(img).float()
-    img = img.unsqueeze(0).to(devices.device_esrgan)
-    with torch.no_grad():
-        output = model(img)
-    output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
-    output = 255. * np.moveaxis(output, 0, 2)
-    output = output.astype(np.uint8)
-    output = output[:, :, ::-1]
-    return Image.fromarray(output, 'RGB')
-
-
 def esrgan_upscale(model, img):
-    if opts.ESRGAN_tile == 0:
-        return upscale_without_tiling(model, img)
-
-    grid = images.split_grid(img, opts.ESRGAN_tile, opts.ESRGAN_tile, opts.ESRGAN_tile_overlap)
-    newtiles = []
-    scale_factor = 1
-
-    for y, h, row in grid.tiles:
-        newrow = []
-        for tiledata in row:
-            x, w, tile = tiledata
-
-            output = upscale_without_tiling(model, tile)
-            scale_factor = output.width // tile.width
-
-            newrow.append([x * scale_factor, w * scale_factor, output])
-        newtiles.append([y * scale_factor, h * scale_factor, newrow])
-
-    newgrid = images.Grid(newtiles, grid.tile_w * scale_factor, grid.tile_h * scale_factor, grid.image_w * scale_factor, grid.image_h * scale_factor, grid.overlap * scale_factor)
-    output = images.combine_grid(newgrid)
-    return output
+    return upscale_with_model(
+        model,
+        img,
+        tile_size=opts.ESRGAN_tile,
+        tile_overlap=opts.ESRGAN_tile_overlap,
+    )
diff --git a/modules/upscaler_utils.py b/modules/upscaler_utils.py
new file mode 100644
index 000000000..8bdda51c4
--- /dev/null
+++ b/modules/upscaler_utils.py
@@ -0,0 +1,66 @@
+import logging
+from typing import Callable
+
+import numpy as np
+import torch
+import tqdm
+from PIL import Image
+
+from modules import devices, images
+
+logger = logging.getLogger(__name__)
+
+
+def upscale_without_tiling(model, img: Image.Image):
+    img = np.array(img)
+    img = img[:, :, ::-1]
+    img = np.ascontiguousarray(np.transpose(img, (2, 0, 1))) / 255
+    img = torch.from_numpy(img).float()
+    img = img.unsqueeze(0).to(devices.device_esrgan)
+    with torch.no_grad():
+        output = model(img)
+    output = output.squeeze().float().cpu().clamp_(0, 1).numpy()
+    output = 255. * np.moveaxis(output, 0, 2)
+    output = output.astype(np.uint8)
+    output = output[:, :, ::-1]
+    return Image.fromarray(output, 'RGB')
+
+
+def upscale_with_model(
+    model: Callable[[torch.Tensor], torch.Tensor],
+    img: Image.Image,
+    *,
+    tile_size: int,
+    tile_overlap: int = 0,
+    desc="tiled upscale",
+) -> Image.Image:
+    if tile_size <= 0:
+        logger.debug("Upscaling %s without tiling", img)
+        output = upscale_without_tiling(model, img)
+        logger.debug("=> %s", output)
+        return output
+
+    grid = images.split_grid(img, tile_size, tile_size, tile_overlap)
+    newtiles = []
+
+    with tqdm.tqdm(total=grid.tile_count, desc=desc) as p:
+        for y, h, row in grid.tiles:
+            newrow = []
+            for x, w, tile in row:
+                logger.debug("Tile (%d, %d) %s...", x, y, tile)
+                output = upscale_without_tiling(model, tile)
+                scale_factor = output.width // tile.width
+                logger.debug("=> %s (scale factor %s)", output, scale_factor)
+                newrow.append([x * scale_factor, w * scale_factor, output])
+                p.update(1)
+            newtiles.append([y * scale_factor, h * scale_factor, newrow])
+
+    newgrid = images.Grid(
+        newtiles,
+        tile_w=grid.tile_w * scale_factor,
+        tile_h=grid.tile_h * scale_factor,
+        image_w=grid.image_w * scale_factor,
+        image_h=grid.image_h * scale_factor,
+        overlap=grid.overlap * scale_factor,
+    )
+    return images.combine_grid(newgrid)

From b0f59342346b1c8b405f97c0e0bb01c6ae05c601 Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Mon, 25 Dec 2023 14:43:51 +0200
Subject: [PATCH 4/7] Use Spandrel for upscaling and face restoration
 architectures (aside from GFPGAN and LDSR)

---
 .../ScuNET/scripts/scunet_model.py            |   13 +-
 .../ScuNET/scunet_model_arch.py               |  268 -----
 .../SwinIR/scripts/swinir_model.py            |  126 +-
 .../SwinIR/swinir_model_arch.py               |  867 --------------
 .../SwinIR/swinir_model_arch_v2.py            | 1017 -----------------
 modules/codeformer/codeformer_arch.py         |  276 -----
 modules/codeformer/vqgan_arch.py              |  435 -------
 modules/codeformer_model.py                   |  223 ++--
 modules/esrgan_model.py                       |  153 +--
 modules/esrgan_model_arch.py                  |  465 --------
 modules/gfpgan_model.py                       |   13 +-
 modules/launch_utils.py                       |    7 -
 modules/modelloader.py                        |   16 +
 modules/paths.py                              |    1 -
 modules/realesrgan_model.py                   |  153 +--
 modules/sysinfo.py                            |    2 -
 modules/upscaler.py                           |    3 +
 requirements.txt                              |    3 +-
 requirements_versions.txt                     |    4 +-
 19 files changed, 277 insertions(+), 3768 deletions(-)
 delete mode 100644 extensions-builtin/ScuNET/scunet_model_arch.py
 delete mode 100644 extensions-builtin/SwinIR/swinir_model_arch.py
 delete mode 100644 extensions-builtin/SwinIR/swinir_model_arch_v2.py
 delete mode 100644 modules/codeformer/codeformer_arch.py
 delete mode 100644 modules/codeformer/vqgan_arch.py
 delete mode 100644 modules/esrgan_model_arch.py

diff --git a/extensions-builtin/ScuNET/scripts/scunet_model.py b/extensions-builtin/ScuNET/scripts/scunet_model.py
index 167d2f64b..18cf8e1a0 100644
--- a/extensions-builtin/ScuNET/scripts/scunet_model.py
+++ b/extensions-builtin/ScuNET/scripts/scunet_model.py
@@ -7,9 +7,7 @@ from tqdm import tqdm
 
 import modules.upscaler
 from modules import devices, modelloader, script_callbacks, errors
-from scunet_model_arch import SCUNet
 
-from modules.modelloader import load_file_from_url
 from modules.shared import opts
 
 
@@ -120,17 +118,10 @@ class UpscalerScuNET(modules.upscaler.Upscaler):
         device = devices.get_device_for('scunet')
         if path.startswith("http"):
             # TODO: this doesn't use `path` at all?
-            filename = load_file_from_url(self.model_url, model_dir=self.model_download_path, file_name=f"{self.name}.pth")
+            filename = modelloader.load_file_from_url(self.model_url, model_dir=self.model_download_path, file_name=f"{self.name}.pth")
         else:
             filename = path
-        model = SCUNet(in_nc=3, config=[4, 4, 4, 4, 4, 4, 4], dim=64)
-        model.load_state_dict(torch.load(filename), strict=True)
-        model.eval()
-        for _, v in model.named_parameters():
-            v.requires_grad = False
-        model = model.to(device)
-
-        return model
+        return modelloader.load_spandrel_model(filename, device=device)
 
 
 def on_ui_settings():
diff --git a/extensions-builtin/ScuNET/scunet_model_arch.py b/extensions-builtin/ScuNET/scunet_model_arch.py
deleted file mode 100644
index b51a88062..000000000
--- a/extensions-builtin/ScuNET/scunet_model_arch.py
+++ /dev/null
@@ -1,268 +0,0 @@
-# -*- coding: utf-8 -*-
-import numpy as np
-import torch
-import torch.nn as nn
-from einops import rearrange
-from einops.layers.torch import Rearrange
-from timm.models.layers import trunc_normal_, DropPath
-
-
-class WMSA(nn.Module):
-    """ Self-attention module in Swin Transformer
-    """
-
-    def __init__(self, input_dim, output_dim, head_dim, window_size, type):
-        super(WMSA, self).__init__()
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        self.head_dim = head_dim
-        self.scale = self.head_dim ** -0.5
-        self.n_heads = input_dim // head_dim
-        self.window_size = window_size
-        self.type = type
-        self.embedding_layer = nn.Linear(self.input_dim, 3 * self.input_dim, bias=True)
-
-        self.relative_position_params = nn.Parameter(
-            torch.zeros((2 * window_size - 1) * (2 * window_size - 1), self.n_heads))
-
-        self.linear = nn.Linear(self.input_dim, self.output_dim)
-
-        trunc_normal_(self.relative_position_params, std=.02)
-        self.relative_position_params = torch.nn.Parameter(
-            self.relative_position_params.view(2 * window_size - 1, 2 * window_size - 1, self.n_heads).transpose(1,
-                                                                                                                 2).transpose(
-                0, 1))
-
-    def generate_mask(self, h, w, p, shift):
-        """ generating the mask of SW-MSA
-        Args:
-            shift: shift parameters in CyclicShift.
-        Returns:
-            attn_mask: should be (1 1 w p p),
-        """
-        # supporting square.
-        attn_mask = torch.zeros(h, w, p, p, p, p, dtype=torch.bool, device=self.relative_position_params.device)
-        if self.type == 'W':
-            return attn_mask
-
-        s = p - shift
-        attn_mask[-1, :, :s, :, s:, :] = True
-        attn_mask[-1, :, s:, :, :s, :] = True
-        attn_mask[:, -1, :, :s, :, s:] = True
-        attn_mask[:, -1, :, s:, :, :s] = True
-        attn_mask = rearrange(attn_mask, 'w1 w2 p1 p2 p3 p4 -> 1 1 (w1 w2) (p1 p2) (p3 p4)')
-        return attn_mask
-
-    def forward(self, x):
-        """ Forward pass of Window Multi-head Self-attention module.
-        Args:
-            x: input tensor with shape of [b h w c];
-            attn_mask: attention mask, fill -inf where the value is True;
-        Returns:
-            output: tensor shape [b h w c]
-        """
-        if self.type != 'W':
-            x = torch.roll(x, shifts=(-(self.window_size // 2), -(self.window_size // 2)), dims=(1, 2))
-
-        x = rearrange(x, 'b (w1 p1) (w2 p2) c -> b w1 w2 p1 p2 c', p1=self.window_size, p2=self.window_size)
-        h_windows = x.size(1)
-        w_windows = x.size(2)
-        # square validation
-        # assert h_windows == w_windows
-
-        x = rearrange(x, 'b w1 w2 p1 p2 c -> b (w1 w2) (p1 p2) c', p1=self.window_size, p2=self.window_size)
-        qkv = self.embedding_layer(x)
-        q, k, v = rearrange(qkv, 'b nw np (threeh c) -> threeh b nw np c', c=self.head_dim).chunk(3, dim=0)
-        sim = torch.einsum('hbwpc,hbwqc->hbwpq', q, k) * self.scale
-        # Adding learnable relative embedding
-        sim = sim + rearrange(self.relative_embedding(), 'h p q -> h 1 1 p q')
-        # Using Attn Mask to distinguish different subwindows.
-        if self.type != 'W':
-            attn_mask = self.generate_mask(h_windows, w_windows, self.window_size, shift=self.window_size // 2)
-            sim = sim.masked_fill_(attn_mask, float("-inf"))
-
-        probs = nn.functional.softmax(sim, dim=-1)
-        output = torch.einsum('hbwij,hbwjc->hbwic', probs, v)
-        output = rearrange(output, 'h b w p c -> b w p (h c)')
-        output = self.linear(output)
-        output = rearrange(output, 'b (w1 w2) (p1 p2) c -> b (w1 p1) (w2 p2) c', w1=h_windows, p1=self.window_size)
-
-        if self.type != 'W':
-            output = torch.roll(output, shifts=(self.window_size // 2, self.window_size // 2), dims=(1, 2))
-
-        return output
-
-    def relative_embedding(self):
-        cord = torch.tensor(np.array([[i, j] for i in range(self.window_size) for j in range(self.window_size)]))
-        relation = cord[:, None, :] - cord[None, :, :] + self.window_size - 1
-        # negative is allowed
-        return self.relative_position_params[:, relation[:, :, 0].long(), relation[:, :, 1].long()]
-
-
-class Block(nn.Module):
-    def __init__(self, input_dim, output_dim, head_dim, window_size, drop_path, type='W', input_resolution=None):
-        """ SwinTransformer Block
-        """
-        super(Block, self).__init__()
-        self.input_dim = input_dim
-        self.output_dim = output_dim
-        assert type in ['W', 'SW']
-        self.type = type
-        if input_resolution <= window_size:
-            self.type = 'W'
-
-        self.ln1 = nn.LayerNorm(input_dim)
-        self.msa = WMSA(input_dim, input_dim, head_dim, window_size, self.type)
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.ln2 = nn.LayerNorm(input_dim)
-        self.mlp = nn.Sequential(
-            nn.Linear(input_dim, 4 * input_dim),
-            nn.GELU(),
-            nn.Linear(4 * input_dim, output_dim),
-        )
-
-    def forward(self, x):
-        x = x + self.drop_path(self.msa(self.ln1(x)))
-        x = x + self.drop_path(self.mlp(self.ln2(x)))
-        return x
-
-
-class ConvTransBlock(nn.Module):
-    def __init__(self, conv_dim, trans_dim, head_dim, window_size, drop_path, type='W', input_resolution=None):
-        """ SwinTransformer and Conv Block
-        """
-        super(ConvTransBlock, self).__init__()
-        self.conv_dim = conv_dim
-        self.trans_dim = trans_dim
-        self.head_dim = head_dim
-        self.window_size = window_size
-        self.drop_path = drop_path
-        self.type = type
-        self.input_resolution = input_resolution
-
-        assert self.type in ['W', 'SW']
-        if self.input_resolution <= self.window_size:
-            self.type = 'W'
-
-        self.trans_block = Block(self.trans_dim, self.trans_dim, self.head_dim, self.window_size, self.drop_path,
-                                 self.type, self.input_resolution)
-        self.conv1_1 = nn.Conv2d(self.conv_dim + self.trans_dim, self.conv_dim + self.trans_dim, 1, 1, 0, bias=True)
-        self.conv1_2 = nn.Conv2d(self.conv_dim + self.trans_dim, self.conv_dim + self.trans_dim, 1, 1, 0, bias=True)
-
-        self.conv_block = nn.Sequential(
-            nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False),
-            nn.ReLU(True),
-            nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False)
-        )
-
-    def forward(self, x):
-        conv_x, trans_x = torch.split(self.conv1_1(x), (self.conv_dim, self.trans_dim), dim=1)
-        conv_x = self.conv_block(conv_x) + conv_x
-        trans_x = Rearrange('b c h w -> b h w c')(trans_x)
-        trans_x = self.trans_block(trans_x)
-        trans_x = Rearrange('b h w c -> b c h w')(trans_x)
-        res = self.conv1_2(torch.cat((conv_x, trans_x), dim=1))
-        x = x + res
-
-        return x
-
-
-class SCUNet(nn.Module):
-    # def __init__(self, in_nc=3, config=[2, 2, 2, 2, 2, 2, 2], dim=64, drop_path_rate=0.0, input_resolution=256):
-    def __init__(self, in_nc=3, config=None, dim=64, drop_path_rate=0.0, input_resolution=256):
-        super(SCUNet, self).__init__()
-        if config is None:
-            config = [2, 2, 2, 2, 2, 2, 2]
-        self.config = config
-        self.dim = dim
-        self.head_dim = 32
-        self.window_size = 8
-
-        # drop path rate for each layer
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(config))]
-
-        self.m_head = [nn.Conv2d(in_nc, dim, 3, 1, 1, bias=False)]
-
-        begin = 0
-        self.m_down1 = [ConvTransBlock(dim // 2, dim // 2, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution)
-                        for i in range(config[0])] + \
-                       [nn.Conv2d(dim, 2 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[0]
-        self.m_down2 = [ConvTransBlock(dim, dim, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution // 2)
-                        for i in range(config[1])] + \
-                       [nn.Conv2d(2 * dim, 4 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[1]
-        self.m_down3 = [ConvTransBlock(2 * dim, 2 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                       'W' if not i % 2 else 'SW', input_resolution // 4)
-                        for i in range(config[2])] + \
-                       [nn.Conv2d(4 * dim, 8 * dim, 2, 2, 0, bias=False)]
-
-        begin += config[2]
-        self.m_body = [ConvTransBlock(4 * dim, 4 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                      'W' if not i % 2 else 'SW', input_resolution // 8)
-                       for i in range(config[3])]
-
-        begin += config[3]
-        self.m_up3 = [nn.ConvTranspose2d(8 * dim, 4 * dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(2 * dim, 2 * dim, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution // 4)
-                      for i in range(config[4])]
-
-        begin += config[4]
-        self.m_up2 = [nn.ConvTranspose2d(4 * dim, 2 * dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(dim, dim, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution // 2)
-                      for i in range(config[5])]
-
-        begin += config[5]
-        self.m_up1 = [nn.ConvTranspose2d(2 * dim, dim, 2, 2, 0, bias=False), ] + \
-                     [ConvTransBlock(dim // 2, dim // 2, self.head_dim, self.window_size, dpr[i + begin],
-                                     'W' if not i % 2 else 'SW', input_resolution)
-                      for i in range(config[6])]
-
-        self.m_tail = [nn.Conv2d(dim, in_nc, 3, 1, 1, bias=False)]
-
-        self.m_head = nn.Sequential(*self.m_head)
-        self.m_down1 = nn.Sequential(*self.m_down1)
-        self.m_down2 = nn.Sequential(*self.m_down2)
-        self.m_down3 = nn.Sequential(*self.m_down3)
-        self.m_body = nn.Sequential(*self.m_body)
-        self.m_up3 = nn.Sequential(*self.m_up3)
-        self.m_up2 = nn.Sequential(*self.m_up2)
-        self.m_up1 = nn.Sequential(*self.m_up1)
-        self.m_tail = nn.Sequential(*self.m_tail)
-        # self.apply(self._init_weights)
-
-    def forward(self, x0):
-
-        h, w = x0.size()[-2:]
-        paddingBottom = int(np.ceil(h / 64) * 64 - h)
-        paddingRight = int(np.ceil(w / 64) * 64 - w)
-        x0 = nn.ReplicationPad2d((0, paddingRight, 0, paddingBottom))(x0)
-
-        x1 = self.m_head(x0)
-        x2 = self.m_down1(x1)
-        x3 = self.m_down2(x2)
-        x4 = self.m_down3(x3)
-        x = self.m_body(x4)
-        x = self.m_up3(x + x4)
-        x = self.m_up2(x + x3)
-        x = self.m_up1(x + x2)
-        x = self.m_tail(x + x1)
-
-        x = x[..., :h, :w]
-
-        return x
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
diff --git a/extensions-builtin/SwinIR/scripts/swinir_model.py b/extensions-builtin/SwinIR/scripts/swinir_model.py
index ae0d0e6a8..85c18b9e9 100644
--- a/extensions-builtin/SwinIR/scripts/swinir_model.py
+++ b/extensions-builtin/SwinIR/scripts/swinir_model.py
@@ -1,5 +1,5 @@
+import logging
 import sys
-import platform
 
 import numpy as np
 import torch
@@ -8,13 +8,11 @@ from tqdm import tqdm
 
 from modules import modelloader, devices, script_callbacks, shared
 from modules.shared import opts, state
-from swinir_model_arch import SwinIR
-from swinir_model_arch_v2 import Swin2SR
 from modules.upscaler import Upscaler, UpscalerData
 
 SWINIR_MODEL_URL = "https://github.com/JingyunLiang/SwinIR/releases/download/v0.0/003_realSR_BSRGAN_DFOWMFC_s64w8_SwinIR-L_x4_GAN.pth"
 
-device_swinir = devices.get_device_for('swinir')
+logger = logging.getLogger(__name__)
 
 
 class UpscalerSwinIR(Upscaler):
@@ -37,26 +35,29 @@ class UpscalerSwinIR(Upscaler):
             scalers.append(model_data)
         self.scalers = scalers
 
-    def do_upscale(self, img, model_file):
-        use_compile = hasattr(opts, 'SWIN_torch_compile') and opts.SWIN_torch_compile \
-            and int(torch.__version__.split('.')[0]) >= 2 and platform.system() != "Windows"
+    def do_upscale(self, img: Image.Image, model_file: str) -> Image.Image:
         current_config = (model_file, opts.SWIN_tile)
 
-        if use_compile and self._cached_model_config == current_config:
+        device = self._get_device()
+
+        if self._cached_model_config == current_config:
             model = self._cached_model
         else:
-            self._cached_model = None
             try:
                 model = self.load_model(model_file)
             except Exception as e:
                 print(f"Failed loading SwinIR model {model_file}: {e}", file=sys.stderr)
                 return img
-            model = model.to(device_swinir, dtype=devices.dtype)
-            if use_compile:
-                model = torch.compile(model)
-                self._cached_model = model
-                self._cached_model_config = current_config
-        img = upscale(img, model)
+            self._cached_model = model
+            self._cached_model_config = current_config
+
+        img = upscale(
+            img,
+            model,
+            tile=opts.SWIN_tile,
+            tile_overlap=opts.SWIN_tile_overlap,
+            device=device,
+        )
         devices.torch_gc()
         return img
 
@@ -69,69 +70,54 @@ class UpscalerSwinIR(Upscaler):
             )
         else:
             filename = path
-        if filename.endswith(".v2.pth"):
-            model = Swin2SR(
-                upscale=scale,
-                in_chans=3,
-                img_size=64,
-                window_size=8,
-                img_range=1.0,
-                depths=[6, 6, 6, 6, 6, 6],
-                embed_dim=180,
-                num_heads=[6, 6, 6, 6, 6, 6],
-                mlp_ratio=2,
-                upsampler="nearest+conv",
-                resi_connection="1conv",
-            )
-            params = None
-        else:
-            model = SwinIR(
-                upscale=scale,
-                in_chans=3,
-                img_size=64,
-                window_size=8,
-                img_range=1.0,
-                depths=[6, 6, 6, 6, 6, 6, 6, 6, 6],
-                embed_dim=240,
-                num_heads=[8, 8, 8, 8, 8, 8, 8, 8, 8],
-                mlp_ratio=2,
-                upsampler="nearest+conv",
-                resi_connection="3conv",
-            )
-            params = "params_ema"
 
-        pretrained_model = torch.load(filename)
-        if params is not None:
-            model.load_state_dict(pretrained_model[params], strict=True)
-        else:
-            model.load_state_dict(pretrained_model, strict=True)
+        model = modelloader.load_spandrel_model(
+            filename,
+            device=self._get_device(),
+            dtype=devices.dtype,
+        )
+        if getattr(opts, 'SWIN_torch_compile', False):
+            try:
+                model = torch.compile(model)
+            except Exception:
+                logger.warning("Failed to compile SwinIR model, fallback to JIT", exc_info=True)
         return model
 
+    def _get_device(self):
+        return devices.get_device_for('swinir')
+
 
 def upscale(
-        img,
-        model,
-        tile=None,
-        tile_overlap=None,
-        window_size=8,
-        scale=4,
+    img,
+    model,
+    *,
+    tile: int,
+    tile_overlap: int,
+    window_size=8,
+    scale=4,
+    device,
 ):
-    tile = tile or opts.SWIN_tile
-    tile_overlap = tile_overlap or opts.SWIN_tile_overlap
-
 
     img = np.array(img)
     img = img[:, :, ::-1]
     img = np.moveaxis(img, 2, 0) / 255
     img = torch.from_numpy(img).float()
-    img = img.unsqueeze(0).to(device_swinir, dtype=devices.dtype)
+    img = img.unsqueeze(0).to(device, dtype=devices.dtype)
     with torch.no_grad(), devices.autocast():
         _, _, h_old, w_old = img.size()
         h_pad = (h_old // window_size + 1) * window_size - h_old
         w_pad = (w_old // window_size + 1) * window_size - w_old
         img = torch.cat([img, torch.flip(img, [2])], 2)[:, :, : h_old + h_pad, :]
         img = torch.cat([img, torch.flip(img, [3])], 3)[:, :, :, : w_old + w_pad]
-        output = inference(img, model, tile, tile_overlap, window_size, scale)
+        output = inference(
+            img,
+            model,
+            tile=tile,
+            tile_overlap=tile_overlap,
+            window_size=window_size,
+            scale=scale,
+            device=device,
+        )
         output = output[..., : h_old * scale, : w_old * scale]
         output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
         if output.ndim == 3:
@@ -142,7 +128,16 @@ def upscale(
         return Image.fromarray(output, "RGB")
 
 
-def inference(img, model, tile, tile_overlap, window_size, scale):
+def inference(
+    img,
+    model,
+    *,
+    tile: int,
+    tile_overlap: int,
+    window_size: int,
+    scale: int,
+    device,
+):
     # test the image tile by tile
     b, c, h, w = img.size()
     tile = min(tile, h, w)
@@ -152,8 +147,8 @@ def inference(img, model, tile, tile_overlap, window_size, scale):
     stride = tile - tile_overlap
     h_idx_list = list(range(0, h - tile, stride)) + [h - tile]
     w_idx_list = list(range(0, w - tile, stride)) + [w - tile]
-    E = torch.zeros(b, c, h * sf, w * sf, dtype=devices.dtype, device=device_swinir).type_as(img)
-    W = torch.zeros_like(E, dtype=devices.dtype, device=device_swinir)
+    E = torch.zeros(b, c, h * sf, w * sf, dtype=devices.dtype, device=device).type_as(img)
+    W = torch.zeros_like(E, dtype=devices.dtype, device=device)
 
     with tqdm(total=len(h_idx_list) * len(w_idx_list), desc="SwinIR tiles") as pbar:
         for h_idx in h_idx_list:
@@ -185,8 +180,7 @@ def on_ui_settings():
 
     shared.opts.add_option("SWIN_tile", shared.OptionInfo(192, "Tile size for all SwinIR.", gr.Slider, {"minimum": 16, "maximum": 512, "step": 16}, section=('upscaling', "Upscaling")))
     shared.opts.add_option("SWIN_tile_overlap", shared.OptionInfo(8, "Tile overlap, in pixels for SwinIR. Low values = visible seam.", gr.Slider, {"minimum": 0, "maximum": 48, "step": 1}, section=('upscaling', "Upscaling")))
-    if int(torch.__version__.split('.')[0]) >= 2 and platform.system() != "Windows":    # torch.compile() require pytorch 2.0 or above, and not on Windows
-        shared.opts.add_option("SWIN_torch_compile", shared.OptionInfo(False, "Use torch.compile to accelerate SwinIR.", gr.Checkbox, {"interactive": True}, section=('upscaling', "Upscaling")).info("Takes longer on first run"))
+    shared.opts.add_option("SWIN_torch_compile", shared.OptionInfo(False, "Use torch.compile to accelerate SwinIR.", gr.Checkbox, {"interactive": True}, section=('upscaling', "Upscaling")).info("Takes longer on first run"))
 
 
 script_callbacks.on_ui_settings(on_ui_settings)
diff --git a/extensions-builtin/SwinIR/swinir_model_arch.py b/extensions-builtin/SwinIR/swinir_model_arch.py
deleted file mode 100644
index 93b932747..000000000
--- a/extensions-builtin/SwinIR/swinir_model_arch.py
+++ /dev/null
@@ -1,867 +0,0 @@
-# -----------------------------------------------------------------------------------
-# SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257
-# Originally Written by Ze Liu, Modified by Jingyun Liang.
-# -----------------------------------------------------------------------------------
-
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
-
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, H, W, C)
-        window_size (int): window size
-
-    Returns:
-        windows: (num_windows*B, window_size, window_size, C)
-    """
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-
-    Returns:
-        x: (B, H, W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-
-class WindowAttention(nn.Module):
-    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
-    It supports both of shifted and non-shifted window.
-
-    Args:
-        dim (int): Number of input channels.
-        window_size (tuple[int]): The height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
-        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
-    """
-
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
-
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim ** -0.5
-
-        # define a parameter table of relative position bias
-        self.relative_position_bias_table = nn.Parameter(
-            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH
-
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-
-        self.proj_drop = nn.Dropout(proj_drop)
-
-        trunc_normal_(self.relative_position_bias_table, std=.02)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        """
-        Args:
-            x: input features with shape of (num_windows*B, N, C)
-            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
-        """
-        B_, N, C = x.shape
-        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        q = q * self.scale
-        attn = (q @ k.transpose(-2, -1))
-
-        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
-
-    def flops(self, N):
-        # calculate flops for 1 window with token length of N
-        flops = 0
-        # qkv = self.qkv(x)
-        flops += N * self.dim * 3 * self.dim
-        # attn = (q @ k.transpose(-2, -1))
-        flops += self.num_heads * N * (self.dim // self.num_heads) * N
-        #  x = (attn @ v)
-        flops += self.num_heads * N * N * (self.dim // self.num_heads)
-        # x = self.proj(x)
-        flops += N * self.dim * self.dim
-        return flops
-
-
-class SwinTransformerBlock(nn.Module):
-    r""" Swin Transformer Block.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        shift_size (int): Shift size for SW-MSA.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            # if window size is larger than input resolution, we don't partition windows
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
-
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
-
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        if self.shift_size > 0:
-            attn_mask = self.calculate_mask(self.input_resolution)
-        else:
-            attn_mask = None
-
-        self.register_buffer("attn_mask", attn_mask)
-
-    def calculate_mask(self, x_size):
-        # calculate attention mask for SW-MSA
-        H, W = x_size
-        img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        return attn_mask
-
-    def forward(self, x, x_size):
-        H, W = x_size
-        B, L, C = x.shape
-        # assert L == H * W, "input feature has wrong size"
-
-        shortcut = x
-        x = self.norm1(x)
-        x = x.view(B, H, W, C)
-
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-        else:
-            shifted_x = x
-
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
-
-        # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size
-        if self.input_resolution == x_size:
-            attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
-        else:
-            attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
-
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-        x = x.view(B, H * W, C)
-
-        # FFN
-        x = shortcut + self.drop_path(x)
-        x = x + self.drop_path(self.mlp(self.norm2(x)))
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
-               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
-
-    def flops(self):
-        flops = 0
-        H, W = self.input_resolution
-        # norm1
-        flops += self.dim * H * W
-        # W-MSA/SW-MSA
-        nW = H * W / self.window_size / self.window_size
-        flops += nW * self.attn.flops(self.window_size * self.window_size)
-        # mlp
-        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
-        # norm2
-        flops += self.dim * H * W
-        return flops
-
-
-class PatchMerging(nn.Module):
-    r""" Patch Merging Layer.
-
-    Args:
-        input_resolution (tuple[int]): Resolution of input feature.
-        dim (int): Number of input channels.
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.input_resolution = input_resolution
-        self.dim = dim
-        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
-        self.norm = norm_layer(4 * dim)
-
-    def forward(self, x):
-        """
-        x: B, H*W, C
-        """
-        H, W = self.input_resolution
-        B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
-
-        x = x.view(B, H, W, C)
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.norm(x)
-        x = self.reduction(x)
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"input_resolution={self.input_resolution}, dim={self.dim}"
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.dim
-        flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
-        return flops
-
-
-class BasicLayer(nn.Module):
-    """ A basic Swin Transformer layer for one stage.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
-
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-        self.use_checkpoint = use_checkpoint
-
-        # build blocks
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
-                                 num_heads=num_heads, window_size=window_size,
-                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
-                                 mlp_ratio=mlp_ratio,
-                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                 drop=drop, attn_drop=attn_drop,
-                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                                 norm_layer=norm_layer)
-            for i in range(depth)])
-
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-
-    def forward(self, x, x_size):
-        for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x, x_size)
-            else:
-                x = blk(x, x_size)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
-
-    def flops(self):
-        flops = 0
-        for blk in self.blocks:
-            flops += blk.flops()
-        if self.downsample is not None:
-            flops += self.downsample.flops()
-        return flops
-
-
-class RSTB(nn.Module):
-    """Residual Swin Transformer Block (RSTB).
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        img_size: Input image size.
-        patch_size: Patch size.
-        resi_connection: The convolutional block before residual connection.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
-                 img_size=224, patch_size=4, resi_connection='1conv'):
-        super(RSTB, self).__init__()
-
-        self.dim = dim
-        self.input_resolution = input_resolution
-
-        self.residual_group = BasicLayer(dim=dim,
-                                         input_resolution=input_resolution,
-                                         depth=depth,
-                                         num_heads=num_heads,
-                                         window_size=window_size,
-                                         mlp_ratio=mlp_ratio,
-                                         qkv_bias=qkv_bias, qk_scale=qk_scale,
-                                         drop=drop, attn_drop=attn_drop,
-                                         drop_path=drop_path,
-                                         norm_layer=norm_layer,
-                                         downsample=downsample,
-                                         use_checkpoint=use_checkpoint)
-
-        if resi_connection == '1conv':
-            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv = nn.Sequential(nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
-                                      nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim, 3, 1, 1))
-
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
-            norm_layer=None)
-
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=0, embed_dim=dim,
-            norm_layer=None)
-
-    def forward(self, x, x_size):
-        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
-
-    def flops(self):
-        flops = 0
-        flops += self.residual_group.flops()
-        H, W = self.input_resolution
-        flops += H * W * self.dim * self.dim * 9
-        flops += self.patch_embed.flops()
-        flops += self.patch_unembed.flops()
-
-        return flops
-
-
-class PatchEmbed(nn.Module):
-    r""" Image to Patch Embedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        x = x.flatten(2).transpose(1, 2)  # B Ph*Pw C
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-
-    def flops(self):
-        flops = 0
-        H, W = self.img_size
-        if self.norm is not None:
-            flops += H * W * self.embed_dim
-        return flops
-
-
-class PatchUnEmbed(nn.Module):
-    r""" Image to Patch Unembedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-    def forward(self, x, x_size):
-        B, HW, C = x.shape
-        x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1])  # B Ph*Pw C
-        return x
-
-    def flops(self):
-        flops = 0
-        return flops
-
-
-class Upsample(nn.Sequential):
-    """Upsample module.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-    """
-
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:  # scale = 2^n
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
-        super(Upsample, self).__init__(*m)
-
-
-class UpsampleOneStep(nn.Sequential):
-    """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
-       Used in lightweight SR to save parameters.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-
-    """
-
-    def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
-        self.num_feat = num_feat
-        self.input_resolution = input_resolution
-        m = []
-        m.append(nn.Conv2d(num_feat, (scale ** 2) * num_out_ch, 3, 1, 1))
-        m.append(nn.PixelShuffle(scale))
-        super(UpsampleOneStep, self).__init__(*m)
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.num_feat * 3 * 9
-        return flops
-
-
-class SwinIR(nn.Module):
-    r""" SwinIR
-        A PyTorch impl of : `SwinIR: Image Restoration Using Swin Transformer`, based on Swin Transformer.
-
-    Args:
-        img_size (int | tuple(int)): Input image size. Default 64
-        patch_size (int | tuple(int)): Patch size. Default: 1
-        in_chans (int): Number of input image channels. Default: 3
-        embed_dim (int): Patch embedding dimension. Default: 96
-        depths (tuple(int)): Depth of each Swin Transformer layer.
-        num_heads (tuple(int)): Number of attention heads in different layers.
-        window_size (int): Window size. Default: 7
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
-        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
-        drop_rate (float): Dropout rate. Default: 0
-        attn_drop_rate (float): Attention dropout rate. Default: 0
-        drop_path_rate (float): Stochastic depth rate. Default: 0.1
-        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
-        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
-        patch_norm (bool): If True, add normalization after patch embedding. Default: True
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
-        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction
-        img_range: Image range. 1. or 255.
-        upsampler: The reconstruction reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
-        resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
-    """
-
-    def __init__(self, img_size=64, patch_size=1, in_chans=3,
-                 embed_dim=96, depths=(6, 6, 6, 6), num_heads=(6, 6, 6, 6),
-                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
-                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
-                 use_checkpoint=False, upscale=2, img_range=1., upsampler='', resi_connection='1conv',
-                 **kwargs):
-        super(SwinIR, self).__init__()
-        num_in_ch = in_chans
-        num_out_ch = in_chans
-        num_feat = 64
-        self.img_range = img_range
-        if in_chans == 3:
-            rgb_mean = (0.4488, 0.4371, 0.4040)
-            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
-        else:
-            self.mean = torch.zeros(1, 1, 1, 1)
-        self.upscale = upscale
-        self.upsampler = upsampler
-        self.window_size = window_size
-
-        #####################################################################################################
-        ################################### 1, shallow feature extraction ###################################
-        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
-
-        #####################################################################################################
-        ################################### 2, deep feature extraction ######################################
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = embed_dim
-        self.mlp_ratio = mlp_ratio
-
-        # split image into non-overlapping patches
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-
-        # merge non-overlapping patches into image
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-
-        # absolute position embedding
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        # stochastic depth
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-
-        # build Residual Swin Transformer blocks (RSTB)
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = RSTB(dim=embed_dim,
-                         input_resolution=(patches_resolution[0],
-                                           patches_resolution[1]),
-                         depth=depths[i_layer],
-                         num_heads=num_heads[i_layer],
-                         window_size=window_size,
-                         mlp_ratio=self.mlp_ratio,
-                         qkv_bias=qkv_bias, qk_scale=qk_scale,
-                         drop=drop_rate, attn_drop=attn_drop_rate,
-                         drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # no impact on SR results
-                         norm_layer=norm_layer,
-                         downsample=None,
-                         use_checkpoint=use_checkpoint,
-                         img_size=img_size,
-                         patch_size=patch_size,
-                         resi_connection=resi_connection
-
-                         )
-            self.layers.append(layer)
-        self.norm = norm_layer(self.num_features)
-
-        # build the last conv layer in deep feature extraction
-        if resi_connection == '1conv':
-            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv_after_body = nn.Sequential(nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
-
-        #####################################################################################################
-        ################################ 3, high quality image reconstruction ################################
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR (to save parameters)
-            self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
-                                            (patches_resolution[0], patches_resolution[1]))
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR (less artifacts)
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            if self.upscale == 4:
-                self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        return {'relative_position_bias_table'}
-
-    def check_image_size(self, x):
-        _, _, h, w = x.size()
-        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
-        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
-        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect')
-        return x
-
-    def forward_features(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers:
-            x = layer(x, x_size)
-
-        x = self.norm(x)  # B L C
-        x = self.patch_unembed(x, x_size)
-
-        return x
-
-    def forward(self, x):
-        H, W = x.shape[2:]
-        x = self.check_image_size(x)
-
-        self.mean = self.mean.type_as(x)
-        x = (x - self.mean) * self.img_range
-
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.conv_last(self.upsample(x))
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.upsample(x)
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            if self.upscale == 4:
-                x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            x = self.conv_last(self.lrelu(self.conv_hr(x)))
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            x_first = self.conv_first(x)
-            res = self.conv_after_body(self.forward_features(x_first)) + x_first
-            x = x + self.conv_last(res)
-
-        x = x / self.img_range + self.mean
-
-        return x[:, :, :H*self.upscale, :W*self.upscale]
-
-    def flops(self):
-        flops = 0
-        H, W = self.patches_resolution
-        flops += H * W * 3 * self.embed_dim * 9
-        flops += self.patch_embed.flops()
-        for layer in self.layers:
-            flops += layer.flops()
-        flops += H * W * 3 * self.embed_dim * self.embed_dim
-        flops += self.upsample.flops()
-        return flops
-
-
-if __name__ == '__main__':
-    upscale = 4
-    window_size = 8
-    height = (1024 // upscale // window_size + 1) * window_size
-    width = (720 // upscale // window_size + 1) * window_size
-    model = SwinIR(upscale=2, img_size=(height, width),
-                   window_size=window_size, img_range=1., depths=[6, 6, 6, 6],
-                   embed_dim=60, num_heads=[6, 6, 6, 6], mlp_ratio=2, upsampler='pixelshuffledirect')
-    print(model)
-    print(height, width, model.flops() / 1e9)
-
-    x = torch.randn((1, 3, height, width))
-    x = model(x)
-    print(x.shape)
diff --git a/extensions-builtin/SwinIR/swinir_model_arch_v2.py b/extensions-builtin/SwinIR/swinir_model_arch_v2.py
deleted file mode 100644
index dad22cca2..000000000
--- a/extensions-builtin/SwinIR/swinir_model_arch_v2.py
+++ /dev/null
@@ -1,1017 +0,0 @@
-# -----------------------------------------------------------------------------------
-# Swin2SR: Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration, https://arxiv.org/abs/
-# Written by Conde and Choi et al.
-# -----------------------------------------------------------------------------------
-
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint as checkpoint
-from timm.models.layers import DropPath, to_2tuple, trunc_normal_
-
-
-class Mlp(nn.Module):
-    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
-        super().__init__()
-        out_features = out_features or in_features
-        hidden_features = hidden_features or in_features
-        self.fc1 = nn.Linear(in_features, hidden_features)
-        self.act = act_layer()
-        self.fc2 = nn.Linear(hidden_features, out_features)
-        self.drop = nn.Dropout(drop)
-
-    def forward(self, x):
-        x = self.fc1(x)
-        x = self.act(x)
-        x = self.drop(x)
-        x = self.fc2(x)
-        x = self.drop(x)
-        return x
-
-
-def window_partition(x, window_size):
-    """
-    Args:
-        x: (B, H, W, C)
-        window_size (int): window size
-    Returns:
-        windows: (num_windows*B, window_size, window_size, C)
-    """
-    B, H, W, C = x.shape
-    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
-    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
-    return windows
-
-
-def window_reverse(windows, window_size, H, W):
-    """
-    Args:
-        windows: (num_windows*B, window_size, window_size, C)
-        window_size (int): Window size
-        H (int): Height of image
-        W (int): Width of image
-    Returns:
-        x: (B, H, W, C)
-    """
-    B = int(windows.shape[0] / (H * W / window_size / window_size))
-    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
-    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
-    return x
-
-class WindowAttention(nn.Module):
-    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
-    It supports both of shifted and non-shifted window.
-    Args:
-        dim (int): Number of input channels.
-        window_size (tuple[int]): The height and width of the window.
-        num_heads (int): Number of attention heads.
-        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
-        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
-        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
-        pretrained_window_size (tuple[int]): The height and width of the window in pre-training.
-    """
-
-    def __init__(self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0.,
-                 pretrained_window_size=(0, 0)):
-
-        super().__init__()
-        self.dim = dim
-        self.window_size = window_size  # Wh, Ww
-        self.pretrained_window_size = pretrained_window_size
-        self.num_heads = num_heads
-
-        self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True)
-
-        # mlp to generate continuous relative position bias
-        self.cpb_mlp = nn.Sequential(nn.Linear(2, 512, bias=True),
-                                     nn.ReLU(inplace=True),
-                                     nn.Linear(512, num_heads, bias=False))
-
-        # get relative_coords_table
-        relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32)
-        relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32)
-        relative_coords_table = torch.stack(
-            torch.meshgrid([relative_coords_h,
-                            relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0)  # 1, 2*Wh-1, 2*Ww-1, 2
-        if pretrained_window_size[0] > 0:
-            relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1)
-            relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1)
-        else:
-            relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1)
-            relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1)
-        relative_coords_table *= 8  # normalize to -8, 8
-        relative_coords_table = torch.sign(relative_coords_table) * torch.log2(
-            torch.abs(relative_coords_table) + 1.0) / np.log2(8)
-
-        self.register_buffer("relative_coords_table", relative_coords_table)
-
-        # get pair-wise relative position index for each token inside the window
-        coords_h = torch.arange(self.window_size[0])
-        coords_w = torch.arange(self.window_size[1])
-        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
-        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
-        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
-        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
-        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
-        relative_coords[:, :, 1] += self.window_size[1] - 1
-        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
-        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
-        self.register_buffer("relative_position_index", relative_position_index)
-
-        self.qkv = nn.Linear(dim, dim * 3, bias=False)
-        if qkv_bias:
-            self.q_bias = nn.Parameter(torch.zeros(dim))
-            self.v_bias = nn.Parameter(torch.zeros(dim))
-        else:
-            self.q_bias = None
-            self.v_bias = None
-        self.attn_drop = nn.Dropout(attn_drop)
-        self.proj = nn.Linear(dim, dim)
-        self.proj_drop = nn.Dropout(proj_drop)
-        self.softmax = nn.Softmax(dim=-1)
-
-    def forward(self, x, mask=None):
-        """
-        Args:
-            x: input features with shape of (num_windows*B, N, C)
-            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
-        """
-        B_, N, C = x.shape
-        qkv_bias = None
-        if self.q_bias is not None:
-            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
-        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
-        qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
-        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
-
-        # cosine attention
-        attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
-        logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01)).to(self.logit_scale.device)).exp()
-        attn = attn * logit_scale
-
-        relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
-        relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
-            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
-        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
-        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
-        attn = attn + relative_position_bias.unsqueeze(0)
-
-        if mask is not None:
-            nW = mask.shape[0]
-            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
-            attn = attn.view(-1, self.num_heads, N, N)
-            attn = self.softmax(attn)
-        else:
-            attn = self.softmax(attn)
-
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
-        x = self.proj(x)
-        x = self.proj_drop(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f'dim={self.dim}, window_size={self.window_size}, ' \
-               f'pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}'
-
-    def flops(self, N):
-        # calculate flops for 1 window with token length of N
-        flops = 0
-        # qkv = self.qkv(x)
-        flops += N * self.dim * 3 * self.dim
-        # attn = (q @ k.transpose(-2, -1))
-        flops += self.num_heads * N * (self.dim // self.num_heads) * N
-        #  x = (attn @ v)
-        flops += self.num_heads * N * N * (self.dim // self.num_heads)
-        # x = self.proj(x)
-        flops += N * self.dim * self.dim
-        return flops
-
-class SwinTransformerBlock(nn.Module):
-    r""" Swin Transformer Block.
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resulotion.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        shift_size (int): Shift size for SW-MSA.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-        pretrained_window_size (int): Window size in pre-training.
-    """
-
-    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
-                 mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.,
-                 act_layer=nn.GELU, norm_layer=nn.LayerNorm, pretrained_window_size=0):
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.num_heads = num_heads
-        self.window_size = window_size
-        self.shift_size = shift_size
-        self.mlp_ratio = mlp_ratio
-        if min(self.input_resolution) <= self.window_size:
-            # if window size is larger than input resolution, we don't partition windows
-            self.shift_size = 0
-            self.window_size = min(self.input_resolution)
-        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
-
-        self.norm1 = norm_layer(dim)
-        self.attn = WindowAttention(
-            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
-            qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
-            pretrained_window_size=to_2tuple(pretrained_window_size))
-
-        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-        self.norm2 = norm_layer(dim)
-        mlp_hidden_dim = int(dim * mlp_ratio)
-        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
-
-        if self.shift_size > 0:
-            attn_mask = self.calculate_mask(self.input_resolution)
-        else:
-            attn_mask = None
-
-        self.register_buffer("attn_mask", attn_mask)
-
-    def calculate_mask(self, x_size):
-        # calculate attention mask for SW-MSA
-        H, W = x_size
-        img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
-        h_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        w_slices = (slice(0, -self.window_size),
-                    slice(-self.window_size, -self.shift_size),
-                    slice(-self.shift_size, None))
-        cnt = 0
-        for h in h_slices:
-            for w in w_slices:
-                img_mask[:, h, w, :] = cnt
-                cnt += 1
-
-        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
-        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
-        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
-        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
-
-        return attn_mask
-
-    def forward(self, x, x_size):
-        H, W = x_size
-        B, L, C = x.shape
-        #assert L == H * W, "input feature has wrong size"
-
-        shortcut = x
-        x = x.view(B, H, W, C)
-
-        # cyclic shift
-        if self.shift_size > 0:
-            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
-        else:
-            shifted_x = x
-
-        # partition windows
-        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
-        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C
-
-        # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size
-        if self.input_resolution == x_size:
-            attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C
-        else:
-            attn_windows = self.attn(x_windows, mask=self.calculate_mask(x_size).to(x.device))
-
-        # merge windows
-        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
-        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C
-
-        # reverse cyclic shift
-        if self.shift_size > 0:
-            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
-        else:
-            x = shifted_x
-        x = x.view(B, H * W, C)
-        x = shortcut + self.drop_path(self.norm1(x))
-
-        # FFN
-        x = x + self.drop_path(self.norm2(self.mlp(x)))
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
-               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
-
-    def flops(self):
-        flops = 0
-        H, W = self.input_resolution
-        # norm1
-        flops += self.dim * H * W
-        # W-MSA/SW-MSA
-        nW = H * W / self.window_size / self.window_size
-        flops += nW * self.attn.flops(self.window_size * self.window_size)
-        # mlp
-        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
-        # norm2
-        flops += self.dim * H * W
-        return flops
-
-class PatchMerging(nn.Module):
-    r""" Patch Merging Layer.
-    Args:
-        input_resolution (tuple[int]): Resolution of input feature.
-        dim (int): Number of input channels.
-        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
-    """
-
-    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
-        super().__init__()
-        self.input_resolution = input_resolution
-        self.dim = dim
-        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
-        self.norm = norm_layer(2 * dim)
-
-    def forward(self, x):
-        """
-        x: B, H*W, C
-        """
-        H, W = self.input_resolution
-        B, L, C = x.shape
-        assert L == H * W, "input feature has wrong size"
-        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."
-
-        x = x.view(B, H, W, C)
-
-        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
-        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
-        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
-        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
-        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
-        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
-
-        x = self.reduction(x)
-        x = self.norm(x)
-
-        return x
-
-    def extra_repr(self) -> str:
-        return f"input_resolution={self.input_resolution}, dim={self.dim}"
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
-        flops += H * W * self.dim // 2
-        return flops
-
-class BasicLayer(nn.Module):
-    """ A basic Swin Transformer layer for one stage.
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        pretrained_window_size (int): Local window size in pre-training.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
-                 pretrained_window_size=0):
-
-        super().__init__()
-        self.dim = dim
-        self.input_resolution = input_resolution
-        self.depth = depth
-        self.use_checkpoint = use_checkpoint
-
-        # build blocks
-        self.blocks = nn.ModuleList([
-            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
-                                 num_heads=num_heads, window_size=window_size,
-                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
-                                 mlp_ratio=mlp_ratio,
-                                 qkv_bias=qkv_bias,
-                                 drop=drop, attn_drop=attn_drop,
-                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
-                                 norm_layer=norm_layer,
-                                 pretrained_window_size=pretrained_window_size)
-            for i in range(depth)])
-
-        # patch merging layer
-        if downsample is not None:
-            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
-        else:
-            self.downsample = None
-
-    def forward(self, x, x_size):
-        for blk in self.blocks:
-            if self.use_checkpoint:
-                x = checkpoint.checkpoint(blk, x, x_size)
-            else:
-                x = blk(x, x_size)
-        if self.downsample is not None:
-            x = self.downsample(x)
-        return x
-
-    def extra_repr(self) -> str:
-        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
-
-    def flops(self):
-        flops = 0
-        for blk in self.blocks:
-            flops += blk.flops()
-        if self.downsample is not None:
-            flops += self.downsample.flops()
-        return flops
-
-    def _init_respostnorm(self):
-        for blk in self.blocks:
-            nn.init.constant_(blk.norm1.bias, 0)
-            nn.init.constant_(blk.norm1.weight, 0)
-            nn.init.constant_(blk.norm2.bias, 0)
-            nn.init.constant_(blk.norm2.weight, 0)
-
-class PatchEmbed(nn.Module):
-    r""" Image to Patch Embedding
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-        if norm_layer is not None:
-            self.norm = norm_layer(embed_dim)
-        else:
-            self.norm = None
-
-    def forward(self, x):
-        B, C, H, W = x.shape
-        # FIXME look at relaxing size constraints
-        # assert H == self.img_size[0] and W == self.img_size[1],
-        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
-        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
-        if self.norm is not None:
-            x = self.norm(x)
-        return x
-
-    def flops(self):
-        Ho, Wo = self.patches_resolution
-        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
-        if self.norm is not None:
-            flops += Ho * Wo * self.embed_dim
-        return flops
-
-class RSTB(nn.Module):
-    """Residual Swin Transformer Block (RSTB).
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
-        drop (float, optional): Dropout rate. Default: 0.0
-        attn_drop (float, optional): Attention dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        img_size: Input image size.
-        patch_size: Patch size.
-        resi_connection: The convolutional block before residual connection.
-    """
-
-    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
-                 mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
-                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
-                 img_size=224, patch_size=4, resi_connection='1conv'):
-        super(RSTB, self).__init__()
-
-        self.dim = dim
-        self.input_resolution = input_resolution
-
-        self.residual_group = BasicLayer(dim=dim,
-                                         input_resolution=input_resolution,
-                                         depth=depth,
-                                         num_heads=num_heads,
-                                         window_size=window_size,
-                                         mlp_ratio=mlp_ratio,
-                                         qkv_bias=qkv_bias,
-                                         drop=drop, attn_drop=attn_drop,
-                                         drop_path=drop_path,
-                                         norm_layer=norm_layer,
-                                         downsample=downsample,
-                                         use_checkpoint=use_checkpoint)
-
-        if resi_connection == '1conv':
-            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv = nn.Sequential(nn.Conv2d(dim, dim // 4, 3, 1, 1), nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
-                                      nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                      nn.Conv2d(dim // 4, dim, 3, 1, 1))
-
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=dim, embed_dim=dim,
-            norm_layer=None)
-
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=dim, embed_dim=dim,
-            norm_layer=None)
-
-    def forward(self, x, x_size):
-        return self.patch_embed(self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))) + x
-
-    def flops(self):
-        flops = 0
-        flops += self.residual_group.flops()
-        H, W = self.input_resolution
-        flops += H * W * self.dim * self.dim * 9
-        flops += self.patch_embed.flops()
-        flops += self.patch_unembed.flops()
-
-        return flops
-
-class PatchUnEmbed(nn.Module):
-    r""" Image to Patch Unembedding
-
-    Args:
-        img_size (int): Image size.  Default: 224.
-        patch_size (int): Patch token size. Default: 4.
-        in_chans (int): Number of input image channels. Default: 3.
-        embed_dim (int): Number of linear projection output channels. Default: 96.
-        norm_layer (nn.Module, optional): Normalization layer. Default: None
-    """
-
-    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
-        self.img_size = img_size
-        self.patch_size = patch_size
-        self.patches_resolution = patches_resolution
-        self.num_patches = patches_resolution[0] * patches_resolution[1]
-
-        self.in_chans = in_chans
-        self.embed_dim = embed_dim
-
-    def forward(self, x, x_size):
-        B, HW, C = x.shape
-        x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1])  # B Ph*Pw C
-        return x
-
-    def flops(self):
-        flops = 0
-        return flops
-
-
-class Upsample(nn.Sequential):
-    """Upsample module.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-    """
-
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:  # scale = 2^n
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
-        super(Upsample, self).__init__(*m)
-
-class Upsample_hf(nn.Sequential):
-    """Upsample module.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-    """
-
-    def __init__(self, scale, num_feat):
-        m = []
-        if (scale & (scale - 1)) == 0:  # scale = 2^n
-            for _ in range(int(math.log(scale, 2))):
-                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
-                m.append(nn.PixelShuffle(2))
-        elif scale == 3:
-            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
-            m.append(nn.PixelShuffle(3))
-        else:
-            raise ValueError(f'scale {scale} is not supported. ' 'Supported scales: 2^n and 3.')
-        super(Upsample_hf, self).__init__(*m)
-
-
-class UpsampleOneStep(nn.Sequential):
-    """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
-       Used in lightweight SR to save parameters.
-
-    Args:
-        scale (int): Scale factor. Supported scales: 2^n and 3.
-        num_feat (int): Channel number of intermediate features.
-
-    """
-
-    def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
-        self.num_feat = num_feat
-        self.input_resolution = input_resolution
-        m = []
-        m.append(nn.Conv2d(num_feat, (scale ** 2) * num_out_ch, 3, 1, 1))
-        m.append(nn.PixelShuffle(scale))
-        super(UpsampleOneStep, self).__init__(*m)
-
-    def flops(self):
-        H, W = self.input_resolution
-        flops = H * W * self.num_feat * 3 * 9
-        return flops
-
-
-
-class Swin2SR(nn.Module):
-    r""" Swin2SR
-        A PyTorch impl of : `Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration`.
-
-    Args:
-        img_size (int | tuple(int)): Input image size. Default 64
-        patch_size (int | tuple(int)): Patch size. Default: 1
-        in_chans (int): Number of input image channels. Default: 3
-        embed_dim (int): Patch embedding dimension. Default: 96
-        depths (tuple(int)): Depth of each Swin Transformer layer.
-        num_heads (tuple(int)): Number of attention heads in different layers.
-        window_size (int): Window size. Default: 7
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
-        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
-        drop_rate (float): Dropout rate. Default: 0
-        attn_drop_rate (float): Attention dropout rate. Default: 0
-        drop_path_rate (float): Stochastic depth rate. Default: 0.1
-        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
-        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
-        patch_norm (bool): If True, add normalization after patch embedding. Default: True
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
-        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction
-        img_range: Image range. 1. or 255.
-        upsampler: The reconstruction reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
-        resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
-    """
-
-    def __init__(self, img_size=64, patch_size=1, in_chans=3,
-                 embed_dim=96, depths=(6, 6, 6, 6), num_heads=(6, 6, 6, 6),
-                 window_size=7, mlp_ratio=4., qkv_bias=True,
-                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
-                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
-                 use_checkpoint=False, upscale=2, img_range=1., upsampler='', resi_connection='1conv',
-                 **kwargs):
-        super(Swin2SR, self).__init__()
-        num_in_ch = in_chans
-        num_out_ch = in_chans
-        num_feat = 64
-        self.img_range = img_range
-        if in_chans == 3:
-            rgb_mean = (0.4488, 0.4371, 0.4040)
-            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
-        else:
-            self.mean = torch.zeros(1, 1, 1, 1)
-        self.upscale = upscale
-        self.upsampler = upsampler
-        self.window_size = window_size
-
-        #####################################################################################################
-        ################################### 1, shallow feature extraction ###################################
-        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)
-
-        #####################################################################################################
-        ################################### 2, deep feature extraction ######################################
-        self.num_layers = len(depths)
-        self.embed_dim = embed_dim
-        self.ape = ape
-        self.patch_norm = patch_norm
-        self.num_features = embed_dim
-        self.mlp_ratio = mlp_ratio
-
-        # split image into non-overlapping patches
-        self.patch_embed = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-        num_patches = self.patch_embed.num_patches
-        patches_resolution = self.patch_embed.patches_resolution
-        self.patches_resolution = patches_resolution
-
-        # merge non-overlapping patches into image
-        self.patch_unembed = PatchUnEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=embed_dim, embed_dim=embed_dim,
-            norm_layer=norm_layer if self.patch_norm else None)
-
-        # absolute position embedding
-        if self.ape:
-            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
-            trunc_normal_(self.absolute_pos_embed, std=.02)
-
-        self.pos_drop = nn.Dropout(p=drop_rate)
-
-        # stochastic depth
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
-
-        # build Residual Swin Transformer blocks (RSTB)
-        self.layers = nn.ModuleList()
-        for i_layer in range(self.num_layers):
-            layer = RSTB(dim=embed_dim,
-                         input_resolution=(patches_resolution[0],
-                                           patches_resolution[1]),
-                         depth=depths[i_layer],
-                         num_heads=num_heads[i_layer],
-                         window_size=window_size,
-                         mlp_ratio=self.mlp_ratio,
-                         qkv_bias=qkv_bias,
-                         drop=drop_rate, attn_drop=attn_drop_rate,
-                         drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # no impact on SR results
-                         norm_layer=norm_layer,
-                         downsample=None,
-                         use_checkpoint=use_checkpoint,
-                         img_size=img_size,
-                         patch_size=patch_size,
-                         resi_connection=resi_connection
-
-                         )
-            self.layers.append(layer)
-
-        if self.upsampler == 'pixelshuffle_hf':
-            self.layers_hf = nn.ModuleList()
-            for i_layer in range(self.num_layers):
-                layer = RSTB(dim=embed_dim,
-                             input_resolution=(patches_resolution[0],
-                                               patches_resolution[1]),
-                             depth=depths[i_layer],
-                             num_heads=num_heads[i_layer],
-                             window_size=window_size,
-                             mlp_ratio=self.mlp_ratio,
-                             qkv_bias=qkv_bias,
-                             drop=drop_rate, attn_drop=attn_drop_rate,
-                             drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],  # no impact on SR results
-                             norm_layer=norm_layer,
-                             downsample=None,
-                             use_checkpoint=use_checkpoint,
-                             img_size=img_size,
-                             patch_size=patch_size,
-                             resi_connection=resi_connection
-
-                             )
-                self.layers_hf.append(layer)
-
-        self.norm = norm_layer(self.num_features)
-
-        # build the last conv layer in deep feature extraction
-        if resi_connection == '1conv':
-            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-        elif resi_connection == '3conv':
-            # to save parameters and memory
-            self.conv_after_body = nn.Sequential(nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0),
-                                                 nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                                                 nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1))
-
-        #####################################################################################################
-        ################################ 3, high quality image reconstruction ################################
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-        elif self.upsampler == 'pixelshuffle_aux':
-            self.conv_bicubic = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
-            self.conv_before_upsample = nn.Sequential(
-                nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                nn.LeakyReLU(inplace=True))
-            self.conv_aux = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.conv_after_aux = nn.Sequential(
-                nn.Conv2d(3, num_feat, 3, 1, 1),
-                nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-
-        elif self.upsampler == 'pixelshuffle_hf':
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.upsample = Upsample(upscale, num_feat)
-            self.upsample_hf = Upsample_hf(upscale, num_feat)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.conv_first_hf = nn.Sequential(nn.Conv2d(num_feat, embed_dim, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.conv_after_body_hf = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
-            self.conv_before_upsample_hf = nn.Sequential(
-                nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                nn.LeakyReLU(inplace=True))
-            self.conv_last_hf = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR (to save parameters)
-            self.upsample = UpsampleOneStep(upscale, embed_dim, num_out_ch,
-                                            (patches_resolution[0], patches_resolution[1]))
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR (less artifacts)
-            assert self.upscale == 4, 'only support x4 now.'
-            self.conv_before_upsample = nn.Sequential(nn.Conv2d(embed_dim, num_feat, 3, 1, 1),
-                                                      nn.LeakyReLU(inplace=True))
-            self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1)
-            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
-            self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1)
-
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
-
-    @torch.jit.ignore
-    def no_weight_decay(self):
-        return {'absolute_pos_embed'}
-
-    @torch.jit.ignore
-    def no_weight_decay_keywords(self):
-        return {'relative_position_bias_table'}
-
-    def check_image_size(self, x):
-        _, _, h, w = x.size()
-        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
-        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
-        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect')
-        return x
-
-    def forward_features(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers:
-            x = layer(x, x_size)
-
-        x = self.norm(x)  # B L C
-        x = self.patch_unembed(x, x_size)
-
-        return x
-
-    def forward_features_hf(self, x):
-        x_size = (x.shape[2], x.shape[3])
-        x = self.patch_embed(x)
-        if self.ape:
-            x = x + self.absolute_pos_embed
-        x = self.pos_drop(x)
-
-        for layer in self.layers_hf:
-            x = layer(x, x_size)
-
-        x = self.norm(x)  # B L C
-        x = self.patch_unembed(x, x_size)
-
-        return x
-
-    def forward(self, x):
-        H, W = x.shape[2:]
-        x = self.check_image_size(x)
-
-        self.mean = self.mean.type_as(x)
-        x = (x - self.mean) * self.img_range
-
-        if self.upsampler == 'pixelshuffle':
-            # for classical SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.conv_last(self.upsample(x))
-        elif self.upsampler == 'pixelshuffle_aux':
-            bicubic = F.interpolate(x, size=(H * self.upscale, W * self.upscale), mode='bicubic', align_corners=False)
-            bicubic = self.conv_bicubic(bicubic)
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            aux = self.conv_aux(x) # b, 3, LR_H, LR_W
-            x = self.conv_after_aux(aux)
-            x = self.upsample(x)[:, :, :H * self.upscale, :W * self.upscale] + bicubic[:, :, :H * self.upscale, :W * self.upscale]
-            x = self.conv_last(x)
-            aux = aux / self.img_range + self.mean
-        elif self.upsampler == 'pixelshuffle_hf':
-            # for classical SR with HF
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x_before = self.conv_before_upsample(x)
-            x_out = self.conv_last(self.upsample(x_before))
-
-            x_hf = self.conv_first_hf(x_before)
-            x_hf = self.conv_after_body_hf(self.forward_features_hf(x_hf)) + x_hf
-            x_hf = self.conv_before_upsample_hf(x_hf)
-            x_hf = self.conv_last_hf(self.upsample_hf(x_hf))
-            x = x_out + x_hf
-            x_hf = x_hf / self.img_range + self.mean
-
-        elif self.upsampler == 'pixelshuffledirect':
-            # for lightweight SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.upsample(x)
-        elif self.upsampler == 'nearest+conv':
-            # for real-world SR
-            x = self.conv_first(x)
-            x = self.conv_after_body(self.forward_features(x)) + x
-            x = self.conv_before_upsample(x)
-            x = self.lrelu(self.conv_up1(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest')))
-            x = self.conv_last(self.lrelu(self.conv_hr(x)))
-        else:
-            # for image denoising and JPEG compression artifact reduction
-            x_first = self.conv_first(x)
-            res = self.conv_after_body(self.forward_features(x_first)) + x_first
-            x = x + self.conv_last(res)
-
-        x = x / self.img_range + self.mean
-        if self.upsampler == "pixelshuffle_aux":
-            return x[:, :, :H*self.upscale, :W*self.upscale], aux
-
-        elif self.upsampler == "pixelshuffle_hf":
-            x_out = x_out / self.img_range + self.mean
-            return x_out[:, :, :H*self.upscale, :W*self.upscale], x[:, :, :H*self.upscale, :W*self.upscale], x_hf[:, :, :H*self.upscale, :W*self.upscale]
-
-        else:
-            return x[:, :, :H*self.upscale, :W*self.upscale]
-
-    def flops(self):
-        flops = 0
-        H, W = self.patches_resolution
-        flops += H * W * 3 * self.embed_dim * 9
-        flops += self.patch_embed.flops()
-        for layer in self.layers:
-            flops += layer.flops()
-        flops += H * W * 3 * self.embed_dim * self.embed_dim
-        flops += self.upsample.flops()
-        return flops
-
-
-if __name__ == '__main__':
-    upscale = 4
-    window_size = 8
-    height = (1024 // upscale // window_size + 1) * window_size
-    width = (720 // upscale // window_size + 1) * window_size
-    model = Swin2SR(upscale=2, img_size=(height, width),
-                   window_size=window_size, img_range=1., depths=[6, 6, 6, 6],
-                   embed_dim=60, num_heads=[6, 6, 6, 6], mlp_ratio=2, upsampler='pixelshuffledirect')
-    print(model)
-    print(height, width, model.flops() / 1e9)
-
-    x = torch.randn((1, 3, height, width))
-    x = model(x)
-    print(x.shape)
diff --git a/modules/codeformer/codeformer_arch.py b/modules/codeformer/codeformer_arch.py
deleted file mode 100644
index 12db68142..000000000
--- a/modules/codeformer/codeformer_arch.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# this file is copied from CodeFormer repository. Please see comment in modules/codeformer_model.py
-
-import math
-import torch
-from torch import nn, Tensor
-import torch.nn.functional as F
-from typing import Optional
-
-from modules.codeformer.vqgan_arch import VQAutoEncoder, ResBlock
-from basicsr.utils.registry import ARCH_REGISTRY
-
-def calc_mean_std(feat, eps=1e-5):
-    """Calculate mean and std for adaptive_instance_normalization.
-
-    Args:
-        feat (Tensor): 4D tensor.
-        eps (float): A small value added to the variance to avoid
-            divide-by-zero. Default: 1e-5.
-    """
-    size = feat.size()
-    assert len(size) == 4, 'The input feature should be 4D tensor.'
-    b, c = size[:2]
-    feat_var = feat.view(b, c, -1).var(dim=2) + eps
-    feat_std = feat_var.sqrt().view(b, c, 1, 1)
-    feat_mean = feat.view(b, c, -1).mean(dim=2).view(b, c, 1, 1)
-    return feat_mean, feat_std
-
-
-def adaptive_instance_normalization(content_feat, style_feat):
-    """Adaptive instance normalization.
-
-    Adjust the reference features to have the similar color and illuminations
-    as those in the degradate features.
-
-    Args:
-        content_feat (Tensor): The reference feature.
-        style_feat (Tensor): The degradate features.
-    """
-    size = content_feat.size()
-    style_mean, style_std = calc_mean_std(style_feat)
-    content_mean, content_std = calc_mean_std(content_feat)
-    normalized_feat = (content_feat - content_mean.expand(size)) / content_std.expand(size)
-    return normalized_feat * style_std.expand(size) + style_mean.expand(size)
-
-
-class PositionEmbeddingSine(nn.Module):
-    """
-    This is a more standard version of the position embedding, very similar to the one
-    used by the Attention is all you need paper, generalized to work on images.
-    """
-
-    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
-        super().__init__()
-        self.num_pos_feats = num_pos_feats
-        self.temperature = temperature
-        self.normalize = normalize
-        if scale is not None and normalize is False:
-            raise ValueError("normalize should be True if scale is passed")
-        if scale is None:
-            scale = 2 * math.pi
-        self.scale = scale
-
-    def forward(self, x, mask=None):
-        if mask is None:
-            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
-        not_mask = ~mask
-        y_embed = not_mask.cumsum(1, dtype=torch.float32)
-        x_embed = not_mask.cumsum(2, dtype=torch.float32)
-        if self.normalize:
-            eps = 1e-6
-            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
-            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
-
-        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
-        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
-
-        pos_x = x_embed[:, :, :, None] / dim_t
-        pos_y = y_embed[:, :, :, None] / dim_t
-        pos_x = torch.stack(
-            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
-        ).flatten(3)
-        pos_y = torch.stack(
-            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
-        ).flatten(3)
-        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
-        return pos
-
-def _get_activation_fn(activation):
-    """Return an activation function given a string"""
-    if activation == "relu":
-        return F.relu
-    if activation == "gelu":
-        return F.gelu
-    if activation == "glu":
-        return F.glu
-    raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
-
-
-class TransformerSALayer(nn.Module):
-    def __init__(self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu"):
-        super().__init__()
-        self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout)
-        # Implementation of Feedforward model - MLP
-        self.linear1 = nn.Linear(embed_dim, dim_mlp)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_mlp, embed_dim)
-
-        self.norm1 = nn.LayerNorm(embed_dim)
-        self.norm2 = nn.LayerNorm(embed_dim)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-
-        self.activation = _get_activation_fn(activation)
-
-    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
-        return tensor if pos is None else tensor + pos
-
-    def forward(self, tgt,
-                tgt_mask: Optional[Tensor] = None,
-                tgt_key_padding_mask: Optional[Tensor] = None,
-                query_pos: Optional[Tensor] = None):
-
-        # self attention
-        tgt2 = self.norm1(tgt)
-        q = k = self.with_pos_embed(tgt2, query_pos)
-        tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask,
-                              key_padding_mask=tgt_key_padding_mask)[0]
-        tgt = tgt + self.dropout1(tgt2)
-
-        # ffn
-        tgt2 = self.norm2(tgt)
-        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
-        tgt = tgt + self.dropout2(tgt2)
-        return tgt
-
-class Fuse_sft_block(nn.Module):
-    def __init__(self, in_ch, out_ch):
-        super().__init__()
-        self.encode_enc = ResBlock(2*in_ch, out_ch)
-
-        self.scale = nn.Sequential(
-                    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
-                    nn.LeakyReLU(0.2, True),
-                    nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
-
-        self.shift = nn.Sequential(
-                    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
-                    nn.LeakyReLU(0.2, True),
-                    nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))
-
-    def forward(self, enc_feat, dec_feat, w=1):
-        enc_feat = self.encode_enc(torch.cat([enc_feat, dec_feat], dim=1))
-        scale = self.scale(enc_feat)
-        shift = self.shift(enc_feat)
-        residual = w * (dec_feat * scale + shift)
-        out = dec_feat + residual
-        return out
-
-
-@ARCH_REGISTRY.register()
-class CodeFormer(VQAutoEncoder):
-    def __init__(self, dim_embd=512, n_head=8, n_layers=9,
-                codebook_size=1024, latent_size=256,
-                connect_list=('32', '64', '128', '256'),
-                fix_modules=('quantize', 'generator')):
-        super(CodeFormer, self).__init__(512, 64, [1, 2, 2, 4, 4, 8], 'nearest',2, [16], codebook_size)
-
-        if fix_modules is not None:
-            for module in fix_modules:
-                for param in getattr(self, module).parameters():
-                    param.requires_grad = False
-
-        self.connect_list = connect_list
-        self.n_layers = n_layers
-        self.dim_embd = dim_embd
-        self.dim_mlp = dim_embd*2
-
-        self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd))
-        self.feat_emb = nn.Linear(256, self.dim_embd)
-
-        # transformer
-        self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
-                                    for _ in range(self.n_layers)])
-
-        # logits_predict head
-        self.idx_pred_layer = nn.Sequential(
-            nn.LayerNorm(dim_embd),
-            nn.Linear(dim_embd, codebook_size, bias=False))
-
-        self.channels = {
-            '16': 512,
-            '32': 256,
-            '64': 256,
-            '128': 128,
-            '256': 128,
-            '512': 64,
-        }
-
-        # after second residual block for > 16, before attn layer for ==16
-        self.fuse_encoder_block = {'512':2, '256':5, '128':8, '64':11, '32':14, '16':18}
-        # after first residual block for > 16, before attn layer for ==16
-        self.fuse_generator_block = {'16':6, '32': 9, '64':12, '128':15, '256':18, '512':21}
-
-        # fuse_convs_dict
-        self.fuse_convs_dict = nn.ModuleDict()
-        for f_size in self.connect_list:
-            in_ch = self.channels[f_size]
-            self.fuse_convs_dict[f_size] = Fuse_sft_block(in_ch, in_ch)
-
-    def _init_weights(self, module):
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            module.weight.data.normal_(mean=0.0, std=0.02)
-            if isinstance(module, nn.Linear) and module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-
-    def forward(self, x, w=0, detach_16=True, code_only=False, adain=False):
-        # ################### Encoder #####################
-        enc_feat_dict = {}
-        out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
-        for i, block in enumerate(self.encoder.blocks):
-            x = block(x)
-            if i in out_list:
-                enc_feat_dict[str(x.shape[-1])] = x.clone()
-
-        lq_feat = x
-        # ################# Transformer ###################
-        # quant_feat, codebook_loss, quant_stats = self.quantize(lq_feat)
-        pos_emb = self.position_emb.unsqueeze(1).repeat(1,x.shape[0],1)
-        # BCHW -> BC(HW) -> (HW)BC
-        feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2,0,1))
-        query_emb = feat_emb
-        # Transformer encoder
-        for layer in self.ft_layers:
-            query_emb = layer(query_emb, query_pos=pos_emb)
-
-        # output logits
-        logits = self.idx_pred_layer(query_emb) # (hw)bn
-        logits = logits.permute(1,0,2) # (hw)bn -> b(hw)n
-
-        if code_only: # for training stage II
-          # logits doesn't need softmax before cross_entropy loss
-            return logits, lq_feat
-
-        # ################# Quantization ###################
-        # if self.training:
-        #     quant_feat = torch.einsum('btn,nc->btc', [soft_one_hot, self.quantize.embedding.weight])
-        #     # b(hw)c -> bc(hw) -> bchw
-        #     quant_feat = quant_feat.permute(0,2,1).view(lq_feat.shape)
-        # ------------
-        soft_one_hot = F.softmax(logits, dim=2)
-        _, top_idx = torch.topk(soft_one_hot, 1, dim=2)
-        quant_feat = self.quantize.get_codebook_feat(top_idx, shape=[x.shape[0],16,16,256])
-        # preserve gradients
-        # quant_feat = lq_feat + (quant_feat - lq_feat).detach()
-
-        if detach_16:
-            quant_feat = quant_feat.detach() # for training stage III
-        if adain:
-            quant_feat = adaptive_instance_normalization(quant_feat, lq_feat)
-
-        # ################## Generator ####################
-        x = quant_feat
-        fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
-
-        for i, block in enumerate(self.generator.blocks):
-            x = block(x)
-            if i in fuse_list: # fuse after i-th block
-                f_size = str(x.shape[-1])
-                if w>0:
-                    x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
-        out = x
-        # logits doesn't need softmax before cross_entropy loss
-        return out, logits, lq_feat
diff --git a/modules/codeformer/vqgan_arch.py b/modules/codeformer/vqgan_arch.py
deleted file mode 100644
index 09ee6660d..000000000
--- a/modules/codeformer/vqgan_arch.py
+++ /dev/null
@@ -1,435 +0,0 @@
-# this file is copied from CodeFormer repository. Please see comment in modules/codeformer_model.py
-
-'''
-VQGAN code, adapted from the original created by the Unleashing Transformers authors:
-https://github.com/samb-t/unleashing-transformers/blob/master/models/vqgan.py
-
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from basicsr.utils import get_root_logger
-from basicsr.utils.registry import ARCH_REGISTRY
-
-def normalize(in_channels):
-    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
-
-
-@torch.jit.script
-def swish(x):
-    return x*torch.sigmoid(x)
-
-
-#  Define VQVAE classes
-class VectorQuantizer(nn.Module):
-    def __init__(self, codebook_size, emb_dim, beta):
-        super(VectorQuantizer, self).__init__()
-        self.codebook_size = codebook_size  # number of embeddings
-        self.emb_dim = emb_dim  # dimension of embedding
-        self.beta = beta  # commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
-        self.embedding = nn.Embedding(self.codebook_size, self.emb_dim)
-        self.embedding.weight.data.uniform_(-1.0 / self.codebook_size, 1.0 / self.codebook_size)
-
-    def forward(self, z):
-        # reshape z -> (batch, height, width, channel) and flatten
-        z = z.permute(0, 2, 3, 1).contiguous()
-        z_flattened = z.view(-1, self.emb_dim)
-
-        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
-        d = (z_flattened ** 2).sum(dim=1, keepdim=True) + (self.embedding.weight**2).sum(1) - \
-            2 * torch.matmul(z_flattened, self.embedding.weight.t())
-
-        mean_distance = torch.mean(d)
-        # find closest encodings
-        # min_encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
-        min_encoding_scores, min_encoding_indices = torch.topk(d, 1, dim=1, largest=False)
-        # [0-1], higher score, higher confidence
-        min_encoding_scores = torch.exp(-min_encoding_scores/10)
-
-        min_encodings = torch.zeros(min_encoding_indices.shape[0], self.codebook_size).to(z)
-        min_encodings.scatter_(1, min_encoding_indices, 1)
-
-        # get quantized latent vectors
-        z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
-        # compute loss for embedding
-        loss = torch.mean((z_q.detach()-z)**2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
-        # preserve gradients
-        z_q = z + (z_q - z).detach()
-
-        # perplexity
-        e_mean = torch.mean(min_encodings, dim=0)
-        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
-        # reshape back to match original input shape
-        z_q = z_q.permute(0, 3, 1, 2).contiguous()
-
-        return z_q, loss, {
-            "perplexity": perplexity,
-            "min_encodings": min_encodings,
-            "min_encoding_indices": min_encoding_indices,
-            "min_encoding_scores": min_encoding_scores,
-            "mean_distance": mean_distance
-            }
-
-    def get_codebook_feat(self, indices, shape):
-        # input indices: batch*token_num -> (batch*token_num)*1
-        # shape: batch, height, width, channel
-        indices = indices.view(-1,1)
-        min_encodings = torch.zeros(indices.shape[0], self.codebook_size).to(indices)
-        min_encodings.scatter_(1, indices, 1)
-        # get quantized latent vectors
-        z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
-
-        if shape is not None:  # reshape back to match original input shape
-            z_q = z_q.view(shape).permute(0, 3, 1, 2).contiguous()
-
-        return z_q
-
-
-class GumbelQuantizer(nn.Module):
-    def __init__(self, codebook_size, emb_dim, num_hiddens, straight_through=False, kl_weight=5e-4, temp_init=1.0):
-        super().__init__()
-        self.codebook_size = codebook_size  # number of embeddings
-        self.emb_dim = emb_dim  # dimension of embedding
-        self.straight_through = straight_through
-        self.temperature = temp_init
-        self.kl_weight = kl_weight
-        self.proj = nn.Conv2d(num_hiddens, codebook_size, 1)  # projects last encoder layer to quantized logits
-        self.embed = nn.Embedding(codebook_size, emb_dim)
-
-    def forward(self, z):
-        hard = self.straight_through if self.training else True
-
-        logits = self.proj(z)
-
-        soft_one_hot = F.gumbel_softmax(logits, tau=self.temperature, dim=1, hard=hard)
-
-        z_q = torch.einsum("b n h w, n d -> b d h w", soft_one_hot, self.embed.weight)
-
-        # + kl divergence to the prior loss
-        qy = F.softmax(logits, dim=1)
-        diff = self.kl_weight * torch.sum(qy * torch.log(qy * self.codebook_size + 1e-10), dim=1).mean()
-        min_encoding_indices = soft_one_hot.argmax(dim=1)
-
-        return z_q, diff, {
-            "min_encoding_indices": min_encoding_indices
-        }
-
-
-class Downsample(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
-
-    def forward(self, x):
-        pad = (0, 1, 0, 1)
-        x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
-        x = self.conv(x)
-        return x
-
-
-class Upsample(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
-
-    def forward(self, x):
-        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
-        x = self.conv(x)
-
-        return x
-
-
-class ResBlock(nn.Module):
-    def __init__(self, in_channels, out_channels=None):
-        super(ResBlock, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = in_channels if out_channels is None else out_channels
-        self.norm1 = normalize(in_channels)
-        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
-        self.norm2 = normalize(out_channels)
-        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
-        if self.in_channels != self.out_channels:
-            self.conv_out = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
-
-    def forward(self, x_in):
-        x = x_in
-        x = self.norm1(x)
-        x = swish(x)
-        x = self.conv1(x)
-        x = self.norm2(x)
-        x = swish(x)
-        x = self.conv2(x)
-        if self.in_channels != self.out_channels:
-            x_in = self.conv_out(x_in)
-
-        return x + x_in
-
-
-class AttnBlock(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.in_channels = in_channels
-
-        self.norm = normalize(in_channels)
-        self.q = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-        self.k = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-        self.v = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-        self.proj_out = torch.nn.Conv2d(
-            in_channels,
-            in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0
-        )
-
-    def forward(self, x):
-        h_ = x
-        h_ = self.norm(h_)
-        q = self.q(h_)
-        k = self.k(h_)
-        v = self.v(h_)
-
-        # compute attention
-        b, c, h, w = q.shape
-        q = q.reshape(b, c, h*w)
-        q = q.permute(0, 2, 1)
-        k = k.reshape(b, c, h*w)
-        w_ = torch.bmm(q, k)
-        w_ = w_ * (int(c)**(-0.5))
-        w_ = F.softmax(w_, dim=2)
-
-        # attend to values
-        v = v.reshape(b, c, h*w)
-        w_ = w_.permute(0, 2, 1)
-        h_ = torch.bmm(v, w_)
-        h_ = h_.reshape(b, c, h, w)
-
-        h_ = self.proj_out(h_)
-
-        return x+h_
-
-
-class Encoder(nn.Module):
-    def __init__(self, in_channels, nf, emb_dim, ch_mult, num_res_blocks, resolution, attn_resolutions):
-        super().__init__()
-        self.nf = nf
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.attn_resolutions = attn_resolutions
-
-        curr_res = self.resolution
-        in_ch_mult = (1,)+tuple(ch_mult)
-
-        blocks = []
-        # initial convultion
-        blocks.append(nn.Conv2d(in_channels, nf, kernel_size=3, stride=1, padding=1))
-
-        # residual and downsampling blocks, with attention on smaller res (16x16)
-        for i in range(self.num_resolutions):
-            block_in_ch = nf * in_ch_mult[i]
-            block_out_ch = nf * ch_mult[i]
-            for _ in range(self.num_res_blocks):
-                blocks.append(ResBlock(block_in_ch, block_out_ch))
-                block_in_ch = block_out_ch
-                if curr_res in attn_resolutions:
-                    blocks.append(AttnBlock(block_in_ch))
-
-            if i != self.num_resolutions - 1:
-                blocks.append(Downsample(block_in_ch))
-                curr_res = curr_res // 2
-
-        # non-local attention block
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-        blocks.append(AttnBlock(block_in_ch))
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-
-        # normalise and convert to latent size
-        blocks.append(normalize(block_in_ch))
-        blocks.append(nn.Conv2d(block_in_ch, emb_dim, kernel_size=3, stride=1, padding=1))
-        self.blocks = nn.ModuleList(blocks)
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-
-        return x
-
-
-class Generator(nn.Module):
-    def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
-        super().__init__()
-        self.nf = nf
-        self.ch_mult = ch_mult
-        self.num_resolutions = len(self.ch_mult)
-        self.num_res_blocks = res_blocks
-        self.resolution = img_size
-        self.attn_resolutions = attn_resolutions
-        self.in_channels = emb_dim
-        self.out_channels = 3
-        block_in_ch = self.nf * self.ch_mult[-1]
-        curr_res = self.resolution // 2 ** (self.num_resolutions-1)
-
-        blocks = []
-        # initial conv
-        blocks.append(nn.Conv2d(self.in_channels, block_in_ch, kernel_size=3, stride=1, padding=1))
-
-        # non-local attention block
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-        blocks.append(AttnBlock(block_in_ch))
-        blocks.append(ResBlock(block_in_ch, block_in_ch))
-
-        for i in reversed(range(self.num_resolutions)):
-            block_out_ch = self.nf * self.ch_mult[i]
-
-            for _ in range(self.num_res_blocks):
-                blocks.append(ResBlock(block_in_ch, block_out_ch))
-                block_in_ch = block_out_ch
-
-                if curr_res in self.attn_resolutions:
-                    blocks.append(AttnBlock(block_in_ch))
-
-            if i != 0:
-                blocks.append(Upsample(block_in_ch))
-                curr_res = curr_res * 2
-
-        blocks.append(normalize(block_in_ch))
-        blocks.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))
-
-        self.blocks = nn.ModuleList(blocks)
-
-
-    def forward(self, x):
-        for block in self.blocks:
-            x = block(x)
-
-        return x
-
-
-@ARCH_REGISTRY.register()
-class VQAutoEncoder(nn.Module):
-    def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=None, codebook_size=1024, emb_dim=256,
-                beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
-        super().__init__()
-        logger = get_root_logger()
-        self.in_channels = 3
-        self.nf = nf
-        self.n_blocks = res_blocks
-        self.codebook_size = codebook_size
-        self.embed_dim = emb_dim
-        self.ch_mult = ch_mult
-        self.resolution = img_size
-        self.attn_resolutions = attn_resolutions or [16]
-        self.quantizer_type = quantizer
-        self.encoder = Encoder(
-            self.in_channels,
-            self.nf,
-            self.embed_dim,
-            self.ch_mult,
-            self.n_blocks,
-            self.resolution,
-            self.attn_resolutions
-        )
-        if self.quantizer_type == "nearest":
-            self.beta = beta #0.25
-            self.quantize = VectorQuantizer(self.codebook_size, self.embed_dim, self.beta)
-        elif self.quantizer_type == "gumbel":
-            self.gumbel_num_hiddens = emb_dim
-            self.straight_through = gumbel_straight_through
-            self.kl_weight = gumbel_kl_weight
-            self.quantize = GumbelQuantizer(
-                self.codebook_size,
-                self.embed_dim,
-                self.gumbel_num_hiddens,
-                self.straight_through,
-                self.kl_weight
-            )
-        self.generator = Generator(
-            self.nf,
-            self.embed_dim,
-            self.ch_mult,
-            self.n_blocks,
-            self.resolution,
-            self.attn_resolutions
-        )
-
-        if model_path is not None:
-            chkpt = torch.load(model_path, map_location='cpu')
-            if 'params_ema' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params_ema'])
-                logger.info(f'vqgan is loaded from: {model_path} [params_ema]')
-            elif 'params' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
-                logger.info(f'vqgan is loaded from: {model_path} [params]')
-            else:
-                raise ValueError('Wrong params!')
-
-
-    def forward(self, x):
-        x = self.encoder(x)
-        quant, codebook_loss, quant_stats = self.quantize(x)
-        x = self.generator(quant)
-        return x, codebook_loss, quant_stats
-
-
-
-# patch based discriminator
-@ARCH_REGISTRY.register()
-class VQGANDiscriminator(nn.Module):
-    def __init__(self, nc=3, ndf=64, n_layers=4, model_path=None):
-        super().__init__()
-
-        layers = [nn.Conv2d(nc, ndf, kernel_size=4, stride=2, padding=1), nn.LeakyReLU(0.2, True)]
-        ndf_mult = 1
-        ndf_mult_prev = 1
-        for n in range(1, n_layers):  # gradually increase the number of filters
-            ndf_mult_prev = ndf_mult
-            ndf_mult = min(2 ** n, 8)
-            layers += [
-                nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=2, padding=1, bias=False),
-                nn.BatchNorm2d(ndf * ndf_mult),
-                nn.LeakyReLU(0.2, True)
-            ]
-
-        ndf_mult_prev = ndf_mult
-        ndf_mult = min(2 ** n_layers, 8)
-
-        layers += [
-            nn.Conv2d(ndf * ndf_mult_prev, ndf * ndf_mult, kernel_size=4, stride=1, padding=1, bias=False),
-            nn.BatchNorm2d(ndf * ndf_mult),
-            nn.LeakyReLU(0.2, True)
-        ]
-
-        layers += [
-            nn.Conv2d(ndf * ndf_mult, 1, kernel_size=4, stride=1, padding=1)]  # output 1 channel prediction map
-        self.main = nn.Sequential(*layers)
-
-        if model_path is not None:
-            chkpt = torch.load(model_path, map_location='cpu')
-            if 'params_d' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params_d'])
-            elif 'params' in chkpt:
-                self.load_state_dict(torch.load(model_path, map_location='cpu')['params'])
-            else:
-                raise ValueError('Wrong params!')
-
-    def forward(self, x):
-        return self.main(x)
diff --git a/modules/codeformer_model.py b/modules/codeformer_model.py
index da42b5e99..517eadfd8 100644
--- a/modules/codeformer_model.py
+++ b/modules/codeformer_model.py
@@ -8,9 +8,6 @@ import modules.shared
 from modules import shared, devices, modelloader, errors
 from modules.paths import models_path
 
-# codeformer people made a choice to include modified basicsr library to their project which makes
-# it utterly impossible to use it alongside with other libraries that also use basicsr, like GFPGAN.
-# I am making a choice to include some files from codeformer to work around this issue.
 model_dir = "Codeformer"
 model_path = os.path.join(models_path, model_dir)
 model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
@@ -18,115 +15,127 @@ model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codef
 codeformer = None
 
 
+class FaceRestorerCodeFormer(modules.face_restoration.FaceRestoration):
+    def name(self):
+        return "CodeFormer"
+
+    def __init__(self, dirname):
+        self.net = None
+        self.face_helper = None
+        self.cmd_dir = dirname
+
+    def create_models(self):
+        from facexlib.detection import retinaface
+        from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+
+        if self.net is not None and self.face_helper is not None:
+            self.net.to(devices.device_codeformer)
+            return self.net, self.face_helper
+        model_paths = modelloader.load_models(
+            model_path,
+            model_url,
+            self.cmd_dir,
+            download_name='codeformer-v0.1.0.pth',
+            ext_filter=['.pth'],
+        )
+
+        if len(model_paths) != 0:
+            ckpt_path = model_paths[0]
+        else:
+            print("Unable to load codeformer model.")
+            return None, None
+        net = modelloader.load_spandrel_model(ckpt_path, device=devices.device_codeformer)
+
+        if hasattr(retinaface, 'device'):
+            retinaface.device = devices.device_codeformer
+
+        face_helper = FaceRestoreHelper(
+            upscale_factor=1,
+            face_size=512,
+            crop_ratio=(1, 1),
+            det_model='retinaface_resnet50',
+            save_ext='png',
+            use_parse=True,
+            device=devices.device_codeformer,
+        )
+
+        self.net = net
+        self.face_helper = face_helper
+
+    def send_model_to(self, device):
+        self.net.to(device)
+        self.face_helper.face_det.to(device)
+        self.face_helper.face_parse.to(device)
+
+    def restore(self, np_image, w=None):
+        from torchvision.transforms.functional import normalize
+        from basicsr.utils import img2tensor, tensor2img
+        np_image = np_image[:, :, ::-1]
+
+        original_resolution = np_image.shape[0:2]
+
+        self.create_models()
+        if self.net is None or self.face_helper is None:
+            return np_image
+
+        self.send_model_to(devices.device_codeformer)
+
+        self.face_helper.clean_all()
+        self.face_helper.read_image(np_image)
+        self.face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
+        self.face_helper.align_warp_face()
+
+        for cropped_face in self.face_helper.cropped_faces:
+            cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
+            normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
+            cropped_face_t = cropped_face_t.unsqueeze(0).to(devices.device_codeformer)
+
+            try:
+                with torch.no_grad():
+                    res = self.net(cropped_face_t, w=w if w is not None else shared.opts.code_former_weight, adain=True)
+                    if isinstance(res, tuple):
+                        output = res[0]
+                    else:
+                        output = res
+                    if not isinstance(res, torch.Tensor):
+                        raise TypeError(f"Expected torch.Tensor, got {type(res)}")
+                    restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1))
+                del output
+                devices.torch_gc()
+            except Exception:
+                errors.report('Failed inference for CodeFormer', exc_info=True)
+                restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))
+
+            restored_face = restored_face.astype('uint8')
+            self.face_helper.add_restored_face(restored_face)
+
+        self.face_helper.get_inverse_affine(None)
+
+        restored_img = self.face_helper.paste_faces_to_input_image()
+        restored_img = restored_img[:, :, ::-1]
+
+        if original_resolution != restored_img.shape[0:2]:
+            restored_img = cv2.resize(
+                restored_img,
+                (0, 0),
+                fx=original_resolution[1]/restored_img.shape[1],
+                fy=original_resolution[0]/restored_img.shape[0],
+                interpolation=cv2.INTER_LINEAR,
+            )
+
+        self.face_helper.clean_all()
+
+        if shared.opts.face_restoration_unload:
+            self.send_model_to(devices.cpu)
+
+        return restored_img
+
+
 def setup_model(dirname):
     os.makedirs(model_path, exist_ok=True)
-
-    path = modules.paths.paths.get("CodeFormer", None)
-    if path is None:
-        return
-
     try:
-        from torchvision.transforms.functional import normalize
-        from modules.codeformer.codeformer_arch import CodeFormer
-        from basicsr.utils import img2tensor, tensor2img
-        from facelib.utils.face_restoration_helper import FaceRestoreHelper
-        from facelib.detection.retinaface import retinaface
-
-        net_class = CodeFormer
-
-        class FaceRestorerCodeFormer(modules.face_restoration.FaceRestoration):
-            def name(self):
-                return "CodeFormer"
-
-            def __init__(self, dirname):
-                self.net = None
-                self.face_helper = None
-                self.cmd_dir = dirname
-
-            def create_models(self):
-
-                if self.net is not None and self.face_helper is not None:
-                    self.net.to(devices.device_codeformer)
-                    return self.net, self.face_helper
-                model_paths = modelloader.load_models(model_path, model_url, self.cmd_dir, download_name='codeformer-v0.1.0.pth', ext_filter=['.pth'])
-                if len(model_paths) != 0:
-                    ckpt_path = model_paths[0]
-                else:
-                    print("Unable to load codeformer model.")
-                    return None, None
-                net = net_class(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9, connect_list=['32', '64', '128', '256']).to(devices.device_codeformer)
-                checkpoint = torch.load(ckpt_path)['params_ema']
-                net.load_state_dict(checkpoint)
-                net.eval()
-
-                if hasattr(retinaface, 'device'):
-                    retinaface.device = devices.device_codeformer
-                face_helper = FaceRestoreHelper(1, face_size=512, crop_ratio=(1, 1), det_model='retinaface_resnet50', save_ext='png', use_parse=True, device=devices.device_codeformer)
-
-                self.net = net
-                self.face_helper = face_helper
-
-                return net, face_helper
-
-            def send_model_to(self, device):
-                self.net.to(device)
-                self.face_helper.face_det.to(device)
-                self.face_helper.face_parse.to(device)
-
-            def restore(self, np_image, w=None):
-                np_image = np_image[:, :, ::-1]
-
-                original_resolution = np_image.shape[0:2]
-
-                self.create_models()
-                if self.net is None or self.face_helper is None:
-                    return np_image
-
-                self.send_model_to(devices.device_codeformer)
-
-                self.face_helper.clean_all()
-                self.face_helper.read_image(np_image)
-                self.face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
-                self.face_helper.align_warp_face()
-
-                for cropped_face in self.face_helper.cropped_faces:
-                    cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
-                    normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
-                    cropped_face_t = cropped_face_t.unsqueeze(0).to(devices.device_codeformer)
-
-                    try:
-                        with torch.no_grad():
-                            output = self.net(cropped_face_t, w=w if w is not None else shared.opts.code_former_weight, adain=True)[0]
-                            restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1))
-                        del output
-                        devices.torch_gc()
-                    except Exception:
-                        errors.report('Failed inference for CodeFormer', exc_info=True)
-                        restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))
-
-                    restored_face = restored_face.astype('uint8')
-                    self.face_helper.add_restored_face(restored_face)
-
-                self.face_helper.get_inverse_affine(None)
-
-                restored_img = self.face_helper.paste_faces_to_input_image()
-                restored_img = restored_img[:, :, ::-1]
-
-                if original_resolution != restored_img.shape[0:2]:
-                    restored_img = cv2.resize(restored_img, (0, 0), fx=original_resolution[1]/restored_img.shape[1], fy=original_resolution[0]/restored_img.shape[0], interpolation=cv2.INTER_LINEAR)
-
-                self.face_helper.clean_all()
-
-                if shared.opts.face_restoration_unload:
-                    self.send_model_to(devices.cpu)
-
-                return restored_img
-
         global codeformer
         codeformer = FaceRestorerCodeFormer(dirname)
         shared.face_restorers.append(codeformer)
-
     except Exception:
         errors.report("Error setting up CodeFormer", exc_info=True)
-
-   # sys.path = stored_sys_path
diff --git a/modules/esrgan_model.py b/modules/esrgan_model.py
index c0d22a992..a7c7c9e30 100644
--- a/modules/esrgan_model.py
+++ b/modules/esrgan_model.py
@@ -1,122 +1,9 @@
-import sys
-
-import torch
-
-import modules.esrgan_model_arch as arch
-from modules import modelloader, devices
+from modules import modelloader, devices, errors
 from modules.shared import opts
 from modules.upscaler import Upscaler, UpscalerData
 from modules.upscaler_utils import upscale_with_model
 
 
-def mod2normal(state_dict):
-    # this code is copied from https://github.com/victorca25/iNNfer
-    if 'conv_first.weight' in state_dict:
-        crt_net = {}
-        items = list(state_dict)
-
-        crt_net['model.0.weight'] = state_dict['conv_first.weight']
-        crt_net['model.0.bias'] = state_dict['conv_first.bias']
-
-        for k in items.copy():
-            if 'RDB' in k:
-                ori_k = k.replace('RRDB_trunk.', 'model.1.sub.')
-                if '.weight' in k:
-                    ori_k = ori_k.replace('.weight', '.0.weight')
-                elif '.bias' in k:
-                    ori_k = ori_k.replace('.bias', '.0.bias')
-                crt_net[ori_k] = state_dict[k]
-                items.remove(k)
-
-        crt_net['model.1.sub.23.weight'] = state_dict['trunk_conv.weight']
-        crt_net['model.1.sub.23.bias'] = state_dict['trunk_conv.bias']
-        crt_net['model.3.weight'] = state_dict['upconv1.weight']
-        crt_net['model.3.bias'] = state_dict['upconv1.bias']
-        crt_net['model.6.weight'] = state_dict['upconv2.weight']
-        crt_net['model.6.bias'] = state_dict['upconv2.bias']
-        crt_net['model.8.weight'] = state_dict['HRconv.weight']
-        crt_net['model.8.bias'] = state_dict['HRconv.bias']
-        crt_net['model.10.weight'] = state_dict['conv_last.weight']
-        crt_net['model.10.bias'] = state_dict['conv_last.bias']
-        state_dict = crt_net
-    return state_dict
-
-
-def resrgan2normal(state_dict, nb=23):
-    # this code is copied from https://github.com/victorca25/iNNfer
-    if "conv_first.weight" in state_dict and "body.0.rdb1.conv1.weight" in state_dict:
-        re8x = 0
-        crt_net = {}
-        items = list(state_dict)
-
-        crt_net['model.0.weight'] = state_dict['conv_first.weight']
-        crt_net['model.0.bias'] = state_dict['conv_first.bias']
-
-        for k in items.copy():
-            if "rdb" in k:
-                ori_k = k.replace('body.', 'model.1.sub.')
-                ori_k = ori_k.replace('.rdb', '.RDB')
-                if '.weight' in k:
-                    ori_k = ori_k.replace('.weight', '.0.weight')
-                elif '.bias' in k:
-                    ori_k = ori_k.replace('.bias', '.0.bias')
-                crt_net[ori_k] = state_dict[k]
-                items.remove(k)
-
-        crt_net[f'model.1.sub.{nb}.weight'] = state_dict['conv_body.weight']
-        crt_net[f'model.1.sub.{nb}.bias'] = state_dict['conv_body.bias']
-        crt_net['model.3.weight'] = state_dict['conv_up1.weight']
-        crt_net['model.3.bias'] = state_dict['conv_up1.bias']
-        crt_net['model.6.weight'] = state_dict['conv_up2.weight']
-        crt_net['model.6.bias'] = state_dict['conv_up2.bias']
-
-        if 'conv_up3.weight' in state_dict:
-            # modification supporting: https://github.com/ai-forever/Real-ESRGAN/blob/main/RealESRGAN/rrdbnet_arch.py
-            re8x = 3
-            crt_net['model.9.weight'] = state_dict['conv_up3.weight']
-            crt_net['model.9.bias'] = state_dict['conv_up3.bias']
-
-        crt_net[f'model.{8+re8x}.weight'] = state_dict['conv_hr.weight']
-        crt_net[f'model.{8+re8x}.bias'] = state_dict['conv_hr.bias']
-        crt_net[f'model.{10+re8x}.weight'] = state_dict['conv_last.weight']
-        crt_net[f'model.{10+re8x}.bias'] = state_dict['conv_last.bias']
-
-        state_dict = crt_net
-    return state_dict
-
-
-def infer_params(state_dict):
-    # this code is copied from https://github.com/victorca25/iNNfer
-    scale2x = 0
-    scalemin = 6
-    n_uplayer = 0
-    plus = False
-
-    for block in list(state_dict):
-        parts = block.split(".")
-        n_parts = len(parts)
-        if n_parts == 5 and parts[2] == "sub":
-            nb = int(parts[3])
-        elif n_parts == 3:
-            part_num = int(parts[1])
-            if (part_num > scalemin
-                and parts[0] == "model"
-                and parts[2] == "weight"):
-                scale2x += 1
-            if part_num > n_uplayer:
-                n_uplayer = part_num
-                out_nc = state_dict[block].shape[0]
-        if not plus and "conv1x1" in block:
-            plus = True
-
-    nf = state_dict["model.0.weight"].shape[0]
-    in_nc = state_dict["model.0.weight"].shape[1]
-    out_nc = out_nc
-    scale = 2 ** scale2x
-
-    return in_nc, out_nc, nf, nb, plus, scale
-
-
 class UpscalerESRGAN(Upscaler):
     def __init__(self, dirname):
         self.name = "ESRGAN"
@@ -142,12 +29,11 @@ class UpscalerESRGAN(Upscaler):
     def do_upscale(self, img, selected_model):
         try:
             model = self.load_model(selected_model)
-        except Exception as e:
-            print(f"Unable to load ESRGAN model {selected_model}: {e}", file=sys.stderr)
+        except Exception:
+            errors.report(f"Unable to load ESRGAN model {selected_model}", exc_info=True)
             return img
         model.to(devices.device_esrgan)
-        img = esrgan_upscale(model, img)
-        return img
+        return esrgan_upscale(model, img)
 
     def load_model(self, path: str):
         if path.startswith("http"):
@@ -160,33 +46,10 @@ class UpscalerESRGAN(Upscaler):
         else:
             filename = path
 
-        state_dict = torch.load(filename, map_location='cpu' if devices.device_esrgan.type == 'mps' else None)
-
-        if "params_ema" in state_dict:
-            state_dict = state_dict["params_ema"]
-        elif "params" in state_dict:
-            state_dict = state_dict["params"]
-            num_conv = 16 if "realesr-animevideov3" in filename else 32
-            model = arch.SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=num_conv, upscale=4, act_type='prelu')
-            model.load_state_dict(state_dict)
-            model.eval()
-            return model
-
-        if "body.0.rdb1.conv1.weight" in state_dict and "conv_first.weight" in state_dict:
-            nb = 6 if "RealESRGAN_x4plus_anime_6B" in filename else 23
-            state_dict = resrgan2normal(state_dict, nb)
-        elif "conv_first.weight" in state_dict:
-            state_dict = mod2normal(state_dict)
-        elif "model.0.weight" not in state_dict:
-            raise Exception("The file is not a recognized ESRGAN model.")
-
-        in_nc, out_nc, nf, nb, plus, mscale = infer_params(state_dict)
-
-        model = arch.RRDBNet(in_nc=in_nc, out_nc=out_nc, nf=nf, nb=nb, upscale=mscale, plus=plus)
-        model.load_state_dict(state_dict)
-        model.eval()
-
-        return model
+        return modelloader.load_spandrel_model(
+            filename,
+            device=('cpu' if devices.device_esrgan.type == 'mps' else None),
+        )
 
 
 def esrgan_upscale(model, img):
diff --git a/modules/esrgan_model_arch.py b/modules/esrgan_model_arch.py
deleted file mode 100644
index 2b9888baf..000000000
--- a/modules/esrgan_model_arch.py
+++ /dev/null
@@ -1,465 +0,0 @@
-# this file is adapted from https://github.com/victorca25/iNNfer
-
-from collections import OrderedDict
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-####################
-# RRDBNet Generator
-####################
-
-class RRDBNet(nn.Module):
-    def __init__(self, in_nc, out_nc, nf, nb, nr=3, gc=32, upscale=4, norm_type=None,
-            act_type='leakyrelu', mode='CNA', upsample_mode='upconv', convtype='Conv2D',
-            finalact=None, gaussian_noise=False, plus=False):
-        super(RRDBNet, self).__init__()
-        n_upscale = int(math.log(upscale, 2))
-        if upscale == 3:
-            n_upscale = 1
-
-        self.resrgan_scale = 0
-        if in_nc % 16 == 0:
-            self.resrgan_scale = 1
-        elif in_nc != 4 and in_nc % 4 == 0:
-            self.resrgan_scale = 2
-
-        fea_conv = conv_block(in_nc, nf, kernel_size=3, norm_type=None, act_type=None, convtype=convtype)
-        rb_blocks = [RRDB(nf, nr, kernel_size=3, gc=32, stride=1, bias=1, pad_type='zero',
-            norm_type=norm_type, act_type=act_type, mode='CNA', convtype=convtype,
-            gaussian_noise=gaussian_noise, plus=plus) for _ in range(nb)]
-        LR_conv = conv_block(nf, nf, kernel_size=3, norm_type=norm_type, act_type=None, mode=mode, convtype=convtype)
-
-        if upsample_mode == 'upconv':
-            upsample_block = upconv_block
-        elif upsample_mode == 'pixelshuffle':
-            upsample_block = pixelshuffle_block
-        else:
-            raise NotImplementedError(f'upsample mode [{upsample_mode}] is not found')
-        if upscale == 3:
-            upsampler = upsample_block(nf, nf, 3, act_type=act_type, convtype=convtype)
-        else:
-            upsampler = [upsample_block(nf, nf, act_type=act_type, convtype=convtype) for _ in range(n_upscale)]
-        HR_conv0 = conv_block(nf, nf, kernel_size=3, norm_type=None, act_type=act_type, convtype=convtype)
-        HR_conv1 = conv_block(nf, out_nc, kernel_size=3, norm_type=None, act_type=None, convtype=convtype)
-
-        outact = act(finalact) if finalact else None
-
-        self.model = sequential(fea_conv, ShortcutBlock(sequential(*rb_blocks, LR_conv)),
-            *upsampler, HR_conv0, HR_conv1, outact)
-
-    def forward(self, x, outm=None):
-        if self.resrgan_scale == 1:
-            feat = pixel_unshuffle(x, scale=4)
-        elif self.resrgan_scale == 2:
-            feat = pixel_unshuffle(x, scale=2)
-        else:
-            feat = x
-
-        return self.model(feat)
-
-
-class RRDB(nn.Module):
-    """
-    Residual in Residual Dense Block
-    (ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks)
-    """
-
-    def __init__(self, nf, nr=3, kernel_size=3, gc=32, stride=1, bias=1, pad_type='zero',
-            norm_type=None, act_type='leakyrelu', mode='CNA', convtype='Conv2D',
-            spectral_norm=False, gaussian_noise=False, plus=False):
-        super(RRDB, self).__init__()
-        # This is for backwards compatibility with existing models
-        if nr == 3:
-            self.RDB1 = ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                    norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                    gaussian_noise=gaussian_noise, plus=plus)
-            self.RDB2 = ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                    norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                    gaussian_noise=gaussian_noise, plus=plus)
-            self.RDB3 = ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                    norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                    gaussian_noise=gaussian_noise, plus=plus)
-        else:
-            RDB_list = [ResidualDenseBlock_5C(nf, kernel_size, gc, stride, bias, pad_type,
-                                              norm_type, act_type, mode, convtype, spectral_norm=spectral_norm,
-                                              gaussian_noise=gaussian_noise, plus=plus) for _ in range(nr)]
-            self.RDBs = nn.Sequential(*RDB_list)
-
-    def forward(self, x):
-        if hasattr(self, 'RDB1'):
-            out = self.RDB1(x)
-            out = self.RDB2(out)
-            out = self.RDB3(out)
-        else:
-            out = self.RDBs(x)
-        return out * 0.2 + x
-
-
-class ResidualDenseBlock_5C(nn.Module):
-    """
-    Residual Dense Block
-    The core module of paper: (Residual Dense Network for Image Super-Resolution, CVPR 18)
-    Modified options that can be used:
-        - "Partial Convolution based Padding" arXiv:1811.11718
-        - "Spectral normalization" arXiv:1802.05957
-        - "ICASSP 2020 - ESRGAN+ : Further Improving ESRGAN" N. C.
-            {Rakotonirina} and A. {Rasoanaivo}
-    """
-
-    def __init__(self, nf=64, kernel_size=3, gc=32, stride=1, bias=1, pad_type='zero',
-            norm_type=None, act_type='leakyrelu', mode='CNA', convtype='Conv2D',
-            spectral_norm=False, gaussian_noise=False, plus=False):
-        super(ResidualDenseBlock_5C, self).__init__()
-
-        self.noise = GaussianNoise() if gaussian_noise else None
-        self.conv1x1 = conv1x1(nf, gc) if plus else None
-
-        self.conv1 = conv_block(nf, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        self.conv2 = conv_block(nf+gc, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        self.conv3 = conv_block(nf+2*gc, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        self.conv4 = conv_block(nf+3*gc, gc, kernel_size, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=act_type, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-        if mode == 'CNA':
-            last_act = None
-        else:
-            last_act = act_type
-        self.conv5 = conv_block(nf+4*gc, nf, 3, stride, bias=bias, pad_type=pad_type,
-            norm_type=norm_type, act_type=last_act, mode=mode, convtype=convtype,
-            spectral_norm=spectral_norm)
-
-    def forward(self, x):
-        x1 = self.conv1(x)
-        x2 = self.conv2(torch.cat((x, x1), 1))
-        if self.conv1x1:
-            x2 = x2 + self.conv1x1(x)
-        x3 = self.conv3(torch.cat((x, x1, x2), 1))
-        x4 = self.conv4(torch.cat((x, x1, x2, x3), 1))
-        if self.conv1x1:
-            x4 = x4 + x2
-        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
-        if self.noise:
-            return self.noise(x5.mul(0.2) + x)
-        else:
-            return x5 * 0.2 + x
-
-
-####################
-# ESRGANplus
-####################
-
-class GaussianNoise(nn.Module):
-    def __init__(self, sigma=0.1, is_relative_detach=False):
-        super().__init__()
-        self.sigma = sigma
-        self.is_relative_detach = is_relative_detach
-        self.noise = torch.tensor(0, dtype=torch.float)
-
-    def forward(self, x):
-        if self.training and self.sigma != 0:
-            self.noise = self.noise.to(x.device)
-            scale = self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
-            sampled_noise = self.noise.repeat(*x.size()).normal_() * scale
-            x = x + sampled_noise
-        return x
-
-def conv1x1(in_planes, out_planes, stride=1):
-    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
-
-
-####################
-# SRVGGNetCompact
-####################
-
-class SRVGGNetCompact(nn.Module):
-    """A compact VGG-style network structure for super-resolution.
-    This class is copied from https://github.com/xinntao/Real-ESRGAN
-    """
-
-    def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu'):
-        super(SRVGGNetCompact, self).__init__()
-        self.num_in_ch = num_in_ch
-        self.num_out_ch = num_out_ch
-        self.num_feat = num_feat
-        self.num_conv = num_conv
-        self.upscale = upscale
-        self.act_type = act_type
-
-        self.body = nn.ModuleList()
-        # the first conv
-        self.body.append(nn.Conv2d(num_in_ch, num_feat, 3, 1, 1))
-        # the first activation
-        if act_type == 'relu':
-            activation = nn.ReLU(inplace=True)
-        elif act_type == 'prelu':
-            activation = nn.PReLU(num_parameters=num_feat)
-        elif act_type == 'leakyrelu':
-            activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
-        self.body.append(activation)
-
-        # the body structure
-        for _ in range(num_conv):
-            self.body.append(nn.Conv2d(num_feat, num_feat, 3, 1, 1))
-            # activation
-            if act_type == 'relu':
-                activation = nn.ReLU(inplace=True)
-            elif act_type == 'prelu':
-                activation = nn.PReLU(num_parameters=num_feat)
-            elif act_type == 'leakyrelu':
-                activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
-            self.body.append(activation)
-
-        # the last conv
-        self.body.append(nn.Conv2d(num_feat, num_out_ch * upscale * upscale, 3, 1, 1))
-        # upsample
-        self.upsampler = nn.PixelShuffle(upscale)
-
-    def forward(self, x):
-        out = x
-        for i in range(0, len(self.body)):
-            out = self.body[i](out)
-
-        out = self.upsampler(out)
-        # add the nearest upsampled image, so that the network learns the residual
-        base = F.interpolate(x, scale_factor=self.upscale, mode='nearest')
-        out += base
-        return out
-
-
-####################
-# Upsampler
-####################
-
-class Upsample(nn.Module):
-    r"""Upsamples a given multi-channel 1D (temporal), 2D (spatial) or 3D (volumetric) data.
-    The input data is assumed to be of the form
-    `minibatch x channels x [optional depth] x [optional height] x width`.
-    """
-
-    def __init__(self, size=None, scale_factor=None, mode="nearest", align_corners=None):
-        super(Upsample, self).__init__()
-        if isinstance(scale_factor, tuple):
-            self.scale_factor = tuple(float(factor) for factor in scale_factor)
-        else:
-            self.scale_factor = float(scale_factor) if scale_factor else None
-        self.mode = mode
-        self.size = size
-        self.align_corners = align_corners
-
-    def forward(self, x):
-        return nn.functional.interpolate(x, size=self.size, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners)
-
-    def extra_repr(self):
-        if self.scale_factor is not None:
-            info = f'scale_factor={self.scale_factor}'
-        else:
-            info = f'size={self.size}'
-        info += f', mode={self.mode}'
-        return info
-
-
-def pixel_unshuffle(x, scale):
-    """ Pixel unshuffle.
-    Args:
-        x (Tensor): Input feature with shape (b, c, hh, hw).
-        scale (int): Downsample ratio.
-    Returns:
-        Tensor: the pixel unshuffled feature.
-    """
-    b, c, hh, hw = x.size()
-    out_channel = c * (scale**2)
-    assert hh % scale == 0 and hw % scale == 0
-    h = hh // scale
-    w = hw // scale
-    x_view = x.view(b, c, h, scale, w, scale)
-    return x_view.permute(0, 1, 3, 5, 2, 4).reshape(b, out_channel, h, w)
-
-
-def pixelshuffle_block(in_nc, out_nc, upscale_factor=2, kernel_size=3, stride=1, bias=True,
-                        pad_type='zero', norm_type=None, act_type='relu', convtype='Conv2D'):
-    """
-    Pixel shuffle layer
-    (Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional
-    Neural Network, CVPR17)
-    """
-    conv = conv_block(in_nc, out_nc * (upscale_factor ** 2), kernel_size, stride, bias=bias,
-                        pad_type=pad_type, norm_type=None, act_type=None, convtype=convtype)
-    pixel_shuffle = nn.PixelShuffle(upscale_factor)
-
-    n = norm(norm_type, out_nc) if norm_type else None
-    a = act(act_type) if act_type else None
-    return sequential(conv, pixel_shuffle, n, a)
-
-
-def upconv_block(in_nc, out_nc, upscale_factor=2, kernel_size=3, stride=1, bias=True,
-                pad_type='zero', norm_type=None, act_type='relu', mode='nearest', convtype='Conv2D'):
-    """ Upconv layer """
-    upscale_factor = (1, upscale_factor, upscale_factor) if convtype == 'Conv3D' else upscale_factor
-    upsample = Upsample(scale_factor=upscale_factor, mode=mode)
-    conv = conv_block(in_nc, out_nc, kernel_size, stride, bias=bias,
-                        pad_type=pad_type, norm_type=norm_type, act_type=act_type, convtype=convtype)
-    return sequential(upsample, conv)
-
-
-
-
-
-
-
-
-####################
-# Basic blocks
-####################
-
-
-def make_layer(basic_block, num_basic_block, **kwarg):
-    """Make layers by stacking the same blocks.
-    Args:
-        basic_block (nn.module): nn.module class for basic block. (block)
-        num_basic_block (int): number of blocks. (n_layers)
-    Returns:
-        nn.Sequential: Stacked blocks in nn.Sequential.
-    """
-    layers = []
-    for _ in range(num_basic_block):
-        layers.append(basic_block(**kwarg))
-    return nn.Sequential(*layers)
-
-
-def act(act_type, inplace=True, neg_slope=0.2, n_prelu=1, beta=1.0):
-    """ activation helper """
-    act_type = act_type.lower()
-    if act_type == 'relu':
-        layer = nn.ReLU(inplace)
-    elif act_type in ('leakyrelu', 'lrelu'):
-        layer = nn.LeakyReLU(neg_slope, inplace)
-    elif act_type == 'prelu':
-        layer = nn.PReLU(num_parameters=n_prelu, init=neg_slope)
-    elif act_type == 'tanh':  # [-1, 1] range output
-        layer = nn.Tanh()
-    elif act_type == 'sigmoid':  # [0, 1] range output
-        layer = nn.Sigmoid()
-    else:
-        raise NotImplementedError(f'activation layer [{act_type}] is not found')
-    return layer
-
-
-class Identity(nn.Module):
-    def __init__(self, *kwargs):
-        super(Identity, self).__init__()
-
-    def forward(self, x, *kwargs):
-        return x
-
-
-def norm(norm_type, nc):
-    """ Return a normalization layer """
-    norm_type = norm_type.lower()
-    if norm_type == 'batch':
-        layer = nn.BatchNorm2d(nc, affine=True)
-    elif norm_type == 'instance':
-        layer = nn.InstanceNorm2d(nc, affine=False)
-    elif norm_type == 'none':
-        def norm_layer(x): return Identity()
-    else:
-        raise NotImplementedError(f'normalization layer [{norm_type}] is not found')
-    return layer
-
-
-def pad(pad_type, padding):
-    """ padding layer helper """
-    pad_type = pad_type.lower()
-    if padding == 0:
-        return None
-    if pad_type == 'reflect':
-        layer = nn.ReflectionPad2d(padding)
-    elif pad_type == 'replicate':
-        layer = nn.ReplicationPad2d(padding)
-    elif pad_type == 'zero':
-        layer = nn.ZeroPad2d(padding)
-    else:
-        raise NotImplementedError(f'padding layer [{pad_type}] is not implemented')
-    return layer
-
-
-def get_valid_padding(kernel_size, dilation):
-    kernel_size = kernel_size + (kernel_size - 1) * (dilation - 1)
-    padding = (kernel_size - 1) // 2
-    return padding
-
-
-class ShortcutBlock(nn.Module):
-    """ Elementwise sum the output of a submodule to its input """
-    def __init__(self, submodule):
-        super(ShortcutBlock, self).__init__()
-        self.sub = submodule
-
-    def forward(self, x):
-        output = x + self.sub(x)
-        return output
-
-    def __repr__(self):
-        return 'Identity + \n|' + self.sub.__repr__().replace('\n', '\n|')
-
-
-def sequential(*args):
-    """ Flatten Sequential. It unwraps nn.Sequential. """
-    if len(args) == 1:
-        if isinstance(args[0], OrderedDict):
-            raise NotImplementedError('sequential does not support OrderedDict input.')
-        return args[0]  # No sequential is needed.
-    modules = []
-    for module in args:
-        if isinstance(module, nn.Sequential):
-            for submodule in module.children():
-                modules.append(submodule)
-        elif isinstance(module, nn.Module):
-            modules.append(module)
-    return nn.Sequential(*modules)
-
-
-def conv_block(in_nc, out_nc, kernel_size, stride=1, dilation=1, groups=1, bias=True,
-               pad_type='zero', norm_type=None, act_type='relu', mode='CNA', convtype='Conv2D',
-               spectral_norm=False):
-    """ Conv layer with padding, normalization, activation """
-    assert mode in ['CNA', 'NAC', 'CNAC'], f'Wrong conv mode [{mode}]'
-    padding = get_valid_padding(kernel_size, dilation)
-    p = pad(pad_type, padding) if pad_type and pad_type != 'zero' else None
-    padding = padding if pad_type == 'zero' else 0
-
-    if convtype=='PartialConv2D':
-        from torchvision.ops import PartialConv2d  # this is definitely not going to work, but PartialConv2d doesn't work anyway and this shuts up static analyzer
-        c = PartialConv2d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-               dilation=dilation, bias=bias, groups=groups)
-    elif convtype=='DeformConv2D':
-        from torchvision.ops import DeformConv2d  # not tested
-        c = DeformConv2d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-               dilation=dilation, bias=bias, groups=groups)
-    elif convtype=='Conv3D':
-        c = nn.Conv3d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-                dilation=dilation, bias=bias, groups=groups)
-    else:
-        c = nn.Conv2d(in_nc, out_nc, kernel_size=kernel_size, stride=stride, padding=padding,
-                dilation=dilation, bias=bias, groups=groups)
-
-    if spectral_norm:
-        c = nn.utils.spectral_norm(c)
-
-    a = act(act_type) if act_type else None
-    if 'CNA' in mode:
-        n = norm(norm_type, out_nc) if norm_type else None
-        return sequential(p, c, n, a)
-    elif mode == 'NAC':
-        if norm_type is None and act_type is not None:
-            a = act(act_type, inplace=False)
-        n = norm(norm_type, in_nc) if norm_type else None
-        return sequential(n, a, p, c)
diff --git a/modules/gfpgan_model.py b/modules/gfpgan_model.py
index 01d668ecd..6b6f17c43 100644
--- a/modules/gfpgan_model.py
+++ b/modules/gfpgan_model.py
@@ -1,8 +1,5 @@
 import os
 
-import facexlib
-import gfpgan
-
 import modules.face_restoration
 from modules import paths, shared, devices, modelloader, errors
 
@@ -41,6 +38,8 @@ def gfpgann():
         print("Unable to load gfpgan model!")
         return None
 
+    import facexlib.detection.retinaface
+
     if hasattr(facexlib.detection.retinaface, 'device'):
         facexlib.detection.retinaface.device = devices.device_gfpgan
     model_file_path = model_file
@@ -81,8 +80,10 @@ gfpgan_constructor = None
 def setup_model(dirname):
     try:
         os.makedirs(model_path, exist_ok=True)
-        from gfpgan import GFPGANer
-        from facexlib import detection, parsing  # noqa: F401
+        import gfpgan
+        import facexlib.detection
+        import facexlib.parsing
+
         global user_path
         global have_gfpgan
         global gfpgan_constructor
@@ -111,7 +112,7 @@ def setup_model(dirname):
         facexlib.parsing.load_file_from_url = facex_load_file_from_url2
         user_path = dirname
         have_gfpgan = True
-        gfpgan_constructor = GFPGANer
+        gfpgan_constructor = gfpgan.GFPGANer
 
         class FaceRestorerGFPGAN(modules.face_restoration.FaceRestoration):
             def name(self):
diff --git a/modules/launch_utils.py b/modules/launch_utils.py
index dabef0f53..c2cbd8ce7 100644
--- a/modules/launch_utils.py
+++ b/modules/launch_utils.py
@@ -345,13 +345,11 @@ def prepare_environment():
     stable_diffusion_repo = os.environ.get('STABLE_DIFFUSION_REPO', "https://github.com/Stability-AI/stablediffusion.git")
     stable_diffusion_xl_repo = os.environ.get('STABLE_DIFFUSION_XL_REPO', "https://github.com/Stability-AI/generative-models.git")
     k_diffusion_repo = os.environ.get('K_DIFFUSION_REPO', 'https://github.com/crowsonkb/k-diffusion.git')
-    codeformer_repo = os.environ.get('CODEFORMER_REPO', 'https://github.com/sczhou/CodeFormer.git')
     blip_repo = os.environ.get('BLIP_REPO', 'https://github.com/salesforce/BLIP.git')
 
     stable_diffusion_commit_hash = os.environ.get('STABLE_DIFFUSION_COMMIT_HASH', "cf1d67a6fd5ea1aa600c4df58e5b47da45f6bdbf")
     stable_diffusion_xl_commit_hash = os.environ.get('STABLE_DIFFUSION_XL_COMMIT_HASH', "45c443b316737a4ab6e40413d7794a7f5657c19f")
     k_diffusion_commit_hash = os.environ.get('K_DIFFUSION_COMMIT_HASH', "ab527a9a6d347f364e3d185ba6d714e22d80cb3c")
-    codeformer_commit_hash = os.environ.get('CODEFORMER_COMMIT_HASH', "c5b4593074ba6214284d6acd5f1719b6c5d739af")
     blip_commit_hash = os.environ.get('BLIP_COMMIT_HASH', "48211a1594f1321b00f14c9f7a5b4813144b2fb9")
 
     try:
@@ -408,15 +406,10 @@ def prepare_environment():
     git_clone(stable_diffusion_repo, repo_dir('stable-diffusion-stability-ai'), "Stable Diffusion", stable_diffusion_commit_hash)
     git_clone(stable_diffusion_xl_repo, repo_dir('generative-models'), "Stable Diffusion XL", stable_diffusion_xl_commit_hash)
     git_clone(k_diffusion_repo, repo_dir('k-diffusion'), "K-diffusion", k_diffusion_commit_hash)
-    git_clone(codeformer_repo, repo_dir('CodeFormer'), "CodeFormer", codeformer_commit_hash)
     git_clone(blip_repo, repo_dir('BLIP'), "BLIP", blip_commit_hash)
 
     startup_timer.record("clone repositores")
 
-    if not is_installed("lpips"):
-        run_pip(f"install -r \"{os.path.join(repo_dir('CodeFormer'), 'requirements.txt')}\"", "requirements for CodeFormer")
-        startup_timer.record("install CodeFormer requirements")
-
     if not os.path.isfile(requirements_file):
         requirements_file = os.path.join(script_path, requirements_file)
 
diff --git a/modules/modelloader.py b/modules/modelloader.py
index 098bcb793..30116932a 100644
--- a/modules/modelloader.py
+++ b/modules/modelloader.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import logging
 import os
 import shutil
 import importlib
@@ -10,6 +11,9 @@ from modules.upscaler import Upscaler, UpscalerLanczos, UpscalerNearest, Upscale
 from modules.paths import script_path, models_path
 
 
+logger = logging.getLogger(__name__)
+
+
 def load_file_from_url(
     url: str,
     *,
@@ -177,3 +181,15 @@ def load_upscalers():
         # Special case for UpscalerNone keeps it at the beginning of the list.
         key=lambda x: x.name.lower() if not isinstance(x.scaler, (UpscalerNone, UpscalerLanczos, UpscalerNearest)) else ""
     )
+
+
+def load_spandrel_model(path, *, device, half: bool = False, dtype=None):
+    import spandrel
+    model = spandrel.ModelLoader(device=device).load_from_file(path)
+    if half:
+        model = model.model.half()
+    if dtype:
+        model = model.model.to(dtype=dtype)
+    model.eval()
+    logger.debug("Loaded %s from %s (device=%s, half=%s, dtype=%s)", model, path, device, half, dtype)
+    return model
diff --git a/modules/paths.py b/modules/paths.py
index 187b94961..030646519 100644
--- a/modules/paths.py
+++ b/modules/paths.py
@@ -38,7 +38,6 @@ mute_sdxl_imports()
 path_dirs = [
     (sd_path, 'ldm', 'Stable Diffusion', []),
     (os.path.join(sd_path, '../generative-models'), 'sgm', 'Stable Diffusion XL', ["sgm"]),
-    (os.path.join(sd_path, '../CodeFormer'), 'inference_codeformer.py', 'CodeFormer', []),
     (os.path.join(sd_path, '../BLIP'), 'models/blip.py', 'BLIP', []),
     (os.path.join(sd_path, '../k-diffusion'), 'k_diffusion/sampling.py', 'k_diffusion', ["atstart"]),
 ]
diff --git a/modules/realesrgan_model.py b/modules/realesrgan_model.py
index 02841c302..332d8f4b1 100644
--- a/modules/realesrgan_model.py
+++ b/modules/realesrgan_model.py
@@ -1,9 +1,6 @@
 import os
 
-import numpy as np
-from PIL import Image
-from realesrgan import RealESRGANer
-
+from modules.upscaler_utils import upscale_with_model
 from modules.upscaler import Upscaler, UpscalerData
 from modules.shared import cmd_opts, opts
 from modules import modelloader, errors
@@ -14,29 +11,20 @@ class UpscalerRealESRGAN(Upscaler):
         self.name = "RealESRGAN"
         self.user_path = path
         super().__init__()
-        try:
-            from basicsr.archs.rrdbnet_arch import RRDBNet  # noqa: F401
-            from realesrgan import RealESRGANer  # noqa: F401
-            from realesrgan.archs.srvgg_arch import SRVGGNetCompact  # noqa: F401
-            self.enable = True
-            self.scalers = []
-            scalers = self.load_models(path)
+        self.enable = True
+        self.scalers = []
+        scalers = get_realesrgan_models(self)
 
-            local_model_paths = self.find_models(ext_filter=[".pth"])
-            for scaler in scalers:
-                if scaler.local_data_path.startswith("http"):
-                    filename = modelloader.friendly_name(scaler.local_data_path)
-                    local_model_candidates = [local_model for local_model in local_model_paths if local_model.endswith(f"{filename}.pth")]
-                    if local_model_candidates:
-                        scaler.local_data_path = local_model_candidates[0]
+        local_model_paths = self.find_models(ext_filter=[".pth"])
+        for scaler in scalers:
+            if scaler.local_data_path.startswith("http"):
+                filename = modelloader.friendly_name(scaler.local_data_path)
+                local_model_candidates = [local_model for local_model in local_model_paths if local_model.endswith(f"{filename}.pth")]
+                if local_model_candidates:
+                    scaler.local_data_path = local_model_candidates[0]
 
-                if scaler.name in opts.realesrgan_enabled_models:
-                    self.scalers.append(scaler)
-
-        except Exception:
-            errors.report("Error importing Real-ESRGAN", exc_info=True)
-            self.enable = False
-            self.scalers = []
+            if scaler.name in opts.realesrgan_enabled_models:
+                self.scalers.append(scaler)
 
     def do_upscale(self, img, path):
         if not self.enable:
@@ -48,20 +36,18 @@ class UpscalerRealESRGAN(Upscaler):
             errors.report(f"Unable to load RealESRGAN model {path}", exc_info=True)
             return img
 
-        upsampler = RealESRGANer(
-            scale=info.scale,
-            model_path=info.local_data_path,
-            model=info.model(),
-            half=not cmd_opts.no_half and not cmd_opts.upcast_sampling,
-            tile=opts.ESRGAN_tile,
-            tile_pad=opts.ESRGAN_tile_overlap,
+        mod = modelloader.load_spandrel_model(
+            info.local_data_path,
             device=self.device,
+            half=(not cmd_opts.no_half and not cmd_opts.upcast_sampling),
+        )
+        return upscale_with_model(
+            mod,
+            img,
+            tile_size=opts.ESRGAN_tile,
+            tile_overlap=opts.ESRGAN_tile_overlap,
+            # TODO: `outscale`?
         )
-
-        upsampled = upsampler.enhance(np.array(img), outscale=info.scale)[0]
-
-        image = Image.fromarray(upsampled)
-        return image
 
     def load_model(self, path):
         for scaler in self.scalers:
@@ -76,58 +62,43 @@ class UpscalerRealESRGAN(Upscaler):
                 return scaler
         raise ValueError(f"Unable to find model info: {path}")
 
-    def load_models(self, _):
-        return get_realesrgan_models(self)
 
-
-def get_realesrgan_models(scaler):
-    try:
-        from basicsr.archs.rrdbnet_arch import RRDBNet
-        from realesrgan.archs.srvgg_arch import SRVGGNetCompact
-        models = [
-            UpscalerData(
-                name="R-ESRGAN General 4xV3",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
-            ),
-            UpscalerData(
-                name="R-ESRGAN General WDN 4xV3",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32, upscale=4, act_type='prelu')
-            ),
-            UpscalerData(
-                name="R-ESRGAN AnimeVideo",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=16, upscale=4, act_type='prelu')
-            ),
-            UpscalerData(
-                name="R-ESRGAN 4x+",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
-            ),
-            UpscalerData(
-                name="R-ESRGAN 4x+ Anime6B",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth",
-                scale=4,
-                upscaler=scaler,
-                model=lambda: RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=6, num_grow_ch=32, scale=4)
-            ),
-            UpscalerData(
-                name="R-ESRGAN 2x+",
-                path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth",
-                scale=2,
-                upscaler=scaler,
-                model=lambda: RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
-            ),
-        ]
-        return models
-    except Exception:
-        errors.report("Error making Real-ESRGAN models list", exc_info=True)
+def get_realesrgan_models(scaler: UpscalerRealESRGAN):
+    return [
+        UpscalerData(
+            name="R-ESRGAN General 4xV3",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN General WDN 4xV3",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-wdn-x4v3.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN AnimeVideo",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-animevideov3.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN 4x+",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN 4x+ Anime6B",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth",
+            scale=4,
+            upscaler=scaler,
+        ),
+        UpscalerData(
+            name="R-ESRGAN 2x+",
+            path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth",
+            scale=2,
+            upscaler=scaler,
+        ),
+    ]
diff --git a/modules/sysinfo.py b/modules/sysinfo.py
index b669edd0c..5abf616b7 100644
--- a/modules/sysinfo.py
+++ b/modules/sysinfo.py
@@ -26,11 +26,9 @@ environment_whitelist = {
     "OPENCLIP_PACKAGE",
     "STABLE_DIFFUSION_REPO",
     "K_DIFFUSION_REPO",
-    "CODEFORMER_REPO",
     "BLIP_REPO",
     "STABLE_DIFFUSION_COMMIT_HASH",
     "K_DIFFUSION_COMMIT_HASH",
-    "CODEFORMER_COMMIT_HASH",
     "BLIP_COMMIT_HASH",
     "COMMANDLINE_ARGS",
     "IGNORE_CMD_ARGS_ERRORS",
diff --git a/modules/upscaler.py b/modules/upscaler.py
index b256e085b..3aee69db8 100644
--- a/modules/upscaler.py
+++ b/modules/upscaler.py
@@ -98,6 +98,9 @@ class UpscalerData:
         self.scale = scale
         self.model = model
 
+    def __repr__(self):
+        return f"<UpscalerData name={self.name} path={self.data_path} scale={self.scale}>"
+
 
 class UpscalerNone(Upscaler):
     name = "None"
diff --git a/requirements.txt b/requirements.txt
index 80b438455..36f5674ad 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,7 @@ basicsr
 blendmodes
 clean-fid
 einops
+facexlib
 fastapi>=0.90.1
 gfpgan
 gradio==3.41.2
@@ -20,13 +21,11 @@ open-clip-torch
 piexif
 psutil
 pytorch_lightning
-realesrgan
 requests
 resize-right
 
 safetensors
 scikit-image>=0.19
-timm
 tomesd
 torch
 torchdiffeq
diff --git a/requirements_versions.txt b/requirements_versions.txt
index cb7403a9d..042fa708c 100644
--- a/requirements_versions.txt
+++ b/requirements_versions.txt
@@ -5,6 +5,7 @@ basicsr==1.4.2
 blendmodes==2022
 clean-fid==0.1.35
 einops==0.4.1
+facexlib==0.3.0
 fastapi==0.94.0
 gfpgan==1.3.8
 gradio==3.41.2
@@ -19,11 +20,10 @@ open-clip-torch==2.20.0
 piexif==1.1.3
 psutil==5.9.5
 pytorch_lightning==1.9.4
-realesrgan==0.3.0
 resize-right==0.0.2
 safetensors==0.3.1
 scikit-image==0.21.0
-timm==0.9.2
+spandrel==0.1.6
 tomesd==0.1.3
 torch
 torchdiffeq==0.2.3

From b621a63cf68c788487684250856707cb352b82d0 Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Mon, 25 Dec 2023 23:01:02 +0200
Subject: [PATCH 5/7] Unify CodeFormer and GFPGAN restoration backends, use
 Spandrel for GFPGAN

---
 .github/workflows/run_tests.yaml  |   8 ++
 .gitignore                        |   1 +
 modules/codeformer_model.py       | 158 ++++++++---------------------
 modules/face_restoration_utils.py | 163 ++++++++++++++++++++++++++++++
 modules/gfpgan_model.py           | 154 +++++++++-------------------
 requirements.txt                  |   1 -
 requirements_versions.txt         |   1 -
 test/conftest.py                  |  15 ++-
 test/test_face_restorers.py       |  29 ++++++
 test/test_files/two-faces.jpg     | Bin 0 -> 14768 bytes
 test/test_outputs/.gitkeep        |   0
 11 files changed, 302 insertions(+), 228 deletions(-)
 create mode 100644 modules/face_restoration_utils.py
 create mode 100644 test/test_face_restorers.py
 create mode 100644 test/test_files/two-faces.jpg
 create mode 100644 test/test_outputs/.gitkeep

diff --git a/.github/workflows/run_tests.yaml b/.github/workflows/run_tests.yaml
index 3dafaf8dc..cd5c3f868 100644
--- a/.github/workflows/run_tests.yaml
+++ b/.github/workflows/run_tests.yaml
@@ -20,6 +20,12 @@ jobs:
           cache-dependency-path: |
             **/requirements*txt
             launch.py
+      - name: Cache models
+        id: cache-models
+        uses: actions/cache@v3
+        with:
+          path: models
+          key: "2023-12-30"
       - name: Install test dependencies
         run: pip install wait-for-it -r requirements-test.txt
         env:
@@ -33,6 +39,8 @@ jobs:
           TORCH_INDEX_URL: https://download.pytorch.org/whl/cpu
           WEBUI_LAUNCH_LIVE_OUTPUT: "1"
           PYTHONUNBUFFERED: "1"
+      - name: Print installed packages
+        run: pip freeze
       - name: Start test server
         run: >
           python -m coverage run
diff --git a/.gitignore b/.gitignore
index 09734267f..6790e9ee7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,4 @@ notification.mp3
 /node_modules
 /package-lock.json
 /.coverage*
+/test/test_outputs
diff --git a/modules/codeformer_model.py b/modules/codeformer_model.py
index 517eadfd8..ceda4bab9 100644
--- a/modules/codeformer_model.py
+++ b/modules/codeformer_model.py
@@ -1,140 +1,62 @@
-import os
+from __future__ import annotations
+
+import logging
 
-import cv2
 import torch
 
-import modules.face_restoration
-import modules.shared
-from modules import shared, devices, modelloader, errors
-from modules.paths import models_path
+from modules import (
+    devices,
+    errors,
+    face_restoration,
+    face_restoration_utils,
+    modelloader,
+    shared,
+)
+
+logger = logging.getLogger(__name__)
 
-model_dir = "Codeformer"
-model_path = os.path.join(models_path, model_dir)
 model_url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth'
+model_download_name = 'codeformer-v0.1.0.pth'
 
-codeformer = None
+# used by e.g. postprocessing_codeformer.py
+codeformer: face_restoration.FaceRestoration | None = None
 
 
-class FaceRestorerCodeFormer(modules.face_restoration.FaceRestoration):
+class FaceRestorerCodeFormer(face_restoration_utils.CommonFaceRestoration):
     def name(self):
         return "CodeFormer"
 
-    def __init__(self, dirname):
-        self.net = None
-        self.face_helper = None
-        self.cmd_dir = dirname
-
-    def create_models(self):
-        from facexlib.detection import retinaface
-        from facexlib.utils.face_restoration_helper import FaceRestoreHelper
-
-        if self.net is not None and self.face_helper is not None:
-            self.net.to(devices.device_codeformer)
-            return self.net, self.face_helper
-        model_paths = modelloader.load_models(
-            model_path,
-            model_url,
-            self.cmd_dir,
-            download_name='codeformer-v0.1.0.pth',
+    def load_net(self) -> torch.Module:
+        for model_path in modelloader.load_models(
+            model_path=self.model_path,
+            model_url=model_url,
+            command_path=self.model_path,
+            download_name=model_download_name,
             ext_filter=['.pth'],
-        )
+        ):
+            return modelloader.load_spandrel_model(
+                model_path,
+                device=devices.device_codeformer,
+            ).model
+        raise ValueError("No codeformer model found")
 
-        if len(model_paths) != 0:
-            ckpt_path = model_paths[0]
-        else:
-            print("Unable to load codeformer model.")
-            return None, None
-        net = modelloader.load_spandrel_model(ckpt_path, device=devices.device_codeformer)
+    def get_device(self):
+        return devices.device_codeformer
 
-        if hasattr(retinaface, 'device'):
-            retinaface.device = devices.device_codeformer
+    def restore(self, np_image, w: float | None = None):
+        if w is None:
+            w = getattr(shared.opts, "code_former_weight", 0.5)
 
-        face_helper = FaceRestoreHelper(
-            upscale_factor=1,
-            face_size=512,
-            crop_ratio=(1, 1),
-            det_model='retinaface_resnet50',
-            save_ext='png',
-            use_parse=True,
-            device=devices.device_codeformer,
-        )
+        def restore_face(cropped_face_t):
+            assert self.net is not None
+            return self.net(cropped_face_t, w=w, adain=True)[0]
 
-        self.net = net
-        self.face_helper = face_helper
-
-    def send_model_to(self, device):
-        self.net.to(device)
-        self.face_helper.face_det.to(device)
-        self.face_helper.face_parse.to(device)
-
-    def restore(self, np_image, w=None):
-        from torchvision.transforms.functional import normalize
-        from basicsr.utils import img2tensor, tensor2img
-        np_image = np_image[:, :, ::-1]
-
-        original_resolution = np_image.shape[0:2]
-
-        self.create_models()
-        if self.net is None or self.face_helper is None:
-            return np_image
-
-        self.send_model_to(devices.device_codeformer)
-
-        self.face_helper.clean_all()
-        self.face_helper.read_image(np_image)
-        self.face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
-        self.face_helper.align_warp_face()
-
-        for cropped_face in self.face_helper.cropped_faces:
-            cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
-            normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
-            cropped_face_t = cropped_face_t.unsqueeze(0).to(devices.device_codeformer)
-
-            try:
-                with torch.no_grad():
-                    res = self.net(cropped_face_t, w=w if w is not None else shared.opts.code_former_weight, adain=True)
-                    if isinstance(res, tuple):
-                        output = res[0]
-                    else:
-                        output = res
-                    if not isinstance(res, torch.Tensor):
-                        raise TypeError(f"Expected torch.Tensor, got {type(res)}")
-                    restored_face = tensor2img(output, rgb2bgr=True, min_max=(-1, 1))
-                del output
-                devices.torch_gc()
-            except Exception:
-                errors.report('Failed inference for CodeFormer', exc_info=True)
-                restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))
-
-            restored_face = restored_face.astype('uint8')
-            self.face_helper.add_restored_face(restored_face)
-
-        self.face_helper.get_inverse_affine(None)
-
-        restored_img = self.face_helper.paste_faces_to_input_image()
-        restored_img = restored_img[:, :, ::-1]
-
-        if original_resolution != restored_img.shape[0:2]:
-            restored_img = cv2.resize(
-                restored_img,
-                (0, 0),
-                fx=original_resolution[1]/restored_img.shape[1],
-                fy=original_resolution[0]/restored_img.shape[0],
-                interpolation=cv2.INTER_LINEAR,
-            )
-
-        self.face_helper.clean_all()
-
-        if shared.opts.face_restoration_unload:
-            self.send_model_to(devices.cpu)
-
-        return restored_img
+        return self.restore_with_helper(np_image, restore_face)
 
 
-def setup_model(dirname):
-    os.makedirs(model_path, exist_ok=True)
+def setup_model(dirname: str) -> None:
+    global codeformer
     try:
-        global codeformer
         codeformer = FaceRestorerCodeFormer(dirname)
         shared.face_restorers.append(codeformer)
     except Exception:
diff --git a/modules/face_restoration_utils.py b/modules/face_restoration_utils.py
new file mode 100644
index 000000000..c65c85ef8
--- /dev/null
+++ b/modules/face_restoration_utils.py
@@ -0,0 +1,163 @@
+from __future__ import annotations
+
+import logging
+import os
+from functools import cached_property
+from typing import TYPE_CHECKING, Callable
+
+import cv2
+import numpy as np
+import torch
+
+from modules import devices, errors, face_restoration, shared
+
+if TYPE_CHECKING:
+    from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+
+logger = logging.getLogger(__name__)
+
+
+def create_face_helper(device) -> FaceRestoreHelper:
+    from facexlib.detection import retinaface
+    from facexlib.utils.face_restoration_helper import FaceRestoreHelper
+    if hasattr(retinaface, 'device'):
+        retinaface.device = device
+    return FaceRestoreHelper(
+        upscale_factor=1,
+        face_size=512,
+        crop_ratio=(1, 1),
+        det_model='retinaface_resnet50',
+        save_ext='png',
+        use_parse=True,
+        device=device,
+    )
+
+
+def restore_with_face_helper(
+    np_image: np.ndarray,
+    face_helper: FaceRestoreHelper,
+    restore_face: Callable[[np.ndarray], np.ndarray],
+) -> np.ndarray:
+    """
+    Find faces in the image using face_helper, restore them using restore_face, and paste them back into the image.
+
+    `restore_face` should take a cropped face image and return a restored face image.
+    """
+    from basicsr.utils import img2tensor, tensor2img
+    from torchvision.transforms.functional import normalize
+    np_image = np_image[:, :, ::-1]
+    original_resolution = np_image.shape[0:2]
+
+    try:
+        logger.debug("Detecting faces...")
+        face_helper.clean_all()
+        face_helper.read_image(np_image)
+        face_helper.get_face_landmarks_5(only_center_face=False, resize=640, eye_dist_threshold=5)
+        face_helper.align_warp_face()
+        logger.debug("Found %d faces, restoring", len(face_helper.cropped_faces))
+        for cropped_face in face_helper.cropped_faces:
+            cropped_face_t = img2tensor(cropped_face / 255., bgr2rgb=True, float32=True)
+            normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
+            cropped_face_t = cropped_face_t.unsqueeze(0).to(devices.device_codeformer)
+
+            try:
+                with torch.no_grad():
+                    restored_face = tensor2img(
+                        restore_face(cropped_face_t),
+                        rgb2bgr=True,
+                        min_max=(-1, 1),
+                    )
+                devices.torch_gc()
+            except Exception:
+                errors.report('Failed face-restoration inference', exc_info=True)
+                restored_face = tensor2img(cropped_face_t, rgb2bgr=True, min_max=(-1, 1))
+
+            restored_face = restored_face.astype('uint8')
+            face_helper.add_restored_face(restored_face)
+
+        logger.debug("Merging restored faces into image")
+        face_helper.get_inverse_affine(None)
+        img = face_helper.paste_faces_to_input_image()
+        img = img[:, :, ::-1]
+        if original_resolution != img.shape[0:2]:
+            img = cv2.resize(
+                img,
+                (0, 0),
+                fx=original_resolution[1] / img.shape[1],
+                fy=original_resolution[0] / img.shape[0],
+                interpolation=cv2.INTER_LINEAR,
+            )
+        logger.debug("Face restoration complete")
+    finally:
+        face_helper.clean_all()
+    return img
+
+
+class CommonFaceRestoration(face_restoration.FaceRestoration):
+    net: torch.Module | None
+    model_url: str
+    model_download_name: str
+
+    def __init__(self, model_path: str):
+        super().__init__()
+        self.net = None
+        self.model_path = model_path
+        os.makedirs(model_path, exist_ok=True)
+
+    @cached_property
+    def face_helper(self) -> FaceRestoreHelper:
+        return create_face_helper(self.get_device())
+
+    def send_model_to(self, device):
+        if self.net:
+            logger.debug("Sending %s to %s", self.net, device)
+            self.net.to(device)
+        if self.face_helper:
+            logger.debug("Sending face helper to %s", device)
+            self.face_helper.face_det.to(device)
+            self.face_helper.face_parse.to(device)
+
+    def get_device(self):
+        raise NotImplementedError("get_device must be implemented by subclasses")
+
+    def load_net(self) -> torch.Module:
+        raise NotImplementedError("load_net must be implemented by subclasses")
+
+    def restore_with_helper(
+        self,
+        np_image: np.ndarray,
+        restore_face: Callable[[np.ndarray], np.ndarray],
+    ) -> np.ndarray:
+        try:
+            if self.net is None:
+                self.net = self.load_net()
+        except Exception:
+            logger.warning("Unable to load face-restoration model", exc_info=True)
+            return np_image
+
+        try:
+            self.send_model_to(self.get_device())
+            return restore_with_face_helper(np_image, self.face_helper, restore_face)
+        finally:
+            if shared.opts.face_restoration_unload:
+                self.send_model_to(devices.cpu)
+
+
+def patch_facexlib(dirname: str) -> None:
+    import facexlib.detection
+    import facexlib.parsing
+
+    det_facex_load_file_from_url = facexlib.detection.load_file_from_url
+    par_facex_load_file_from_url = facexlib.parsing.load_file_from_url
+
+    def update_kwargs(kwargs):
+        return dict(kwargs, save_dir=dirname, model_dir=None)
+
+    def facex_load_file_from_url(**kwargs):
+        return det_facex_load_file_from_url(**update_kwargs(kwargs))
+
+    def facex_load_file_from_url2(**kwargs):
+        return par_facex_load_file_from_url(**update_kwargs(kwargs))
+
+    facexlib.detection.load_file_from_url = facex_load_file_from_url
+    facexlib.parsing.load_file_from_url = facex_load_file_from_url2
diff --git a/modules/gfpgan_model.py b/modules/gfpgan_model.py
index 6b6f17c43..a356b56fe 100644
--- a/modules/gfpgan_model.py
+++ b/modules/gfpgan_model.py
@@ -1,126 +1,68 @@
+from __future__ import annotations
+
+import logging
 import os
 
-import modules.face_restoration
-from modules import paths, shared, devices, modelloader, errors
+from modules import (
+    devices,
+    errors,
+    face_restoration,
+    face_restoration_utils,
+    modelloader,
+    shared,
+)
 
-model_dir = "GFPGAN"
-user_path = None
-model_path = os.path.join(paths.models_path, model_dir)
-model_file_path = None
+logger = logging.getLogger(__name__)
 model_url = "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth"
-have_gfpgan = False
-loaded_gfpgan_model = None
+model_download_name = "GFPGANv1.4.pth"
+gfpgan_face_restorer: face_restoration.FaceRestoration | None = None
 
 
-def gfpgann():
-    global loaded_gfpgan_model
-    global model_path
-    global model_file_path
-    if loaded_gfpgan_model is not None:
-        loaded_gfpgan_model.gfpgan.to(devices.device_gfpgan)
-        return loaded_gfpgan_model
+class FaceRestorerGFPGAN(face_restoration_utils.CommonFaceRestoration):
+    def name(self):
+        return "GFPGAN"
 
-    if gfpgan_constructor is None:
-        return None
+    def get_device(self):
+        return devices.device_gfpgan
 
-    models = modelloader.load_models(model_path, model_url, user_path, ext_filter=['.pth'])
+    def load_net(self) -> None:
+        for model_path in modelloader.load_models(
+            model_path=self.model_path,
+            model_url=model_url,
+            command_path=self.model_path,
+            download_name=model_download_name,
+            ext_filter=['.pth'],
+        ):
+            if 'GFPGAN' in os.path.basename(model_path):
+                net = modelloader.load_spandrel_model(
+                    model_path,
+                    device=self.get_device(),
+                ).model
+                net.different_w = True  # see https://github.com/chaiNNer-org/spandrel/pull/81
+                return net
+        raise ValueError("No GFPGAN model found")
 
-    if len(models) == 1 and models[0].startswith("http"):
-        model_file = models[0]
-    elif len(models) != 0:
-        gfp_models = []
-        for item in models:
-            if 'GFPGAN' in os.path.basename(item):
-                gfp_models.append(item)
-        latest_file = max(gfp_models, key=os.path.getctime)
-        model_file = latest_file
-    else:
-        print("Unable to load gfpgan model!")
-        return None
+    def restore(self, np_image):
+        def restore_face(cropped_face_t):
+            assert self.net is not None
+            return self.net(cropped_face_t, return_rgb=False)[0]
 
-    import facexlib.detection.retinaface
-
-    if hasattr(facexlib.detection.retinaface, 'device'):
-        facexlib.detection.retinaface.device = devices.device_gfpgan
-    model_file_path = model_file
-    model = gfpgan_constructor(model_path=model_file, upscale=1, arch='clean', channel_multiplier=2, bg_upsampler=None, device=devices.device_gfpgan)
-    loaded_gfpgan_model = model
-
-    return model
-
-
-def send_model_to(model, device):
-    model.gfpgan.to(device)
-    model.face_helper.face_det.to(device)
-    model.face_helper.face_parse.to(device)
+        return self.restore_with_helper(np_image, restore_face)
 
 
 def gfpgan_fix_faces(np_image):
-    model = gfpgann()
-    if model is None:
-        return np_image
-
-    send_model_to(model, devices.device_gfpgan)
-
-    np_image_bgr = np_image[:, :, ::-1]
-    cropped_faces, restored_faces, gfpgan_output_bgr = model.enhance(np_image_bgr, has_aligned=False, only_center_face=False, paste_back=True)
-    np_image = gfpgan_output_bgr[:, :, ::-1]
-
-    model.face_helper.clean_all()
-
-    if shared.opts.face_restoration_unload:
-        send_model_to(model, devices.cpu)
-
+    if gfpgan_face_restorer:
+        return gfpgan_face_restorer.restore(np_image)
+    logger.warning("GFPGAN face restorer not set up")
     return np_image
 
 
-gfpgan_constructor = None
+def setup_model(dirname: str) -> None:
+    global gfpgan_face_restorer
 
-
-def setup_model(dirname):
     try:
-        os.makedirs(model_path, exist_ok=True)
-        import gfpgan
-        import facexlib.detection
-        import facexlib.parsing
-
-        global user_path
-        global have_gfpgan
-        global gfpgan_constructor
-        global model_file_path
-
-        facexlib_path = model_path
-
-        if dirname is not None:
-            facexlib_path = dirname
-
-        load_file_from_url_orig = gfpgan.utils.load_file_from_url
-        facex_load_file_from_url_orig = facexlib.detection.load_file_from_url
-        facex_load_file_from_url_orig2 = facexlib.parsing.load_file_from_url
-
-        def my_load_file_from_url(**kwargs):
-            return load_file_from_url_orig(**dict(kwargs, model_dir=model_file_path))
-
-        def facex_load_file_from_url(**kwargs):
-            return facex_load_file_from_url_orig(**dict(kwargs, save_dir=facexlib_path, model_dir=None))
-
-        def facex_load_file_from_url2(**kwargs):
-            return facex_load_file_from_url_orig2(**dict(kwargs, save_dir=facexlib_path, model_dir=None))
-
-        gfpgan.utils.load_file_from_url = my_load_file_from_url
-        facexlib.detection.load_file_from_url = facex_load_file_from_url
-        facexlib.parsing.load_file_from_url = facex_load_file_from_url2
-        user_path = dirname
-        have_gfpgan = True
-        gfpgan_constructor = gfpgan.GFPGANer
-
-        class FaceRestorerGFPGAN(modules.face_restoration.FaceRestoration):
-            def name(self):
-                return "GFPGAN"
-
-            def restore(self, np_image):
-                return gfpgan_fix_faces(np_image)
-
-        shared.face_restorers.append(FaceRestorerGFPGAN())
+        face_restoration_utils.patch_facexlib(dirname)
+        gfpgan_face_restorer = FaceRestorerGFPGAN(model_path=dirname)
+        shared.face_restorers.append(gfpgan_face_restorer)
     except Exception:
         errors.report("Error setting up GFPGAN", exc_info=True)
diff --git a/requirements.txt b/requirements.txt
index 36f5674ad..b1329c9e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,6 @@ clean-fid
 einops
 facexlib
 fastapi>=0.90.1
-gfpgan
 gradio==3.41.2
 inflection
 jsonmerge
diff --git a/requirements_versions.txt b/requirements_versions.txt
index 042fa708c..edbb6db9e 100644
--- a/requirements_versions.txt
+++ b/requirements_versions.txt
@@ -7,7 +7,6 @@ clean-fid==0.1.35
 einops==0.4.1
 facexlib==0.3.0
 fastapi==0.94.0
-gfpgan==1.3.8
 gradio==3.41.2
 httpcore==0.15
 inflection==0.5.1
diff --git a/test/conftest.py b/test/conftest.py
index 31a5d9eaf..e4fc56785 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -1,10 +1,16 @@
+import base64
 import os
 
 import pytest
-import base64
-
 
 test_files_path = os.path.dirname(__file__) + "/test_files"
+test_outputs_path = os.path.dirname(__file__) + "/test_outputs"
+
+
+def pytest_configure(config):
+    # We don't want to fail on Py.test command line arguments being
+    # parsed by webui:
+    os.environ.setdefault("IGNORE_CMD_ARGS_ERRORS", "1")
 
 
 def file_to_base64(filename):
@@ -23,3 +29,8 @@ def img2img_basic_image_base64() -> str:
 @pytest.fixture(scope="session")  # session so we don't read this over and over
 def mask_basic_image_base64() -> str:
     return file_to_base64(os.path.join(test_files_path, "mask_basic.png"))
+
+
+@pytest.fixture(scope="session")
+def initialize() -> None:
+    import webui  # noqa: F401
diff --git a/test/test_face_restorers.py b/test/test_face_restorers.py
new file mode 100644
index 000000000..7760d51bf
--- /dev/null
+++ b/test/test_face_restorers.py
@@ -0,0 +1,29 @@
+import os
+from test.conftest import test_files_path, test_outputs_path
+
+import numpy as np
+import pytest
+from PIL import Image
+
+
+@pytest.mark.usefixtures("initialize")
+@pytest.mark.parametrize("restorer_name", ["gfpgan", "codeformer"])
+def test_face_restorers(restorer_name):
+    from modules import shared
+
+    if restorer_name == "gfpgan":
+        from modules import gfpgan_model
+        gfpgan_model.setup_model(shared.cmd_opts.gfpgan_models_path)
+        restorer = gfpgan_model.gfpgan_fix_faces
+    elif restorer_name == "codeformer":
+        from modules import codeformer_model
+        codeformer_model.setup_model(shared.cmd_opts.codeformer_models_path)
+        restorer = codeformer_model.codeformer.restore
+    else:
+        raise NotImplementedError("...")
+    img = Image.open(os.path.join(test_files_path, "two-faces.jpg"))
+    np_img = np.array(img, dtype=np.uint8)
+    fixed_image = restorer(np_img)
+    assert fixed_image.shape == np_img.shape
+    assert not np.allclose(fixed_image, np_img)  # should have visibly changed
+    Image.fromarray(fixed_image).save(os.path.join(test_outputs_path, f"{restorer_name}.png"))
diff --git a/test/test_files/two-faces.jpg b/test/test_files/two-faces.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c9d1b01032a7298d76608c8b65cbb243463491c5
GIT binary patch
literal 14768
zcmbVz^LHju)9n-6wr$(Cp4hfEu`#i2+t$QRCUz#~#7-u-dEfhe_Yb(${adZmeb(t!
zyQ=oC{#yIm1t7~x%18o0KmY)c?+fs?2?zy%{yzf)0|f&E2m3Z~Nbvs{Bn$*3#P<yY
z4Fd%Q1p|!;2Ll6#h>QS_fQ*KMf`*NSO+-Ws^8eO<um1mez}FxE1sX680ssRc1%RS}
zfT4hV4Fk9U-#Y^X`8L4+7$g(~G#EGx02Blk1n^(i|9Suba0pN^NDu(j*9HI{?AwbB
zhWzb3o`TR(%7Gjh6c=V}JYy{4#r2T_%O-u=5UjYkT{xw#g$y#53p2rKXl@en&L-Vt
z!od{Kf9pPAJ%nikUyMwrqZgShm&Q9S5Rc{_Hqv{dJWpmL-+;J78!=IdI-SBlfQ3ts
zJ5;_mZcHC>pBU(=V^Q~-H4M7#g@dO9zX4+!d2gNfw(CEqw0*5$;BaF${~6H~tc~o#
z_=CaCf^y3T4B`<q@}t5LI+>tXfI!ujEhIeVg!1U-Bn4;cI5D`Ju6<r-zST}rm1B;4
z!-mtOLW_yZ5Q%G88P!%30f;FUYDDLQX+vXm-Suly>(vFJnmH}txN}~biq~yee(OMc
zX_Jzrvr-8Bw`n3>f?;Cgig5eMy#^<bZmQQt(~qSZ7E;<+6aUm@yMHMfyki%q6r}x6
zS0^q7Y>l4FnC!wf9R7c|<Cs%nf48fC+73r!SXD4+YB`0fC5?>hQBhx+$hVSM_`$oH
z3~XPl^$2<}H+=zm8571hkCPGKCRb?{E$vDwNlQL-5iK&L{w#Z#3EA3TYvG*EMkU1A
zk*f@MRjxZwZPcwl6=$wCx~_7voQc{@VAP<jsh&f9fV+Io(IrQfxV^Zk_o<cXOn@KV
z)N8eV%u4$|dI+kRSzy1(ds*z3P4}4Qh{^iP-Z^bM)-syhs%OKDam8j%;4z>v_1Mcv
z`?M)V>uVmKNSu~-A1(SP)dkwC{A$JfZ7G=XUeT`6z|i2-qhh%}ISi2tHw>$lOqOhj
z80$M#{939h#mRz#1mq9@yzVS2@>OrDJNzd{hNN*{6dH@~<Suuac2E-KWo!86j>z%;
z?_|t|JKV$;ai&gSEjH!3z?qngyNPx;f$?n*?QUUSDi&9I^k8C~cmHpV1`<ME`fAu}
ziS>+%KDCB>J6^5GDC*R!s<)wn-EW>O=K2v@>)eR0r(CaP{?EQG%`~9QN0k3O&OD5n
zv+O>Q0h3CtJ%_WCHSASGj^tjKYdaX8(@a6GE0!9gP@_-W9V_|D<>N<F>*&PpqLjpp
zWEAv+GrSV**8AFNbkaO0^FH6{68!3zx~{P%{TIL<eirK#%EHPPMkO`X+Bcnoy{cao
zf2f(`A5D>a$ym*;B3zP?WGxg*{gtk*mtmmp-mix7hY4<X;{z;D_FAoO)1p|Qv62xb
zS@OLs-Hu9xNGwRp=P4VUhy@=L2i^J_IfE>GYsC|j-3b9cuH>`3G?5;DxwaK1qAg2p
zk8a@rb3dz0fy*sq(x?ljVJ>OT7eL@mCbhr9u`$wC$Yw~s@IsxUMdfN?u3VG<;aQ0G
zS2l*kKwNjpZ9X#|st`#V)cI3d!V_$b$vs{X@I;`tK18k2>ylT;IBD23zam<fJl-LS
z@rOt&C@A|0Nj*OKFNrY`rLy5lNwibB=1LmcGP|7&TNY4+2*_||+3nxukBV<nGuWfn
zhP8@nZ=_g5STXYn43x&_IhYBR=Bg~E?U+3bzF}*lDA8|ve&ut5o4#EUt8h}PJZ@ql
zK0Ie~aTyZc(~5`Pjjj$Usm1G^xu(j@tH-`8u_22wE%Khn&}ixIJi!fbJEQmSVjn3;
z4^+{Ag!>iA{*(<P1U?t?xPONc6S<4e<aS5+#|gDpSNe16DM!i2)k`%~uSgeCy*H*|
zC6f<EPwLFIi%v7)L#Ke@$BA^OHqN!5;X$Kr-y`N1;2T0okpByz-?;j}APNQ!3IPCt
zghE9_N5R1S221E~4+tp8mTfd^VDyx8+MIx;w)VzQQ{z?LTkY{)t#B=~g;)fbtUwfV
zP9ME-+cbaxaJT7xOqO5k3A7Nwq&As!!1+Co)i#P8YLt;v$~L+sx@?3%&w2t)?u@S$
zLyne9P)pnxxJgBp<l%sj>U5l0p3NCg<+fZlkL+ghzHTA$I+6~3etKo*LhX2k28WLQ
zPtwLRF`0J6?w5a?0(Q{OvN5?CDGR|&h@|VxmCFk)3;m-}F(#HV&AG?4gO2=E5sA#P
zsPc4U82o9>P`Cyi_TgI&;PKHnhNmA-R2(9qQGfONaKDvbHV2~}ug|=&wH2$hD6`$3
zFQjzIxq8<{JXwmkNrf_#XA)JDz(fy`1Uk8YAN^9#E={NSy#8-l0)*qj0G3+aRf->6
zUUc^S7eL07CZ64MZ5^&YOG+f#6v90PSA8S5cSi=rHAlIo&JJp>ckZ?a!IIIyw&Y&U
zLBVz-FS=6h@?RJ!Mw=(~>DE(cpsS3kXfXbb*HgO39vvZ7tZe+PJ3m9OU$TMPZ<1fj
z&yMmWXa2r;988u&QX#9T2Ketp?@aH8FUrNS<krRNRqK_TZe6^@<-%UYiC+Mvx~|_G
zlXp_e?z55pfx>T=ne6e2IsuuqJ9p(OT|XE9>a0jP|4uJ+r?97atpquh)gM_oaT9Ft
zcPG=KTYk`BE0!xc{HJSk;2PBppVp+CcJix>YQ9Aud*-p-2$iGhs_dG)_E{WB@e9!Y
z7wJS(YA?JLK}u@)9^W2A+_j+M!E0gdqlt2=b5gtToMp+i%i5VX9c03-hns&f<Air|
zO?Ry9jN)h-v_9vkBQ+1S7xlT=VoC7CeW17aV#i@E@u2Z!`60%?P2xTpRRiJC?YY5#
zuJw5J5v%+@;&ptdSAC1;?3sTdm%#j2mtb+8Esf=qOJZBV=`^RCAIs?zCe-a(*9h_z
z&D2&vv+m;avqhAG>j%Md>yZE&1nx4l*d{?ORjPx|pXltABl>?3+$hB!l$(G2hl<(`
zZBlKse0pW#g-`bg#(pz->grd0))GLrOn5g$2(e&2c`-Pk#puFQ_go5^_ncMp5sC#J
zs}F`RL(Wwkb^WP4X+cHM<o>fepct2*70F~zf38+^r1FWXYI;duNz-}dbH%dF=X$Yx
zqDsf1A-~)%c1lp2nkAP7@sqt-FkolJt^5-Isyao9V7HEMgR%HU+0POILSrL2zqFW}
zaLdH0)c*2m8>)nNRD;Qw%_{Ma3)Rc%5|~zXTF-@C^L>`$&ZC1X{dzL0-^sRx#CdXU
z>vmXjS}m#vY!s?sG(W#g)xHLn4H*Ujm^$a?&{UZSU8wi+b)O3vxFt=le2mPpf{BmR
zdO=OMY9FP5&0YAu^m4TvL%+F1=os$#@E3rob#<hW73-)sHAebO2JEz*r9mM}YpGa&
zv7UHLP7-ghc3eFoTeddNrKqVbM`b%+WZRn~LAH_dZ&@NqTiM=u1JAlOG_N~2x_q-5
zIs;YXEI3k5szzuFF!u+8b{pkF>7&}YGw+(qMKqrqrj8Q1;*q}eZADq@js(w)d<z>+
zhu#TcSAp=J-BFGdMA5)ow1tP>=X4223u+f#KHGsB6?0L_xO>^T-IH`O4%uc46ZZLQ
z(MToV8@NdJ{F??#C$ossHx!B>13<t)!NI{GL7^eQAiitE|CEs6C=jS5ENG-6Dv+$k
z=wzzSL5ZS`$Qa~oVkRzyP-?+R1M_<~m=vx>gZ~4SU_u~HBsK$?Iuz56+i*5&dP`br
zS;yQ2>gs%Wcy+?^WPb!l)QI-uyUKpXwN&{^95wbFUFF<XCtr6>EhGL_K<Q0L@Gd&s
zMMy5D;ECII*3IaIY{knP&!_eBsFG>`sQi3(A(HUJ>V@fHUg)r%wQAIEp%wy~kU;62
z)nY-IzuO7{p-yLmY$lre!Yq~%D{~j+VDG7LT*An-mUNd{-q2TU&=A{}&D^`lh%puI
zS2QoHTyxNx^g@4h%JPVgi&|UB`_)8yC?RYUxEoY2^UUHF=rD#r1S9bkaEL&jx6WI}
zXL799Gy`{*Ik}~LDAOI9vL8OM&a>Uv;W*}I;W)MNmvPElyO=gwaP_&!)I@aQ3hdoj
zw>e@-;Br-HC$C^<K&;~I=Y7=ERJxGpQnfnG(>^PDgLt~at{&lZA4JAmCqC4++}HIh
z+o9z!!p}t*hJXptjHd2w)dH!-^9Fl7n;2?FC+vqp6!fZQ2`J||^ma8ELE^nwc4&V-
zCDVx{egZk)5d>x1Vl0`n!A-5L=iaKfcGNww=nEin*4bm#z-0X{Fh?eu@GBto&F5_}
z0u@r*oAwL_xI0ecoU%5VJd2WwY&qXj4b@XGpH=erdmKN=bgQ7DF1dRKHajdF5C2w4
z<}q-nSsDo%6at4311$UvkHE|O_s0zWa!FhRNS{wJDCHl?^@l3v=wXvAnv$T8Gr%&p
zL10U#5kkm~9F1MdY=2{1y7DG%2ADMPTkru#2WURYedE<zaH=XMDvEKFcG#pUwAnS6
zT^*WEn;W*uK2&@(LEg&E=g!^`7A2a|$GGO%BZwVxRxct+c;#!1D8_`L)aAm%mrWRK
z&$&<t#QbfnVPfO=z~=LO0$$Zg`i^3)&d-pZuhfCZhzMdoVv`OB|6bO%D`dq{E}bB?
z{`8oPz=&N`gKR2Wh3MuxE{W5W+!cWwrj@1Q^lDAMX}2xMg&{374PuyN&Gt*<ofd?t
zXr3UeJK_Foeh__Y#=*nYz^OHh`m|^Y?D5vGrRIqkufmrlvKu_II+}iQwGoVbH8ef*
z+4829jU<XTnh#{iv(jGdB#e&m=q^HZ4?fS+X-dVnJKpB_RIOuhqsLcv7~lCuN=2t;
z?@9(2*k9rYN3i^aNH!HKr90h@D*k^euYU-7<=OrD^#WRUb3hihyL6ytkNl2xt{L5U
z2hAoqW}8mke%MG0*;QAinfb#3;5(9ntuI#LEVQ1dZ_&5%YpHy%k=3f?x9n~0FBbE#
z<-Pk=Ia76c2C@LOZZeHK0(6Hgu}L<)_yYB{NOKR;l_QTD2VJX*O6yFVpY7-L4X?OT
z5M5QU<A-~LjH09bY`~2wYN9kMJ+sTr!+g34Qz)EYy=DA$Q2X(p(Nve{V?WcM=iS?<
z?tz&H!#7VG;pGN~d%yOm8`emVuYxQpV=4G^35v*z)Lm-;^BNs?4%NDIRVxnYNrWEy
zT;T#6?Gl*Sv*cT|?+jOJRW?dJGaW~Kh$X&H&}uNwFwmfaT3>+b0>U7%sBz~DRsKgd
z{2{?kDRVJ^m7M~nGC~#xB$pDgo0vdsY#PHJZlx2yq#`APvUIpM#EZj)ANoTotLSPP
z6doT9C$SBe{3zWQKzoI4i)AIYTwh9E0h~TI5CDp~=+5?DWqQ1>w$#w1mG$=-_{1Gf
zI13@?zRxD1E~jFXeq)>YZV@LFao_$4RLo!d%VTBB?iV{m@v?aeZ02YZpK6G19jZ_#
ztn!EzQRB{qt?V07M+^hD7pDqoKqx{b&^iXv1_VjmKoIm)VPl^W8Lb5G`^#<or6mi@
zNuMZt^+D34`4L-eT1(dNR@yB&J0KTh(!$NbG0oh81GoaVxHU=;1_CgY@6m6&Y^7LB
zM;p4*!v6>_9Oo;SFq9`L@%b%&+Rww%V@4?GZRS@cRe<?WHcMfE3rPH{A&t%aTfG<d
zYkJRpPn&XYguN+0ip8nn8b$bKs*4MbCy9-wDI1EhuI#wjd3qIO1}{~rp>vcwZ+FiD
zuI?Gtj)yOmk|_wE4ejUrehRqw%Q22&a_NAwFNoL;wLb*NJBZpBAPpT;@R9=@SwTT%
z_r><iBn9ROf2_jsV=?Pw1=w=T#XQovddvMxEBk}0+N)RW3(ym{=9|^SxA!KspV2Df
zZZ`VH`6^XufL=w)>nqEA8Dij4WfOFDXq&x-fux_xU&Et9aw(`9%AH38<{MkVNTjc3
z%iKj7d&eaq5iEML1RO)QI=@t>B>OD~T*u{V8qS6K+`5!m_7UH2CmPOs*|^>g`BikY
zf;cK6hrzD2nWxb@cIlF7$Q_4m?gZt;*8DM`X4(N<x?|nD;Djnv-R-0N8B>n5r!IK|
z=f*>nX^Kr1pRFA~zKYuk!IrlIJz$nIe=3N+03rg8EaF`c<SU35jj{LHvKwvb$QRDv
zhxf0myh}!RAZ3};=IyIYY*t37^LS0MJ>M8FOng9py5qbG@buGw2p6ZJbxK56z!CUs
z=<0Z8ZGbdp<ESK0n(`R88!(uJc17j0=O|Y-Ey%itmt_$2BM<84_~r{hkbBToII?C`
zQJfXt(w>@paj=FlGU{yjmk#wqZv&q3&&KXCjDCx0mp=&=9@Z^-e8N2@mNwVW!siGT
zDqC@l15j%eV3YOf+3c=QkUg5eK9yr&V!rJ#bJ6fZ8(?yCDIuZF`&;`6s;{alDUXHD
zlwcDxkP#2IwrYe0w}y+j@;lm?pSG|N{LkB(2UEBoO~rqNVfccvxy^0|kZ(-02A2^1
z5gqdzg!T^%=A$A-kt%XW3cuyixm&%#pYt)4;7~%OwYo4{{>{JgNUh4t%Pq4nGwygy
z{`{z7{Nk&~f=JfSL~Ap5AZwf$VSpe8mi>HMrRAa|B8AQ~oOk@(@v1E*cZuq>F()~N
zoR3^r`SXoabWKYk+j#52fNg;4$A<!eIMz`T4x{u#@7R;naKdjX{Rm#%HoKE%x&g%x
z_sE^<rCRJ9x3^_T3nK&Sde@S=ky$(xPiD}2q07a2_!%D@b#{egKXtWuo=-_K0`jGP
z7kvqNqG4lK$rAJF2F;!(7;M5tlLdMa;2RgS@=SB0y={noij2LZeHz{upq#k}F`24O
z@W)YR%{$9j>1Ro?P<X@U)p&@CRM+DNVbv^_a43upW$Y@_kh@C5`FriZR75m!!UM0S
zol_`@i2WZ|s06ALblY@?St#AO^G)Gtm@Qgbw76DVuScL0YZuA?1i0-`qL2Nr<k6eV
zeZ(Sxj5S$tK3LorSJf~+(>FjR$aaRGZ?y1?IAD00myXv&mH!G=qW4j`9Oxo8#i`%L
z@D<&&`Q!w<t7wX>^76Q>=;cI!KitXVZE@+hA*bL63hntH5&Yz`$d80D+umrSGWUxI
z8{(}iTqP|#j$#AfQq@(^8mSd^R+GtMd_dis>Iy2&F-ccBm0cp-Xf0->@2*`bqfv@)
z`~)dN)dgYGG$e2mvya@D6^YNzonbSna&LGw1*I$~NB1mm9t<Z91V@mgIOlmViYCKW
zB6SZlm0ze0j}Z4KUI878zd7UJd9a~U48=#k0Fg-hgw6_rs@|7XS$TP0Y<Qq*Jz@st
zRoUAuccY3s9BL3mSnyS8I66Xsz5_pk<DY(nHUZhqc^*0+wYJ?cINbHwtk10{ejsqU
z2L1WFVYTn<e|eQjh^J2M$*Ja;sgYN#yFR<xl@#8V)x^Zl+9vXu?2dL?kVm!M#8OA3
ze%b1lGV4rF#TNFDo};;~%DZfc<i*|l$=W{DxRpCrdj0Nn^aJWc^CeLQC#EYaHXUKf
zKEncTgnN`pmyt{x{Q)@NdYMM4#dC!d*k#$!um56qIR8nc1j2t^8CEMSq4EV7BqLJK
z9<p<O$$$M!3MNQwo%?_ok%MP)bLzcn%hZt_CeAA=t4b<dMg=v2WJ?XPXId7Rcglx}
z(M$HbLVx(fIz(sM8`??!83Bh*UY6Ls<!@sH+eL)JDLZDoSE8w=hL&Mn5^MEmTlD6W
zD~%s)l!*x^dhK~ba#POC`_`UKS6xX>PQ?p<ir*FfS7fTLh}JX~-cT)nNn;s`>t<0+
zNw8E}XQL$#$OUk|3D($@J~U}@4qR`wce7*LMy;gO1BE;-iO}j-!53g2xT@CHmS*wu
z;K;)vq-%t^9vS$1-;UK+HgkxbGCxDlpTOtAfZCm3sPdggD!e%)ahW@1NaO>>!M`%2
zIs5GyGas|QTRM(qpe%NJZ9TVFokD=lnr7FOthK2@3!H{*k{w``Z@6cZ*iDic{|}E5
z&%|i^dRnSL5(#eM_?~B0XdASfE>`&>WAXf_eaf<G8@1N+<bg#Si;KEjVe<Z>4}ZWB
zSC(PDaDyJdPZlN57XUx!>hl~rJ3SE0Ux%I!vp5O2>c{q^eGbTzvT^HNYo@*eo!)6A
zQI_p34xNk3sFz!lUEFTR`OX9grdYvMgBZ=z7FFKiKLsYT*1ttVhp`EB2iV;J%<**s
zpIq32vXm8B@}#nz-l_nUOR%LXd|oV+f01>hj@tNDEZ!5<RS16$3Qp(O=)@{cYwPXN
zU;Omc-Rc=q!!K!4jpOY@cXhm7SC38$Yr~QkV&-z~m;0hOvih;_e7KX!$`MXXp_I|t
z`z`z~msz!Y({i@A?dK>>I`j<)_a^8WEw>!1j&8}AJ%<nYU9n3r*O<@MDw%CK?K69H
zJ$}!an^u8T>-q=E){WY1Do8U1;QdaGz0$zhdjW*y$TkTaUg3`&t<jY+>yH@>xl~y|
z49<`8ogW_004KKdt`|Aqx6^F!c~v>03g_}GM8kt}+FZjXvtVb*{e-$oYD4~Pe8(<E
z@Uquv@8Hnc&JRi)?;0V{uori-`&r+C?(W6Is#}|gC3TsAuwGmNOsj#bL+q7c&ZYO@
zZ|Z4DqfkkPJ1b9hLdOArndMo%cv3iNljs<T7%2lD^Ga*wnE^qe7Q=4VrD=thiA3m{
z<!Q68p^1P&f3IPhApq_$T!jlEjA<9|c!yRx88PHw&q=X2t(wfl)E7a=esLK!<SQjU
z*+q4aGK1E0Az007AMfr-x~<L$Av$RVal|`CHG0UKLgrw%3d<Eor|76YY?|#C(1q`B
zymA&h7?XBn4@4!ZoK*XPV5=nIS!peB#2;3tl}k7C2#4;*%1U*(u&U0H>Z+tp;yde1
zp4y?KCMh|M<hcrj@%x3>)cz(qiR5xLtt8Ia!QD*8Ckq#F1RP{roIHMmP~)p*k5d^T
z)9+66_E^_mp_yyr!K?m0I+V!8mBkT$Mod=<E2q!%XlS!f@*AN4lqObBUX;-G<b0pX
zTor9b23;vi$Jy}NqlyBaA^db=gvp}2HKv(QEeFlg70zSO29fwXZD}&|adEo;;D-t_
zj;Z^rYfBw{;pQ!#`=u5n6|y5&bMMew<}_L^7}#%SObxCsyVxV%Iy^mDe-cA@g<o(&
z+t@qVh13&tq=c3oNL|P{F7Fyx7wnQnbu#!4D+q#->5z{*SAj<abxl2?r?xa@4wrt5
zHkIqy6{0+D>sswbe1+)trb;tV7=xamj#O2@uSwjm*JV%tjM<66>fsR%TMOG7`G!mC
zxp{Qaoaqc;S|qM;ng6T$qjv{2%7t`ZNjv1eH-5>n{L4S;x#>}_oO7&RoYrZgz@t+#
zpZq+Bw=7BQ+6XE@{D(4M1*79y*#vrGK3!Pf*#qn3sGCpr)4UAI^2F%zxy=~z=8+-?
zo8r^m2wdcAw{Vk{H^e`c+@aK*i3Q%Pt=#l3pU9<}E7$OAVx$RBbN3de#lt1J583_c
zf%6y@mO5Mn<Rs?3E|e5=r;8keLEf6Zc)ZFCi7l6PXI#6TN+8gAZ#e<8t2*kK_oLQ*
z_|kn^N~Flm-pci^<zk5}6s=Q#m9BQR@I1@hjp?rI@?X9ANY&mNf52MN8)Q#F-l5*0
z!u69%H%k4_&7XSyZ^bJ5Ly=VJoVneJo_PamgWsaYV68CWif)|i7of1)Z_ZDyz}7yA
zGTkp7lFoDIw0Wv-0QzR0gwXcaS%Z$V{fnOH6t?H5O{66<4GV8gV_ZRgk((V9W99Q6
zzm-~02jmokSg}jgjBD*3@Rp0FEh;Vt^^X0t1gEm$_}G7|8YMGDWrhD1F6S?)o}RZ=
zM;t?6x}LRu`Z%I+Q-?T%Mf@B^Y`?9U^*Iim5()47oCO2=YCFnzTIfUxS=_nh?p9KB
zPh;^}JI$Agn>>CskFEa#gk9(Iy^lyCdb$e$Uyf-o(7BKe`L8eWJVhn*<QvWBd|8^!
zY#{9QdWI*Z-n5I5d=ITgb84kn;DRS*DIen5`hv`gvCap+0BiI!Q}2PFGL0uen|vbv
z2WA`ml2SrbKD63vo78$@fv`MUB5><vZZXh}eN~oTid9Xu>w5R8YJ*!x?Vc`K7cRUS
zm+0EH_ace>k6p|Q@=vcnCwDUSC?kH2&%2!w>hV_N;RR<yXoV}}c#{s%F_nKSS4a^^
z`J9cN7ed0JKm70wp|P}$oX;{jJyciPr-z0DCN{znyD$XoZWo2eWZ2{2P4PM@BtnMI
zDUsh+v(;s{z4|~-m#Ce*J8|isxyY0UZYs&zdvZIo{C&!K;sfp>1H<(`OPyJ0{IH>(
zU_XCDysr=$HI3^?mgOCsD%(r2X)h)Dm*P<Uw*LaKAG4H(mU9fTD72=SA*)GD$#%EO
zUxYgK?8q^#y!@fi^=Z6I8x2V8f=G?E+D6^5>cZ<Ordvn8b2!u)ptuL=qUhW-;X{AR
zGUi8oZv0qm#C0^QZf|WckNpB*6>3*+hBuD!mGXKx>Xt!{&dz8Qq-?9nVIj6ylw@`i
z*$<pbRfYWyKpIAm4SsVL3%NYUz(k=t7^3y7M@um0&~|B~%)O&*&i80>@XjkWFAvW8
zv^*9f{-w3TZ%K(gn_IJ-M>>C-tNbJk`P}|5As;zU>Xq&9oy~KKDvI6jt|@J$vUpUa
zLTJG*q}+(fbI$!F75+`Gp6E}}$r(&lgyc8$t7;%GeRut`7Jb9;KcKo#nm3eJ?vuFr
zYmp1lkwyx;n~~M`r@SVMi~eRwDUiQe(f_FK|CQJOqppJhP)JxrRE)ow(ZWXLft&fg
z{}qjdko75Ir;y=i6=sr|d#vs8ESIl%mC1RfK*DCnwL?A%GBi4jaQV&AfSN3+`VyrZ
zg+0=eR<8~!Aqib^K}H=Yc+sjcPA!dM%<V4N96Co&&(8g}s%gfi&N>!wFc~YSadN!a
z?MLQuvPF}Nb;thu$jBH_>OK_Wzb=TQ{YN}lyCD%p(hwqCs;%{9JoN|I&uIW(fFNk;
z&??v%RV!+f$V!7W^L@;@5k}Hsm-XNjJM7FA*~*`~6>=O-OPm|#^HympQ9q5-J4Gz$
z8AKR$Q>w058{}QnWpo;qD@Brs$qzKKyJiN)!)kXAbgWs{R8SHQNEbS9F>hQS`hNm#
z2*xxclM8-|!Cx9z^n*TNp;*TRg$x(iBcl%47~z-A4_{i)g;SIxLn)dDnmZ|UrmyYE
zLbj~{q;Ihy^@dfC0!uNA8D(V0M^VRK47Rp7HONwXVA|RmG!ueqDHc@PbgxQ@bNjo5
zsX7k_2q3RWgPELuZiN(=r5*U04iJz{Fi44Uol9a*sUE{)atuS$6hIs!N!zas+)-(g
zbrTDRqk*UElA975vw?<7qN~bcb;wWl%BnV5vY4ss`J%}KMzAbd@*88fH=?qE8VtRo
z5X@3|TqT7f1+<8Nr0ZKtup+GiLIXjUMCt<k%;pXc`Npj5j9L)7Ag+FRnAohsjLn>X
z$U>YAScf#70Ke4|QW<83Uqrk)IhYuXL$*4eH~-#ACOU8pqZaB&Ni?H~B(ZWr$5jnB
zA#1BUz{LJ0b;iZEhKc8lm8uG4HGqhStf3Fhb?R0th<=KgCdhb!5nkLiM1nL8yUl@W
z{TvnQP-qyBHpQ*DUC7(!?@E6;CwYTR@3g!}#KsKNDpvjibdxi!8|7VTw8vek!n6Gl
zL=guVN%_+E_3vSEGDlPxQ8A7XBFPNX4Vm!7#|=wXd0>+lb3}G<VBqi%w=FO(#>l_H
z5L1k$Bhx7&vAk|Bs7Kb7+8xo@cQjV2fK{lq2*DO{G*^FkSvVNDmt*}xTU%8a!b;P0
zsB0ex)EPY8=aHY(VP1wf<Mv(>yO2SoI5Ph|xXKt3dYZvi!lT*8EP4T=u0I^!R)b2w
zYRlot*w#sT7xd8W&t#Wj)h4T#&zRGn)Hv6D>kc{4m<3-ZGA(kL`D)!skE)F{+>!^O
z^nMNt?Eyqw;+KNlTk_L-)MOOqyS=!D2cjpf%kQnF+@Hf60uJm&@3^LCCUIZ^6)=dZ
zB*%<?QE5!%V@0;Nueq72C7|UOM_}6m6>7A*naCI_Yq`NJI*t&29KxQ6pL>h)>B<Ah
zAq!4=PZZOb0H=vJS#XXx`FQ2pl3Oqk`FmJ@*1!3O-zhZ;`U@cIBDVx8Gy<*VxI~CF
zh?^32x3G)Z;Qp@~BJjulY4ky*NPQZ#mUWD$VKE9xJ(9Soba;k#?mZ1=(1Q~zLMmf)
z-bZHjD}-~k#x=ExFcqt((}UVg9Nc|OFS6(cU_I1*C9RWDhi;t4dJMHT+XO*)CRcqJ
zjZ`x&ng>Zj%F17%DuCEyeDj^miYTde-$fN9f^|$GDYntaZ@Ua+vPJIhFRtNREeZkY
z$ZvRWN=hR#emy3!3b_?yb$<}@U-HIgsu^VlF48f)6-4`NB05R?gEkE3`ZTob{w700
z{-T|rov7=xAIP?{SbV|#N*z?EdSb)i0O2E;cAtVEtO(@gEu&K>&i-MLrkYqM3A6}~
zB>H_5Q*DR60fAFi%=V&P`*=u}WS|}<T~qajJFchBWNd@W7XVsn*cMQS&e3;~emb@e
z5%j=k*=B0Nd9_awA~IoYjsZN=C6LS7lfWha;rABr(9mBmsV&=u>4v|^saDD=JDj)3
zsB}2=0Tbge%FOqm>?>RlkNzY(BCPXH1TP-P4|ip-ze2SGL0Y<{`_1E-%2QN>p<rQe
zf`JpR;B~phS;%3wU)`a2Xf^QdK)lQKYoT|{?`DFD?dkS|N}<;}Akub(w;+fj<(7^Y
zNuFlrozd~LW&KX@NS{~_2c0$e0)QgbqHbD52KTgF>CQV_`!eVh53<9%kyJ);4C^z?
zE)by9wlyM<vRGhL#gJ@{OqwjKL<a4yc!}Ukr(~y(C)FvN>~8tbsWmf#;yLwWr#f+m
z%Z8`&PjF>`iTr?EJGg+)q&dJG9)>Q0q}DgsZ`h;Io(HVI*;8~s$YHr8jieU&>sCas
z)#PVO?rJa2Qgi&m-j_|+P?oD$1^KaLG6`TsnvX0+C7RCt9n4FM53@N=`NuloeJvNG
z>t@DBOC+4lXp0ig6s%O>K8%jAP^Zul3wqHE%xqdsEoB-t(OREE2+9)Z-^S_}o}=kw
zy9dE=y{Am2FSiXbAe1kSh<wkb)<4WGJ*5<=ZAu0)pz^>|AnkzHM{yTX3?~GF*v!*g
zy=6q?QmE845+xO4CWK$kP9ite%LHCj8Pe40<Ui6bSPQ!=MlDO_ga=@r8#P8o0?KLm
z_%KXi-LN$B7^QDYa_2z@?ZewmD{^KO4!fl388>Px6H`Q`GYw)nLL1IwV1(e2rKh2%
z0bPn-ooV%Nhc&N<Hmf{EnqxO%)e-kQWvZ+pv=tQBr2Uedf0~t%H5Ko^a*e8OO?0De
zbjEl^XC9X8b%SjnQwme(;lmd$e*xGkW$R|t5SwbPcIo=;BH=tQ4)D?s(@^M?8!Ett
z2Xz7a%H`)CL?I|)`chE{u<|}Viu!sWsZ6I*6+>{*m@t0YuKlP>IHu5U(UG`6b4tC^
zj*0-A%V;Qx80nkg_(Gjd7c=Ur4;XC}&a_^kqKm?NJXv+Aa-O9qUeVrW8?@%uMsH)x
zYq~|jl;)s)uXHIQCn2XXijom+7HAe=coQE2Kf|f^^7|K_gH+;<dF;$87@_Ct7t!<m
z3mWIa?Q59|(~eaic3;es%i(jk(PfoV-a~!i8x*QD^+OTuiTPYrZ}z!;$HFUUZN124
zXkPZx1f9nJKu4ei9`F6206FkIt@H-h|A+w7d_=1b@u=Bbd?85|=ZrMK8*t8-#ODif
z@p4{|!?9>zL7}^8y+_M({LC}FInu9)Z_;a)k-?MPV%Jv-Cn!y?*!k%d%a*Z@Hfl)U
z7GRM_+OWHVeRJU)#%)CA6&;X?PKc^rsu>SJ;YusniL|R1FXRa+2%f&{i-Wc@JoN}$
zTC38@){YHkh9s<_Kr(_hsV8&l5irsTVA%&P1U#gB;)r@X4T<Azs__mM<CxA4Tswdg
zrR|kN6u&l0>*PHNnSihhMONo%6e4C%yBoI3lQwvFBmYuv@~fJx)8Yx1aKnNIjl;=a
zqL#BuxbT6aHPXyoJWwQ!Xi#TbegtU|SIJlDTamywlFfk}rD+J(Hs-=aFp@7vuVD=_
zWnZYQ856b@M7p3n;qfO<ZXEAN&zXwb<?fgy&JWU{;~fTP{w-TjLDL!zza!ohL_pl`
z<t<Tg%2Y<tU=y+BfE+|>k8ZfYUQjlW-IOKfG8w7r6=VIL`Y(<UIXGc5r6slCm6(^&
z8w(6mk9sgSl7l@E=?DSd2%~z;vg2*>a<pA4ZjX2doSlBH6YEn>FPuNpH4)7&2G6#a
zxQw6z=+iJhQe}L)B}Ido9x)&Q1FVu(5U&q$iP4ohhVkOGX4b~vaWFZ`tJY!BO@WH~
zCR5zXZ2fAmJOEFv_?$>m6)MIE+R7CM@BOvRgR%{&OxUqSYrLt<plXqEX_Sz=HrTc!
zOcOe(b}@e|L~q%Rj^U~|jy_UbYVcVYXXxkQ5fG4um!n&rlVUtX2)UFo&Bl<_L5*0x
z0RBdh>abAsefMw3QRfK8j|s+0eWg%;-q)z|jn`RCj5Usd#Gd?=@lCgpymXMiDq?T&
z^9We0P5v(a=zt!inyXEdv1@UF{Q@YDtcM?=I?FgA&OtT9Lp5OrBqNt5je)WRXX%?p
zKGVg$^o}{D9tIc#=$edbryl{u9j_&$G=n1ld?J6t>VuR)G{_A`2PA<t+y5h{)8MG-
zVM8zkV#X~y8G>tz<+0|B-1lvx6orHoD5qNWJO&#U0aqI~r;s%nHm}g`u?a2zQ8=Bh
zW>GXC3{{KsDYtK^Zi!b-tF9Cu6|Wdsb`9-g)F2uvcNiTw1C8b5?2S!W=dI+A3&HhH
zhs{i8NtYF`ZU$;Ff8Rf5nzey!_fIFKMmkaV(BFtyxF$fc+}iyu8sxOlXdN|mYDk|M
z5L|h`!sY~MI)&n=Jpx2`YN)S2a~tJWuFhyF=H=}au=}W?oq`7k`xok=*yx-*qhT^D
zJ}WSbG?Pt+`xaDSG?OAcN-|d30?IJT*VRoa53I~$*O-{;Jz273WkGdEz{mLfPd=49
zq*$TuQFI$D-lK6Cfj7=*L$EF7T|p}NnF6d$UZIZgSExmQO$aO&ZkcwAH?7|>aEFlr
z$AUnxbL4eR$;l2xnOX>=mj%Z~HZo%obwr74ClY;?nIlHL2#|H%(u6re?7*j)+3~Zi
zEHgQNVaQ4aFF(JEuq+NwE!)>%X5HDqVVy~kBhbB7$<;$p=ow#A6H$B-7H)IMqo9;g
zx~Nye{22!kLula;jIIG>OXz}lr!7KEIF3lS=I@S;4dr_r8VnTTd$tAQKT|E=9UCYZ
z02~FC1On~5ZzDB!hAbpw6%B$yCwEE2R849e_;3FP3IZYo$iz!?;Tn+wV-~cr(wFHk
zSNwQe{)N}mMxx&t2?^l%EbMe26vpUztp4dx?qJ&Jn71?%e_Rs6khe5XtyeKnVP-}g
z6<RZg>cL&9d5cXmK0{^%@Q$*M9!5tPkzy?BimxioV3+;76xEa=eFsSZA<H@pY&@Iw
zR;8>2fN@~74Uy&{S14FfisiGbDbfiWQICe$gqp1P-)arz<epPMKK)5U>9~%HgIIzt
znWBM=9okf!<pq)}5wXqaSAaRRT2xU?d6e24rq~~sF$HINR;;cjQIA?Ckeh@hOT{{{
zT`EAn&RPpF?<t<N${<^kyH_O6r&HN2C7}-4sgf{uA(j17#H=fXqO1)-#7BjS%2W2J
zmh*c|>^!B(H63k(6*%Wv{<*VXr}Fe;)Wx|y>!-#7$1$h5!%$^e)~-Wj#J>uC5)zAu
zRH_7*vP>-p8WW_XZd{fWFm;A*`F?^ha#9XuKGj>98`dhC>#}6xAW>^y^&CfMW~jT^
z8EJ7^rJ$@Z^tZ#9=G+Mm>ab*Z_7_3A@X2OAGGuK<nETyD{Y)k*?%&m-@vwB{2WloU
z#4$P60vom8C1l3oAI$d{F3^rFIlF3SFe9|mh|k4axZ)LAMVcS^b@AA}vvRBKuxN-y
zo&VhGh@jisS1G55bpIitn^_+X4jcMONUZ_e5%A7XnuOeK(C&le3qB8`_LO@xlbFtE
z5q$_wMbNR(?(-39Tf%VcEL!<wHTYAvV)Z@htjs@4ODslF+40nyI$}e!N$|jy)R*q`
z<>29+gOv$qBvzfpLl!cUW5Z}kz@JNFViIVnqxl8lqXZxM>xp?qj^e?3#nYxRwI4cn
zl({*;VaV>)x0m3L{w=X2;0&Wob%JvaYeG%wZ4nxTL37Ru^HkrG>q)d&Ln@cB!SAOA
z>xIw<o=sT1R+{6s>Ylmd<+h`9$L;BRR4_occk;@fPd*FUI0a<ncpwg4Ym=4Q@XT7b
z(gEcUL5^#P^JGMf+z>PGWqUfPRE6<-V1or*5)RM6$VQaFr^3bE3CGAT70rWE@BB<U
z_~{dRYB&=PefB$+;BQI?k$~Dzgb2Fj0{N`e#=j~K*b~Q453;`ig!#vHrW7G@83Q9u
zubpWlAltf-b-HiaH3XgC{E*uW=~E%2r2q0DH^eH2GZ_J9!8OVCMXUKBJw1?~8Fc45
zKn>SS4;g5*ipAvUMNYjIQ`KOY)#6Mkvfiaee+ysDG|pWuNWgLMvknHuOF(x`jrIHN
zu~izt8qfjmB;mjqWS^*geCka!eySU`j;oencrZFNO;6dAbl6=@q%5obPOWrcT!%`I
z<B50YuCFScMPJ(59hi3}HmAV2sz<#VK^H8nqAmd~Hfj4$lAoZXfwVzuur}QFl_tiX
z;KjvKC7+|kOC6)5HAZY*e+A<n6R-dPbd%z#zPiE0K)mG*7<<c@&k|zar=n@r@>Vw!
zn+@SOR_W&(n?IlX3leE;9UsdP;RFN45Aln>9xHS&Mz%7P!msVz%3~D^E_6S?0EjQ;
zNSb+r61{<DsfO|e0oH#NBpR%6-x6IBS~5<L0>&#I0Hsv}uSxXIbts;uiZ4i4Zc(m<
z>QXK=N;bXu63saWc);QFCF-X;1>H?}^V1&YI+z!g&ic|)_q}ft8dnC|5R8LhHiy$6
z0!=BHmD;_IgRmQldqNa90n3QxGHP$d<PSi4s9~Zq{Jgl`Fv~czm>>mB<oqN4&MYrh
zUZkJwxhQh7Z%O+LKy6>e0e4PD+2IEHJ3?pJ!#xJ`$zp4v6Wj{`@yd}bo-njyD(j*&
zBVBc_60V*OuKo+LC0xOpsG8601S3Y$>aB$Qi8RMJj-RNHuFIzyI9Jl9N*5c-IJ8#x
zm|MYeum3lc^!Kpceu6pNVLY#AB`?>daKZx0l3V>Z(zE{;>Hjms2nzRqe~I~Dq-TK;
zK_gXhhD2vIMh+q)XA@O*`6lZI<|+OQ_20waLI4era9R`oTIs!TItrNwjy<xNy3wcu
zlZUz}8i3tF6jK;nkUan9d?bez^{%UMV0Y|k<#8aaXL(^GXxT0`dnQ1`-6ZpYiG2ap
ztCT#?hAp(ndMmu(;z#j6WvjOSsO5HcMV=xVjB>nr;eHifMZP&ZU{(%*Z%LoiQ{Mp1
zbZJpr28Fn?lpLNQRt!#lO((i6(?!35tsjF*ECtjk1MPGOgCBKss>qFYFz7DkR{t01
zmPrH^JCY5x`>g>EzpDl&3@Y5uQQcIv%|A&zRO7KYjq4I}c4w2fGwyKY?x>^e<IwO>
z0liw<Y$Vbt_T9gXWsGfnn`Ved*A=PP<*+GndVWZmFz?q9xA#VlFf9qDjqWZ_v+2ms
zO>A~t@uA2bM6Iv{2}kJ)d$BV(E#u1;1b0<!tO!_0zM;7Yex^d(W}-lOfR_3ZkHH6-
z;u+azMNZ?HRuDw;9e2$%>yq+<k~vo(?LA^Rnl-rnGl#-azVg#<uSa@O<2bOkCw?6>
zuYT!KW+c9w?jFxUi+iM{*qw?Q56SUAyA&%QGbADK4Thauk+8uT^QP^uVfoM=>n3{D
zxM8R<AWw!Kg{9+q<^YEyMO`2(+}20nmvNZ<bbT+#`nx#Br4$x)Z%g;QpLAt?X&+m|
zdyD$lf?L^HS|s|RfIb6o`fsi!eVB*>UKlNtGUdwV5l!cGAi+-C1G1nocw_KmFdUV#
z3@YUSyQrVV;K1hn3jm?cLp&#zvtKB-U$(v@l&U>|%Zmg1646L(e=2w!J$)S(gZ-Hc
zZU(%wG6XLoIe>;-f2xjL&O|#tYTvCU21mINssE`wWNlH)*lt0k!F~vGrjN>f^(G60
zGdVn66S?8K<Es+sTZk-Ct^r!P6-c`cJNCAs{<D7oFDq}9Q^A?@(QAo)m_EFaM32ij
zq*b4eVL0P}efrHOMAU=ew36Ufqz!=ISdG1{HN!kuu_-WP!@8x@0iKVJ@3R{DiLE86
zAIEt@Fr%}z#=YQJynIHB&iC&)O^ciX58WOrm8bqE<3VRv9J){=_CNU5A+G(6V>xZ4
zB%|;0nz@o{ZX$#H$Hc(RXpL&=;e1+j*%-POPtqx-@u}!jX{vIM&G2gz_9`$!VmBo=
z6H;hIzt6(kl3Yb4<~YyJKKFyB+=UqrY|dUJn+M)iAD&`NDQd5ujjk!}sj_wO{8Wht
z+y#M3r6r5GU7v$c-eHUw?OeiLu^yw~M{Oc_d8kO3th%LT;sm|#aNgmHHa2S#j-m+0
z0`FNy2^wlUvE4!V*b4{4$l<|LNyXI|2SO*j{w9OIjes0_ROg)MSg8WH*QN@|1Qk5=
z$)8Aa8_?#q{nmW_Gef`AR$n@9PEY1MRlQGnS}Bd{zpL#(9~xIA2b;9c$<s`8vXW6o
zEB}q=%5YecL|y$6`Pang*FCS0nsDJ&x-MLuveJz-wNl6HS9>9e^PP4aA88&T>*|=^
z^tYZ=JJJVT^8BSQTs5{W{z%TVW~{wk{aR_(AF(vD-NRfJxFG*fsg8H&{sOqut3o>U
zR#!>w>h>i4I>SFxQ7$`s!i|-Lj~nSwwhr8UTUBhoWg-(5d_>(7ZAS-O_E#PmIkk{v
zd4eNEkY7OfYE*dl40zHe2Ky<}Tt1?*TuX-8+3r>LFjk@_2N9cnzk)@&*n)c;hkRA@
zoQTf@wDBfoFW^!aCBpZ9tf7S0D6zXyop8+CYK$;jQ$gtifo?J&n8t%=;;EFr&WV$K
zJaf6+JEf5qr2he9=>*G9_DZFTy@ci;<6u+_kyz4BaLlJ<B-k5Jm0y)Su~Ab4Li)ny
zZ_OiIg)fZ-E=z|pq%~rt2z#Ga0wS|DmjqNy`~|fiD&bpkwRK5!B^k+pZK&8YJhLqz
n3t-<$@XWDNV#RBJ(TL9-6X?elB+X^$DglB*vwLs&wf=tq?cI?w

literal 0
HcmV?d00001

diff --git a/test/test_outputs/.gitkeep b/test/test_outputs/.gitkeep
new file mode 100644
index 000000000..e69de29bb

From c756133541da478a35a74cda416d114a8973cf8e Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Wed, 27 Dec 2023 10:55:01 +0200
Subject: [PATCH 6/7] Add experimental HAT model

---
 modules/hat_model.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 modules/hat_model.py

diff --git a/modules/hat_model.py b/modules/hat_model.py
new file mode 100644
index 000000000..553e19411
--- /dev/null
+++ b/modules/hat_model.py
@@ -0,0 +1,42 @@
+import os
+import sys
+
+from modules import modelloader, devices
+from modules.shared import opts
+from modules.upscaler import Upscaler, UpscalerData
+from modules.upscaler_utils import upscale_with_model
+
+
+class UpscalerHAT(Upscaler):
+    def __init__(self, dirname):
+        self.name = "HAT"
+        self.scalers = []
+        self.user_path = dirname
+        super().__init__()
+        for file in self.find_models(ext_filter=[".pt", ".pth"]):
+            name = modelloader.friendly_name(file)
+            scale = 4  # TODO: scale might not be 4, but we can't know without loading the model
+            scaler_data = UpscalerData(name, file, upscaler=self, scale=scale)
+            self.scalers.append(scaler_data)
+
+    def do_upscale(self, img, selected_model):
+        try:
+            model = self.load_model(selected_model)
+        except Exception as e:
+            print(f"Unable to load HAT model {selected_model}: {e}", file=sys.stderr)
+            return img
+        model.to(devices.device_esrgan)  # TODO: should probably be device_hat
+        return upscale_with_model(
+            model,
+            img,
+            tile_size=opts.ESRGAN_tile,  # TODO: should probably be HAT_tile
+            tile_overlap=opts.ESRGAN_tile_overlap,  # TODO: should probably be HAT_tile_overlap
+        )
+
+    def load_model(self, path: str):
+        if not os.path.isfile(path):
+            raise FileNotFoundError(f"Model file {path} not found")
+        return modelloader.load_spandrel_model(
+            path,
+            device=devices.device_esrgan,  # TODO: should probably be device_hat
+        )

From 4ad0c0c0a805da4bac03cff86ea17c25a1291546 Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Sat, 30 Dec 2023 16:37:03 +0200
Subject: [PATCH 7/7] Verify architecture for loaded Spandrel models

---
 extensions-builtin/ScuNET/scripts/scunet_model.py |  2 +-
 extensions-builtin/SwinIR/scripts/swinir_model.py |  1 +
 modules/codeformer_model.py                       |  1 +
 modules/esrgan_model.py                           |  1 +
 modules/gfpgan_model.py                           |  1 +
 modules/hat_model.py                              |  1 +
 modules/modelloader.py                            | 13 ++++++++++++-
 modules/realesrgan_model.py                       |  7 ++++---
 8 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/extensions-builtin/ScuNET/scripts/scunet_model.py b/extensions-builtin/ScuNET/scripts/scunet_model.py
index 18cf8e1a0..5f3dd08b3 100644
--- a/extensions-builtin/ScuNET/scripts/scunet_model.py
+++ b/extensions-builtin/ScuNET/scripts/scunet_model.py
@@ -121,7 +121,7 @@ class UpscalerScuNET(modules.upscaler.Upscaler):
             filename = modelloader.load_file_from_url(self.model_url, model_dir=self.model_download_path, file_name=f"{self.name}.pth")
         else:
             filename = path
-        return modelloader.load_spandrel_model(filename, device=device)
+        return modelloader.load_spandrel_model(filename, device=device, expected_architecture='SCUNet')
 
 
 def on_ui_settings():
diff --git a/extensions-builtin/SwinIR/scripts/swinir_model.py b/extensions-builtin/SwinIR/scripts/swinir_model.py
index 85c18b9e9..aae159af5 100644
--- a/extensions-builtin/SwinIR/scripts/swinir_model.py
+++ b/extensions-builtin/SwinIR/scripts/swinir_model.py
@@ -75,6 +75,7 @@ class UpscalerSwinIR(Upscaler):
             filename,
             device=self._get_device(),
             dtype=devices.dtype,
+            expected_architecture="SwinIR",
         )
         if getattr(opts, 'SWIN_torch_compile', False):
             try:
diff --git a/modules/codeformer_model.py b/modules/codeformer_model.py
index ceda4bab9..44b84618e 100644
--- a/modules/codeformer_model.py
+++ b/modules/codeformer_model.py
@@ -37,6 +37,7 @@ class FaceRestorerCodeFormer(face_restoration_utils.CommonFaceRestoration):
             return modelloader.load_spandrel_model(
                 model_path,
                 device=devices.device_codeformer,
+                expected_architecture='CodeFormer',
             ).model
         raise ValueError("No codeformer model found")
 
diff --git a/modules/esrgan_model.py b/modules/esrgan_model.py
index a7c7c9e30..70041ab02 100644
--- a/modules/esrgan_model.py
+++ b/modules/esrgan_model.py
@@ -49,6 +49,7 @@ class UpscalerESRGAN(Upscaler):
         return modelloader.load_spandrel_model(
             filename,
             device=('cpu' if devices.device_esrgan.type == 'mps' else None),
+            expected_architecture='ESRGAN',
         )
 
 
diff --git a/modules/gfpgan_model.py b/modules/gfpgan_model.py
index a356b56fe..48f8ad5e2 100644
--- a/modules/gfpgan_model.py
+++ b/modules/gfpgan_model.py
@@ -37,6 +37,7 @@ class FaceRestorerGFPGAN(face_restoration_utils.CommonFaceRestoration):
                 net = modelloader.load_spandrel_model(
                     model_path,
                     device=self.get_device(),
+                    expected_architecture='GFPGAN',
                 ).model
                 net.different_w = True  # see https://github.com/chaiNNer-org/spandrel/pull/81
                 return net
diff --git a/modules/hat_model.py b/modules/hat_model.py
index 553e19411..7f2abb416 100644
--- a/modules/hat_model.py
+++ b/modules/hat_model.py
@@ -39,4 +39,5 @@ class UpscalerHAT(Upscaler):
         return modelloader.load_spandrel_model(
             path,
             device=devices.device_esrgan,  # TODO: should probably be device_hat
+            expected_architecture='HAT',
         )
diff --git a/modules/modelloader.py b/modules/modelloader.py
index 30116932a..f4182559e 100644
--- a/modules/modelloader.py
+++ b/modules/modelloader.py
@@ -6,6 +6,8 @@ import shutil
 import importlib
 from urllib.parse import urlparse
 
+import torch
+
 from modules import shared
 from modules.upscaler import Upscaler, UpscalerLanczos, UpscalerNearest, UpscalerNone
 from modules.paths import script_path, models_path
@@ -183,9 +185,18 @@ def load_upscalers():
     )
 
 
-def load_spandrel_model(path, *, device, half: bool = False, dtype=None):
+def load_spandrel_model(
+    path: str,
+    *,
+    device: str | torch.device | None,
+    half: bool = False,
+    dtype: str | None = None,
+    expected_architecture: str | None = None,
+):
     import spandrel
     model = spandrel.ModelLoader(device=device).load_from_file(path)
+    if expected_architecture and model.architecture != expected_architecture:
+        raise TypeError(f"Model {path} is not a {expected_architecture} model")
     if half:
         model = model.model.half()
     if dtype:
diff --git a/modules/realesrgan_model.py b/modules/realesrgan_model.py
index 332d8f4b1..2a2be5ad7 100644
--- a/modules/realesrgan_model.py
+++ b/modules/realesrgan_model.py
@@ -1,9 +1,9 @@
 import os
 
-from modules.upscaler_utils import upscale_with_model
-from modules.upscaler import Upscaler, UpscalerData
-from modules.shared import cmd_opts, opts
 from modules import modelloader, errors
+from modules.shared import cmd_opts, opts
+from modules.upscaler import Upscaler, UpscalerData
+from modules.upscaler_utils import upscale_with_model
 
 
 class UpscalerRealESRGAN(Upscaler):
@@ -40,6 +40,7 @@ class UpscalerRealESRGAN(Upscaler):
             info.local_data_path,
             device=self.device,
             half=(not cmd_opts.no_half and not cmd_opts.upcast_sampling),
+            expected_architecture="RealESRGAN",
         )
         return upscale_with_model(
             mod,