controlnet

Lassi-Ki · Feb 4, 2024 · 36bf8dd · 36bf8dd
1 parent 130c6fa
commit 36bf8dd
Show file tree

Hide file tree

Showing 940 changed files with 151,143 additions and 1 deletion.
diff --git a/extensions/sd-webui-controlnet b/extensions/sd-webui-controlnet
diff --git a/extensions/sd-webui-controlnet/LICENSE b/extensions/sd-webui-controlnet/LICENSE
diff --git a/extensions/sd-webui-controlnet/README.md b/extensions/sd-webui-controlnet/README.md
diff --git a/extensions/sd-webui-controlnet/annotator/anime_face_segment/LICENSE b/extensions/sd-webui-controlnet/annotator/anime_face_segment/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Miaomiao Li
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/extensions/sd-webui-controlnet/annotator/anime_face_segment/__init__.py b/extensions/sd-webui-controlnet/annotator/anime_face_segment/__init__.py
@@ -0,0 +1,172 @@
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+import fnmatch
+import cv2
+
+import sys
+
+import numpy as np
+from modules import devices
+from einops import rearrange
+from annotator.annotator_path import models_path
+
+import torchvision
+from torchvision.models import MobileNet_V2_Weights
+from torchvision import transforms
+
+# One colour per segmentation class, used to paint the output mask.
+# NOTE(review): presumably RGB triples - confirm against whatever consumes the mask.
+COLOR_BACKGROUND = (255,255,0)
+COLOR_HAIR = (0,0,255)
+COLOR_EYE = (255,0,0)
+COLOR_MOUTH = (255,255,255)
+COLOR_FACE = (0,255,0)
+COLOR_SKIN = (0,255,255)
+COLOR_CLOTHES = (255,0,255)
+# AnimeFaceSegment.__call__ indexes this list with the arg-max channel of the
+# network output, so a colour's position in the list is its class index.
+PALETTE = [COLOR_BACKGROUND,COLOR_HAIR,COLOR_EYE,COLOR_MOUTH,COLOR_FACE,COLOR_SKIN,COLOR_CLOTHES]
+
+class UNet(nn.Module):
+    """U-Net style segmentation network built on a MobileNetV2 encoder.
+
+    The encoder is five consecutive slices of torchvision's MobileNetV2
+    ``features`` stack. The decoder mirrors it with five upsampling stages;
+    every decoder stage after the first consumes the previous decoder output
+    concatenated (channel-wise) with the matching encoder activation. The
+    final stage ends in ``Softmax2d``, so ``forward`` returns per-pixel scores
+    over ``NUM_SEG_CLASSES`` channels.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Seven output classes. The original annotation lists them as
+        # "Background, hair, face, eye, mouth, skin, clothes".
+        # NOTE(review): PALETTE orders them background, hair, eye, mouth, face,
+        # skin, clothes - confirm which order the checkpoint actually uses.
+        self.NUM_SEG_CLASSES = 7
+
+        backbone = torchvision.models.mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1).features
+
+        def encoder_stage(first, last):
+            # Re-wrap the chosen backbone layers in a fresh Sequential so the
+            # children are keyed 0..n-1 inside each stage.
+            return nn.Sequential(*[backbone[idx] for idx in range(first, last + 1)])
+
+        # Encoder (channel counts as given by the original annotations)
+        self.en_block0 = encoder_stage(0, 1)      # in_ch=3 out_ch=16
+        self.en_block1 = encoder_stage(2, 3)      # in_ch=16 out_ch=24
+        self.en_block2 = encoder_stage(4, 6)      # in_ch=24 out_ch=32
+        self.en_block3 = encoder_stage(7, 13)     # in_ch=32 out_ch=96
+        self.en_block4 = encoder_stage(14, 16)    # in_ch=96 out_ch=160
+
+        # Decoder: inputs after de_block4 are doubled by the skip concatenation.
+        self.de_block4 = self._decoder_stage(160, 96)
+        self.de_block3 = self._decoder_stage(96 * 2, 32)
+        self.de_block2 = self._decoder_stage(32 * 2, 24)
+        self.de_block1 = self._decoder_stage(24 * 2, 16)
+
+        # Output head: upsample, project to class channels, normalise per pixel.
+        self.de_block0 = nn.Sequential(
+            nn.UpsamplingNearest2d(scale_factor=2),
+            nn.Conv2d(16 * 2, self.NUM_SEG_CLASSES, kernel_size=3, padding=1),
+            nn.Softmax2d(),
+        )
+
+    @staticmethod
+    def _decoder_stage(in_channels, out_channels):
+        """Build one upsample -> conv -> norm -> activation -> dropout stage."""
+        return nn.Sequential(
+            nn.UpsamplingNearest2d(scale_factor=2),
+            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
+            nn.InstanceNorm2d(out_channels),
+            nn.LeakyReLU(0.1),
+            nn.Dropout(p=0.2),
+        )
+
+    def forward(self, x):
+        """Run the encoder, then decode while merging encoder skip activations.
+
+        Returns the output of ``de_block0`` (softmax class scores).
+        """
+        encoder_stages = (self.en_block0, self.en_block1, self.en_block2,
+                          self.en_block3, self.en_block4)
+        decoder_stages = (self.de_block4, self.de_block3, self.de_block2, self.de_block1)
+
+        activations = []
+        feat = x
+        for stage in encoder_stages:
+            feat = stage(feat)
+            activations.append(feat)
+
+        # The deepest activation seeds the decoder; the remaining ones are the
+        # skips, consumed from deepest (e3) to shallowest (e0).
+        for stage, skip in zip(decoder_stages, reversed(activations[:-1])):
+            feat = stage(feat)
+            # Match the skip's spatial size exactly before concatenating.
+            feat = F.interpolate(feat, size=skip.size()[2:], mode='bilinear', align_corners=True)
+            feat = torch.cat((feat, skip), 1)
+
+        return self.de_block0(feat)
+
+
+class AnimeFaceSegment:
+    """Lazily-loaded wrapper that turns an image into a colour-coded segmentation mask.
+
+    The underlying ``UNet`` is created and its weights loaded on the first
+    call; ``unload_model`` moves it back to the CPU without discarding it.
+    """
+
+    # Directory that holds (and receives the download of) ``UNet.pth``.
+    model_dir = os.path.join(models_path, "anime_face_segment")
+
+    def __init__(self):
+        self.model = None
+        self.device = devices.get_device_for("controlnet")
+
+    def load_model(self):
+        """Download the checkpoint if it is missing, then build and load the network."""
+        remote_model_path = "https://huggingface.co/bdsqlsz/qinglong_controlnet-lllite/resolve/main/Annotators/UNet.pth"
+        modelpath = os.path.join(self.model_dir, "UNet.pth")
+        if not os.path.exists(modelpath):
+            from basicsr.utils.download_util import load_file_from_url
+            load_file_from_url(remote_model_path, model_dir=self.model_dir)
+        net = UNet()
+        # NOTE(review): torch.load unpickles the downloaded file; the checkpoint
+        # source must be trusted.
+        ckpt = torch.load(modelpath, map_location=self.device)
+        # Drop any 'module.' fragment from the key names so they match the
+        # attribute names of a bare UNet.
+        for key in list(ckpt.keys()):
+            if 'module.' in key:
+                ckpt[key.replace('module.', '')] = ckpt[key]
+                del ckpt[key]
+        net.load_state_dict(ckpt)
+        net.eval()
+        self.model = net.to(self.device)
+
+    def unload_model(self):
+        """Move the network to the CPU (no-op if it was never loaded)."""
+        if self.model is not None:
+            self.model.cpu()
+
+    def __call__(self, input_image):
+        """Segment ``input_image`` and return the mask painted with ``PALETTE``.
+
+        Args:
+            input_image: array accepted by ``PIL.Image.fromarray``
+                (presumably an H x W x 3 uint8 image - confirm against callers).
+
+        Returns:
+            ``np.uint8`` array of shape (H', W', 3), where H' x W' is the image
+            size after ``transforms.Resize(512)``; each pixel holds the palette
+            colour of its highest-scoring class.
+        """
+        if self.model is None:
+            self.load_model()
+        # The model may have been parked on the CPU by unload_model().
+        self.model.to(self.device)
+        transform = transforms.Compose([
+            transforms.Resize(512, interpolation=transforms.InterpolationMode.BICUBIC),
+            transforms.ToTensor(),
+        ])
+        img = Image.fromarray(input_image)
+        with torch.no_grad():
+            img = transform(img).unsqueeze(dim=0).to(self.device)
+            seg = self.model(img).squeeze(dim=0)
+        # seg is (classes, H', W'): collapse the class axis to a label map and
+        # look every label up in the palette in one vectorised step instead of
+        # a Python loop over every pixel.
+        labels = seg.cpu().detach().numpy().argmax(axis=0)
+        return np.asarray(PALETTE, dtype=np.uint8)[labels]
diff --git a/extensions/sd-webui-controlnet/annotator/annotator_path.py b/extensions/sd-webui-controlnet/annotator/annotator_path.py
@@ -0,0 +1,22 @@
+import os
+from modules import shared
+
+# Directory containing this file; both default locations live underneath it.
+_annotator_dir = os.path.dirname(os.path.abspath(__file__))
+
+# clip vision is always inside controlnet "extensions\sd-webui-controlnet"
+# and any problem can be solved by removing controlnet and reinstall
+clip_vision_path = os.path.join(_annotator_dir, 'clip_vision')
+
+# Resolution order: saved option, then command-line option, then the bundled
+# 'downloads' folder. An empty/None value falls through to the next source.
+models_path = (
+    shared.opts.data.get('control_net_modules_path', None)
+    or getattr(shared.cmd_opts, 'controlnet_annotator_models_path', None)
+    or os.path.join(_annotator_dir, 'downloads')
+)
+
+# Relative settings are interpreted relative to the webui data directory.
+if not os.path.isabs(models_path):
+    models_path = os.path.join(shared.data_path, models_path)
+
+models_path = os.path.realpath(models_path)
+os.makedirs(models_path, exist_ok=True)
+print(f'ControlNet preprocessor location: {models_path}')
+# Make sure that the default location is inside controlnet "extensions\sd-webui-controlnet"
+# so that any problem can be solved by removing controlnet and reinstall
+# if users do not change configs on their own (otherwise users will know what is wrong)
diff --git a/extensions/sd-webui-controlnet/annotator/binary/__init__.py b/extensions/sd-webui-controlnet/annotator/binary/__init__.py
@@ -0,0 +1,14 @@
+import cv2
+
+
+def apply_binary(img, bin_threshold):
+    """Return an inverted black/white version of ``img`` as a 3-channel image.
+
+    ``img`` is converted with ``COLOR_RGB2GRAY`` and thresholded with
+    ``THRESH_BINARY_INV``. A ``bin_threshold`` of 0 or 255 selects Otsu's
+    automatic threshold (and prints the value chosen); any other value is
+    used as the threshold directly.
+    """
+    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+
+    # Explicit threshold: handle it first and leave.
+    if bin_threshold != 0 and bin_threshold != 255:
+        _, mask = cv2.threshold(gray, bin_threshold, 255, cv2.THRESH_BINARY_INV)
+        return cv2.cvtColor(mask, cv2.COLOR_GRAY2RGB)
+
+    # Otsu's threshold
+    chosen_level, mask = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+    print("Otsu threshold:", chosen_level)
+    return cv2.cvtColor(mask, cv2.COLOR_GRAY2RGB)
diff --git a/extensions/sd-webui-controlnet/annotator/canny/__init__.py b/extensions/sd-webui-controlnet/annotator/canny/__init__.py
@@ -0,0 +1,5 @@
+import cv2
+
+
+def apply_canny(img, low_threshold, high_threshold):
+    """Run ``cv2.Canny`` on ``img`` with the given thresholds and return its result."""
+    edges = cv2.Canny(img, low_threshold, high_threshold)
+    return edges
diff --git a/extensions/sd-webui-controlnet/annotator/clipvision/__init__.py b/extensions/sd-webui-controlnet/annotator/clipvision/__init__.py
@@ -0,0 +1,133 @@
+import os
+import cv2
+import torch
+
+from modules import devices
+from modules.modelloader import load_file_from_url
+from annotator.annotator_path import models_path
+from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, CLIPImageProcessor
+
+
+# Keyword arguments for transformers' CLIPVisionConfig, one dict per supported
+# vision encoder. ClipVisionDetector.__init__ expands the selected dict with
+# CLIPVisionConfig(**...).
+config_clip_g = {
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_size": 1664,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 48,
+  "patch_size": 14,
+  "projection_dim": 1280,
+  "torch_dtype": "float32"
+}
+
+config_clip_h = {
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_size": 1280,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 32,
+  "patch_size": 14,
+  "projection_dim": 1024,
+  "torch_dtype": "float32"
+}
+
+# Unlike the two configs above, this one uses the "quick_gelu" activation.
+config_clip_vitl = {
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 1024,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "patch_size": 14,
+  "projection_dim": 768,
+  "torch_dtype": "float32"
+}
+
+# Encoder name -> config dict. ClipVisionDetector looks the name up here.
+configs = {
+    'clip_g': config_clip_g,
+    'clip_h': config_clip_h,
+    'clip_vitl': config_clip_vitl,
+}
+
+# Encoder name -> URL of its weights. Uses the same keys as `configs`;
+# ClipVisionDetector rejects any name that is not a key of this dict.
+downloads = {
+    'clip_vitl': 'https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin',
+    'clip_g': 'https://huggingface.co/lllyasviel/Annotators/resolve/main/clip_g.pth',
+    'clip_h': 'https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/pytorch_model.bin'
+}
+
+
+# Both '.data' files sit next to this module and are loaded at import time
+# onto CUDA when it is available, otherwise onto the CPU. Only the 'uc' entry
+# of each file is kept.
+# NOTE(review): 'uc' presumably stands for an unconditional embedding - confirm.
+_uc_dir = os.path.dirname(os.path.abspath(__file__))
+_uc_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+clip_vision_h_uc = torch.load(
+    os.path.join(_uc_dir, 'clip_vision_h_uc.data'),
+    map_location=_uc_device,
+)['uc']
+
+clip_vision_vith_uc = torch.load(
+    os.path.join(_uc_dir, 'clip_vision_vith_uc.data'),
+    map_location=_uc_device,
+)['uc']
+
+
+class ClipVisionDetector:
+    """CLIP vision encoder that maps an image to the model's outputs and hidden states."""
+
+    def __init__(self, config, low_vram: bool):
+        """Download (if needed) and load the encoder named ``config``.
+
+        Args:
+            config: a key of ``downloads``/``configs`` ('clip_g', 'clip_h' or 'clip_vitl').
+            low_vram: when true the model is kept on the CPU instead of the
+                device reported by ``devices.get_device_for("controlnet")``.
+        """
+        assert config in downloads
+        self.download_link = downloads[config]
+        self.model_path = os.path.join(models_path, 'clip_vision')
+        self.file_name = config + '.pth'
+        self.config = configs[config]
+        if low_vram:
+            self.device = torch.device("cpu")
+        else:
+            self.device = devices.get_device_for("controlnet")
+
+        # Fetch the weights once; later constructions reuse the local file.
+        os.makedirs(self.model_path, exist_ok=True)
+        weights_file = os.path.join(self.model_path, self.file_name)
+        if not os.path.exists(weights_file):
+            load_file_from_url(url=self.download_link, model_dir=self.model_path, file_name=self.file_name)
+
+        vision_config = CLIPVisionConfig(**self.config)
+        self.model = CLIPVisionModelWithProjection(vision_config)
+        self.processor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_convert_rgb=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
+        # Non-strict load: keys that do not line up with the model are tolerated.
+        state = torch.load(weights_file, map_location=self.device)
+        self.model.load_state_dict(state, strict=False)
+        del state
+        self.model.to(self.device)
+        self.model.eval()
+
+    def unload_model(self):
+        """Move the model to the 'meta' device.
+
+        NOTE(review): presumably the detector is discarded after this rather
+        than called again - confirm against callers.
+        """
+        if self.model is not None:
+            self.model.to('meta')
+
+    def __call__(self, input_image):
+        """Encode ``input_image`` and return the model outputs as a plain dict.
+
+        The image is resized to 224x224, preprocessed, and run through the
+        model with hidden states enabled. Every tensor in the result, and
+        every entry of ``hidden_states``, is moved to ``self.device``.
+        """
+        with torch.no_grad():
+            resized = cv2.resize(input_image, (224, 224), interpolation=cv2.INTER_AREA)
+            feat = self.processor(images=resized, return_tensors="pt")
+            feat['pixel_values'] = feat['pixel_values'].to(self.device)
+            outputs = self.model(**feat, output_hidden_states=True)
+            outputs['hidden_states'] = [state.to(self.device) for state in outputs['hidden_states']]
+            result = {}
+            for name, value in outputs.items():
+                if isinstance(value, torch.Tensor):
+                    value = value.to(self.device)
+                result[name] = value
+        return result
diff --git a/extensions/sd-webui-controlnet/annotator/clipvision/clip_vision_h_uc.data b/extensions/sd-webui-controlnet/annotator/clipvision/clip_vision_h_uc.data
diff --git a/extensions/sd-webui-controlnet/annotator/clipvision/clip_vision_vith_uc.data b/extensions/sd-webui-controlnet/annotator/clipvision/clip_vision_vith_uc.data
diff --git a/extensions/sd-webui-controlnet/annotator/color/__init__.py b/extensions/sd-webui-controlnet/annotator/color/__init__.py
@@ -0,0 +1,20 @@
+import cv2
+
+def cv2_resize_shortest_edge(image, size):
+    """Resize ``image`` so its shorter side equals ``size``, keeping the aspect ratio.
+
+    The longer side is scaled by the same factor and rounded to the nearest
+    integer. Resampling uses ``cv2.INTER_AREA``.
+    """
+    height, width = image.shape[:2]
+    # cv2.resize expects the target as (width, height).
+    if height < width:
+        target = (int(round(width / height * size)), size)
+    else:
+        target = (size, int(round(height / width * size)))
+    return cv2.resize(image, target, interpolation=cv2.INTER_AREA)
+
+def apply_color(img, res=512):
+    """Reduce ``img`` to a coarse grid of flat colour blocks.
+
+    The image is first resized so its shorter side is ``res``, shrunk by a
+    factor of 64 with cubic interpolation, then blown back up to the resized
+    dimensions with nearest-neighbour sampling so each coarse pixel becomes a
+    uniform block.
+
+    Args:
+        img: image array with height and width as its first two axes.
+        res: target length of the shorter side before the colour grid is built.
+
+    Returns:
+        The blocky image, with the same height and width as the resized input.
+    """
+    img = cv2_resize_shortest_edge(img, res)
+    h, w = img.shape[:2]
+
+    # For sides shorter than 64 px the integer division yields 0, which
+    # cv2.resize rejects; fall back to a single block along that axis.
+    coarse_size = (max(1, w // 64), max(1, h // 64))
+    input_img_color = cv2.resize(img, coarse_size, interpolation=cv2.INTER_CUBIC)
+    input_img_color = cv2.resize(input_img_color, (w, h), interpolation=cv2.INTER_NEAREST)
+    return input_img_color