# tagging.py

import datetime
import os, time

import pandas as pd
import argparse
import traceback, sys
import re
from pathlib import Path
from typing import List, Tuple, Dict, Any, Optional, Callable, Protocol

import numpy as np
from numpy import signedinteger
from PIL import Image
import timm
from timm.data import create_transform, resolve_data_config
import torch
from torch import Tensor, nn
from torch.nn import functional as F
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import HfHubHTTPError
import concurrent.futures

# Emoticon-style ("kaomoji") strings that contain underscores.
# NOTE(review): nothing in the visible part of this file reads this list;
# presumably these are tag names whose underscores must be kept verbatim --
# confirm before relying on it.
kaomojis: List[str] = [
    "0_0",
    "(o)_(o)",
    "+_+",
    "+_-",
    "._.",
    "<o>_<o>",
    "<|>_<|>",
    "=_=",
    ">_<",
    "3_3",
    "6_9",
    ">_o",
    "@_@",
    "^_^",
    "o_o",
    "u_u",
    "x_x",
    "|_|",
    "||_||",
]

# Hugging Face Hub repository that Predictor.load_model loads the timm model,
# its weights and the selected_tags.csv label file from.
TAGGER_VIT_MODEL_REPO: str = "SmilingWolf/wd-eva02-large-tagger-v3"

# File-name suffixes that Predictor.list_files_recursive treats as images.
EXTENSIONS: List[str] = ['.png', '.jpg', '.jpeg', ".PNG", ".JPG", ".JPEG"]

# Number of images preprocessed and sent through the model together.
BATCH_SIZE: int = 10 # max size for M1 MBA GPU
# Progress is printed once at least this many more files have been processed.
PROGRESS_INTERVAL: int = 1000

# max_workers of the thread pool that runs Predictor.gen_image_tensor.
WORKER_NUM: int = 8

# Pick the device used for inference. Apple-silicon MPS wins when it is
# available, CUDA comes next, and the CPU is the fallback.
if torch.backends.mps.is_available():
    torch_device = torch.device("mps")
elif torch.cuda.is_available():
    torch_device = torch.device("cuda")
else:
    torch_device = torch.device("cpu")

def mcut_threshold(probs: np.ndarray) -> float:
    """Return the midpoint of the widest gap between neighbouring sorted values.

    The values of *probs* are ordered from largest to smallest, the pair of
    adjacent values with the biggest difference is located, and the mean of
    that pair is returned as the threshold.

    Raises:
        ValueError: raised by ``argmax`` when *probs* holds fewer than two
            values, because there is then no gap to examine.
    """
    order = np.argsort(probs)[::-1]
    descending = probs[order]
    # gaps[i] is the drop from the i-th largest value to the next one.
    gaps = descending[:-1] - descending[1:]
    cut = gaps.argmax()
    upper, lower = descending[cut], descending[cut + 1]
    return (upper + lower) / 2

def print_traceback() -> None:
    """Print the traceback of the exception currently being handled.

    Each formatted frame is printed between a header and a footer line.
    Frames containing a ``~^~`` marker are printed as they are (only trailing
    whitespace is stripped); every other frame is folded onto a single line
    by replacing each newline and the indentation after it with one space.
    """
    frames = traceback.extract_tb(sys.exc_info()[2])
    print('---- traceback ----')
    for entry in traceback.format_list(frames):
        stripped = entry.rstrip()
        if '~^~' not in entry:
            stripped = re.sub(r'\n\s*', ' ', stripped)
        print(stripped)
    print('-------------------')


class Predictor:
    """Tag every image under a directory with the model in TAGGER_VIT_MODEL_REPO.

    ``process_directory`` is the entry point: it collects the image files,
    preprocesses them on a thread pool, runs them through the model in batches
    of ``BATCH_SIZE`` and appends one ``path,tag,tag,...`` line per image to
    ``tags-wd-tagger.txt`` in the current working directory.
    """

    def __init__(self) -> None:
        self.last_loaded_repo: Optional[str] = None
        self.tagger_model: Optional[nn.Module] = None
        self.tag_names: Optional[List[str]] = None
        self.rating_index: Optional[List[int]] = None
        self.general_index: Optional[List[int]] = None
        self.character_index: Optional[List[int]] = None
        self.transform: Optional[Callable] = None
        # Text handle of the output file; only set while process_directory runs.
        self.f: Optional[Any] = None

    def list_files_recursive(self, dir_path: str) -> List[str]:
        """Return the paths of all image files below *dir_path*.

        A file counts as an image when its name ends with one of EXTENSIONS.
        The comparison ignores case so that spellings such as ``.Png`` are
        picked up as well.
        """
        suffixes: Tuple[str, ...] = tuple(ext.lower() for ext in EXTENSIONS)
        file_list: List[str] = []
        for root, _, files in os.walk(dir_path):
            for file in files:
                if file.lower().endswith(suffixes):
                    file_list.append(os.path.join(root, file))
        return file_list

    def prepare_image(self, image: Image.Image) -> Image.Image:
        """Return *image* as an RGB image centred on a white square canvas.

        Images with an alpha channel (modes ``RGBA`` and ``LA``) are first
        composited onto a white background; all other modes are converted to
        RGB. The result is padded with white to a square whose side equals
        the longer edge of the image.
        """
        if image.mode in ('RGBA', 'LA'):
            flattened: Image.Image = Image.new("RGB", image.size, (255, 255, 255))
            # The last band is the alpha channel; use it as the paste mask.
            flattened.paste(image, mask=image.split()[-1])
        else:
            # copy image to avoid error at convert method call
            flattened = image.copy().convert("RGB")

        width, height = flattened.size
        side: int = max(width, height)
        padded_image: Image.Image = Image.new("RGB", (side, side), (255, 255, 255))
        padded_image.paste(flattened, ((side - width) // 2, (side - height) // 2))
        return padded_image

    def load_labels_hf(
            self,
            repo_id: str,
            revision: Optional[str] = None,
            token: Optional[str] = None,
    ) -> None:
        """Download ``selected_tags.csv`` from *repo_id* and index its tags.

        Fills ``tag_names`` with the ``name`` column and stores the row
        positions of categories 9, 0 and 4 in ``rating_index``,
        ``general_index`` and ``character_index`` respectively.

        Raises:
            FileNotFoundError: if the Hub download fails with an HTTP error.
        """
        try:
            csv_path = hf_hub_download(
                repo_id=repo_id, filename="selected_tags.csv", revision=revision, token=token
            )
            csv_path = Path(csv_path).resolve()
        except HfHubHTTPError as e:
            raise FileNotFoundError(f"selected_tags.csv failed to download from {repo_id}") from e

        df: pd.DataFrame = pd.read_csv(csv_path, usecols=["name", "category"])
        self.rating_index = list(np.where(df["category"] == 9)[0])
        self.general_index = list(np.where(df["category"] == 0)[0])
        self.character_index = list(np.where(df["category"] == 4)[0])
        self.tag_names = df["name"].tolist()

    def load_model(self) -> None:
        """Load the model, its tag list and its input transform (once).

        Does nothing when a model is already loaded.
        """
        if self.tagger_model is not None:
            return

        self.tagger_model = timm.create_model("hf-hub:" + TAGGER_VIT_MODEL_REPO).eval()
        state_dict = timm.models.load_state_dict_from_hf(TAGGER_VIT_MODEL_REPO)
        self.tagger_model.load_state_dict(state_dict)

        print("Loading tag list...")
        self.load_labels_hf(repo_id=TAGGER_VIT_MODEL_REPO)

        print("Creating data transform...")
        self.transform = create_transform(**resolve_data_config(self.tagger_model.pretrained_cfg, model=self.tagger_model))

    def predict(
            self,
            tensors: List[Tensor],
            general_thresh: float,
            general_mcut_enabled: bool,
            character_thresh: float,
            character_mcut_enabled: bool,
    ) -> List[str]:
        """Run the model on *tensors* and return one comma-joined tag string each.

        Args:
            tensors: preprocessed image tensors, all of the same shape.
            general_thresh: minimum probability (exclusive) for general tags.
            general_mcut_enabled: derive the general threshold per image with
                ``mcut_threshold`` instead of using *general_thresh*.
            character_thresh: minimum probability (exclusive) for character tags.
            character_mcut_enabled: derive the character threshold per image
                with ``mcut_threshold`` (never below 0.15).

        Returns:
            For each input tensor, the selected general tags followed by the
            selected character tags, each group ordered by descending
            probability and joined with ``,``.
        """
        batched_tensor = torch.stack(tensors, dim=0)

        print("Running inference...")
        with torch.inference_mode():
            # move model to GPU, if available
            model = self.tagger_model
            if torch_device.type != "cpu":
                model = model.to(torch_device)
                batched_tensor = batched_tensor.to(torch_device)
            # apply the final activation function (timm doesn't support doing this internally)
            outputs = torch.sigmoid(model(batched_tensor))
            # bring the probabilities back to the CPU so they can become numpy arrays
            if torch_device.type != "cpu":
                outputs = outputs.to("cpu")

        print("Processing results...")
        preds = outputs.numpy()

        return [
            self._format_prediction(
                probs,
                general_thresh,
                general_mcut_enabled,
                character_thresh,
                character_mcut_enabled,
            )
            for probs in preds
        ]

    def _format_prediction(
            self,
            probs: np.ndarray,
            general_thresh: float,
            general_mcut_enabled: bool,
            character_thresh: float,
            character_mcut_enabled: bool,
    ) -> str:
        """Turn one image's per-tag probabilities into a comma-joined tag string."""
        # Work in float64 so threshold comparisons do not depend on the model dtype.
        probs = np.asarray(probs, dtype=np.float64)

        # MCut needs at least two candidates; otherwise keep the fixed threshold.
        if general_mcut_enabled and len(self.general_index) > 1:
            general_thresh = mcut_threshold(probs[self.general_index])
        if character_mcut_enabled and len(self.character_index) > 1:
            character_thresh = max(0.15, mcut_threshold(probs[self.character_index]))

        tags: List[str] = self._select_tags(probs, self.general_index, general_thresh)
        tags += self._select_tags(probs, self.character_index, character_thresh)
        # Joining the combined list avoids a leading comma (an empty tag) when
        # no general tag passes but a character tag does.
        return ",".join(tags)

    def _select_tags(self, probs: np.ndarray, indices: List[int], threshold: float) -> List[str]:
        """Return the names at *indices* whose probability exceeds *threshold*, most probable first."""
        kept: Dict[str, float] = {
            self.tag_names[i]: probs[i] for i in indices if probs[i] > threshold
        }
        # sorted() is stable, so equally probable tags keep their label-file order.
        ranked: List[str] = sorted(kept, key=kept.get, reverse=True)
        return [name.replace(' ', '_') for name in ranked]

    def write_to_file(self, csv_line: str) -> None:
        """Append *csv_line* plus a newline to the open output file.

        Raises:
            RuntimeError: if called while no output file is open.
        """
        if self.f is None:
            raise RuntimeError('output file is not open; call process_directory first')
        self.f.write(csv_line + '\n')

    def gen_image_tensor(self, file_path: str) -> Tensor | None:
        """Load *file_path* and convert it into a model input tensor.

        Returns ``None`` (after printing the error) when the file cannot be
        read or transformed, so that one bad image does not stop the run.
        """
        try:
            # The context manager releases the file handle on success and on failure.
            with Image.open(file_path) as img:
                img.load()
                prepared: Image.Image = self.prepare_image(img)
            # run the model's input transform to convert to tensor and rescale
            tensor: Tensor = self.transform(prepared)
            # reorder the leading (channel) axis from RGB to BGR
            return tensor[[2, 1, 0]]
        except Exception as e:
            # Deliberately broad: this runs on worker threads and must never raise.
            print(f'{type(e)}: {e}')
            return None

    def filter_files_by_date(self, file_list: List[str], added_date: datetime.date) -> List[str]:
        """Return the files whose ``st_ctime`` date is on or after *added_date*."""
        return [
            file_path
            for file_path in file_list
            if datetime.date.fromtimestamp(os.stat(file_path).st_ctime) >= added_date
        ]

    def process_directory(self, dir_path: str, added_date: datetime.date | None = None) -> None:
        """Tag the images under *dir_path* and append the results to ``tags-wd-tagger.txt``.

        Args:
            dir_path: directory that is searched recursively for images.
            added_date: when given, only files whose ``st_ctime`` date is on or
                after this date are tagged, and the existing
                ``tags-wd-tagger.txt`` is first copied to
                ``tags-wd-tagger.txt.bak``. The process exits with status 1 if
                that file does not exist.
        """
        file_list: List[str] = self.list_files_recursive(dir_path)
        print(f'{len(file_list)} files found')

        # tag new images after specified date
        if added_date is not None:
            file_list = self.filter_files_by_date(file_list, added_date)
            print(f'{len(file_list)} files found after {added_date}')
            if not os.path.exists('tags-wd-tagger.txt'):
                print('tags-wd-tagger.txt not found')
                sys.exit(1)
            # backup tags-wd-tagger.txt with copying to tags-wd-tagger.txt.bak
            with open('tags-wd-tagger.txt', 'r', encoding='utf-8') as src:
                with open('tags-wd-tagger.txt.bak', 'w', encoding='utf-8') as bak:
                    bak.write(src.read())

        # The with-block guarantees the results are flushed and the handle closed.
        with open('tags-wd-tagger.txt', 'a', encoding='utf-8') as out_file:
            self.f = out_file
            try:
                self.load_model()
                self._tag_files(file_list)
            finally:
                self.f = None

    def _tag_files(self, file_list: List[str]) -> None:
        """Preprocess, tag and write out *file_list* in batches of BATCH_SIZE.

        While the model runs on one batch, the thread pool is already
        preprocessing the next one. Every batch, including a short final one,
        is sent through the model.
        """
        start: float = time.perf_counter()
        cnt: int = 0
        last_cnt: int = 0
        with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_NUM) as executor:
            pending = self._submit_batch(executor, file_list[:BATCH_SIZE])
            next_start: int = BATCH_SIZE
            while pending:
                tensors, fpathes = self._collect_batch(pending)

                # Queue the next batch before inference so both overlap.
                pending = self._submit_batch(executor, file_list[next_start:next_start + BATCH_SIZE])
                next_start += BATCH_SIZE

                if not tensors:
                    # every image of this batch failed to load
                    continue

                try:
                    results_in_csv_format: List[str] = self.predict(tensors, 0.3, True, 0.3, True)
                    for path, line in zip(fpathes, results_in_csv_format):
                        self.write_to_file(path + ',' + line)
                except Exception as e:
                    # Best effort: report the failed batch and carry on with the next.
                    print(f'{type(e)}: {e}')
                    print_traceback()
                    continue

                cnt += len(tensors)
                if cnt - last_cnt >= PROGRESS_INTERVAL:
                    diff: float = time.perf_counter() - start
                    print(f'{cnt} files processed')
                    print('{:.2f} seconds elapsed'.format(diff))
                    print('{:.4f} seconds per file'.format(diff / cnt))
                    print("", flush=True)
                    last_cnt = cnt

    def _submit_batch(
            self,
            executor: concurrent.futures.Executor,
            paths: List[str],
    ) -> List[Tuple[str, concurrent.futures.Future]]:
        """Schedule gen_image_tensor for each path and return ``(path, future)`` pairs."""
        return [(path, executor.submit(self.gen_image_tensor, path)) for path in paths]

    def _collect_batch(
            self,
            pending: List[Tuple[str, concurrent.futures.Future]],
    ) -> Tuple[List[Tensor], List[str]]:
        """Wait for *pending* and return the tensors that loaded, with their paths."""
        tensors: List[Tensor] = []
        fpathes: List[str] = []
        for path, future in pending:
            tensor = future.result()
            # gen_image_tensor reports its own errors and returns None for them.
            if tensor is not None:
                tensors.append(tensor)
                fpathes.append(path)
        return tensors, fpathes

def main(arg_str: list[str]) -> None:
    """Parse the command line in *arg_str* and tag the requested directory.

    Options:
        --dir DIR      directory whose images are tagged (required).
        --after DATE   only tag files dated on or after DATE (YYYY-MM-DD).

    Exits with status 1 when the ``--after`` value is not a valid date.
    """
    parser: argparse.ArgumentParser = argparse.ArgumentParser()
    parser.add_argument('--dir', nargs=1, required=True, help='tagging target directory path')
    # Note: when specified --after, create tags-wd-tagger.txt.bak file and update tags-wd-tagger.txt
    parser.add_argument('--after', nargs=1,
                        help='tagging new images on or after this date (ctime attribute). Format: YYYY-MM-DD')
    args: argparse.Namespace = parser.parse_args(arg_str)

    # Validate the date before building the predictor so bad input fails fast.
    after_date: datetime.date | None = None
    if args.after is not None:
        try:
            after_date = datetime.datetime.strptime(args.after[0], '%Y-%m-%d').date()
        except ValueError as e:
            print(f'{type(e)}: {e}')
            print('Invalid date format. format is YYYY-MM-DD')
            sys.exit(1)

    predictor: Predictor = Predictor()
    predictor.process_directory(args.dir[0], after_date)

# Script entry point: pass the command-line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])