# t5_encoder.py

from pathlib import Path

import comfy.clip_model
import comfy.latent_formats
import comfy.model_base
import comfy.model_management
import comfy.model_patcher
import comfy.sd
import comfy.sd1_clip
import comfy.supported_models_base
import comfy.utils
import folder_paths
import torch
from transformers import T5EncoderModel, T5Tokenizer

from .nodes_registry import comfy_node


class LTXVTokenizer(comfy.sd1_clip.SDTokenizer):
    """T5 tokenizer wrapper producing per-token entries keyed by ``"t5xxl"``.

    The base-class initializer is not called; this class only sets up the
    attributes it uses itself (``tokenizer`` and ``max_length``).
    """

    def __init__(self, tokenizer_path: str, max_length: int = 128):
        """Load the pretrained T5 tokenizer from local files.

        Args:
            tokenizer_path: Location of the pretrained tokenizer. It is loaded
                with ``local_files_only=True``.
            max_length: Number of tokens every prompt is padded or truncated
                to. Defaults to 128, the value that used to be hard-coded.
        """
        self.tokenizer = T5Tokenizer.from_pretrained(
            tokenizer_path, local_files_only=True
        )
        self.max_length = max_length

    def tokenize_with_weights(self, text: str, return_word_ids: bool = False):
        """Tokenize a prompt into ``{"t5xxl": [...]}`` entries.

        The prompt is lower-cased and stripped, then padded/truncated to
        ``self.max_length`` tokens. Each entry pairs a token id with the
        attention-mask value at the same position; the mask value occupies
        the "weight" slot of the entry. Both values are elements of the
        tensors returned by the tokenizer (``return_tensors="pt"``).

        Args:
            text: Prompt to tokenize.
            return_word_ids: When True, each entry is
                ``(token, mask, position)`` where ``position`` is the index
                of the token in the sequence; otherwise ``(token, mask)``.

        Returns:
            A dict with the single key ``"t5xxl"`` mapping to a list holding
            one entry per token position.
        """
        encoded = self.tokenizer(
            text.lower().strip(),
            padding="max_length",  # do_not_pad, longest, max_length
            max_length=self.max_length,
            truncation=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        # A single prompt was encoded, so only the first row is used.
        token_ids = encoded.input_ids[0]
        mask_values = encoded.attention_mask[0]

        if return_word_ids:
            entries = [
                (token, mask, position)
                for position, (token, mask) in enumerate(zip(token_ids, mask_values))
            ]
        else:
            entries = list(zip(token_ids, mask_values))

        return {"t5xxl": entries}


class LTXVTextEncoderModel(torch.nn.Module):
    """Wraps a pretrained ``T5EncoderModel`` stored in ``self.t5xxl``."""

    def __init__(
        self, encoder_path, dtype_t5=None, device="cpu", dtype=None, model_options=None
    ):
        """Load the T5 encoder from local files and move it to ``device``.

        Args:
            encoder_path: Location of the pretrained encoder. It is loaded
                with ``local_files_only=True``.
            dtype_t5: Preferred weight dtype; the dtype actually used is the
                result of ``comfy.model_management.pick_weight_dtype``.
            device: Device the encoder is moved to after loading.
            dtype: Second dtype argument handed to ``pick_weight_dtype``.
            model_options: Accepted for interface compatibility; not used.
                Defaults to ``None`` rather than a shared mutable dict.
        """
        super().__init__()
        dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
        self.t5xxl = (
            T5EncoderModel.from_pretrained(encoder_path, local_files_only=True)
            .to(dtype_t5)
            .to(device)
        )
        self.dtypes = {dtype, dtype_t5}

    def set_clip_options(self, options):
        """Accept clip options; this encoder ignores them."""
        pass

    def reset_clip_options(self):
        """No-op counterpart of ``set_clip_options``."""
        pass

    def encode_token_weights(self, token_weight_pairs):
        """Encode tokenized input with the T5 encoder.

        Args:
            token_weight_pairs: Dict whose ``"t5xxl"`` entry is a sequence of
                items where index 0 is the token id and index 1 is the
                attention-mask value for that position.

        Returns:
            A tuple ``(embeddings, None, {"attention_mask": mask})``.
            ``embeddings`` is the first output of the encoder multiplied by
            the mask, so masked positions are zeroed.
        """
        pairs = token_weight_pairs["t5xxl"]
        device = self.t5xxl.device
        # Wrap in an outer list to add the batch dimension (batch size 1).
        input_ids = torch.tensor([[pair[0] for pair in pairs]], device=device)
        attention_mask = torch.tensor([[pair[1] for pair in pairs]], device=device)
        self.to(device)  # comfyui skips loading some weights to gpu
        embeddings = self.t5xxl(input_ids, attention_mask=attention_mask)[0]
        embeddings = embeddings * attention_mask.unsqueeze(2)
        return embeddings, None, {"attention_mask": attention_mask}

    def load_sd(self, sd):
        """Load ``sd`` into the T5 encoder non-strictly and return the result."""
        return self.t5xxl.load_state_dict(sd, strict=False)


def ltxv_clip(encoder_path, dtype_t5=None):
    """Return an ``LTXVTextEncoderModel`` subclass bound to ``encoder_path``.

    The returned class can be constructed with only ``device``, ``dtype`` and
    ``model_options``; ``encoder_path`` and ``dtype_t5`` are captured from
    this call.

    Args:
        encoder_path: Location of the pretrained encoder, forwarded to
            ``LTXVTextEncoderModel``.
        dtype_t5: Preferred T5 weight dtype, forwarded unchanged.

    Returns:
        The generated subclass (not an instance).
    """

    class LTXVTextEncoderModel_(LTXVTextEncoderModel):
        def __init__(self, device="cpu", dtype=None, model_options=None):
            super().__init__(
                encoder_path=encoder_path,
                dtype_t5=dtype_t5,
                device=device,
                dtype=dtype,
                # A fresh dict per call instead of a shared mutable default.
                model_options={} if model_options is None else model_options,
            )

    return LTXVTextEncoderModel_


def ltxv_tokenizer(tokenizer_path):
    """Return an ``LTXVTokenizer`` subclass bound to ``tokenizer_path``.

    The returned class accepts ``embedding_directory`` and ``tokenizer_data``
    but ignores both; only ``tokenizer_path`` captured from this call is
    passed on to ``LTXVTokenizer``.

    Args:
        tokenizer_path: Location of the pretrained tokenizer.

    Returns:
        The generated subclass (not an instance).
    """

    class LTXVTokenizer_(LTXVTokenizer):
        # ``tokenizer_data`` defaults to None rather than a shared mutable dict.
        def __init__(self, embedding_directory=None, tokenizer_data=None):
            super().__init__(tokenizer_path)

    return LTXVTokenizer_


@comfy_node(name="LTXVCLIPModelLoader", description="LTXV CLIP Model Loader")
class LTXVCLIPModelLoader:
    """Node that builds a ``comfy.sd.CLIP`` from a local T5 tokenizer/encoder."""

    @classmethod
    def INPUT_TYPES(cls):
        """Describe the single required input, ``clip_path``."""
        # Choices are the files listed under the "text_encoders" folder key.
        encoder_files = folder_paths.get_filename_list("text_encoders")
        widget_options = {"tooltip": "The name of the text encoder model to load."}
        return {"required": {"clip_path": (encoder_files, widget_options)}}

    RETURN_TYPES = ("CLIP",)
    RETURN_NAMES = ("clip",)
    FUNCTION = "load_model"
    CATEGORY = "lightricks/LTXV"
    TITLE = "LTXV Model Loader"
    OUTPUT_NODE = False

    def load_model(self, clip_path):
        """Create the CLIP object for the selected text encoder file.

        The ``tokenizer`` and ``text_encoder`` directories are looked up in
        the directory two levels above the selected file.

        Args:
            clip_path: File name chosen from the "text_encoders" list.

        Returns:
            A one-element tuple holding the ``comfy.sd.CLIP`` instance.
        """
        selected_file = Path(folder_paths.get_full_path("text_encoders", clip_path))
        model_root = selected_file.parents[1]
        tokenizer_dir = model_root / "tokenizer"
        encoder_dir = model_root / "text_encoder"

        tokenizer_cls = ltxv_tokenizer(tokenizer_dir)
        encoder_cls = ltxv_clip(encoder_dir, dtype_t5=torch.bfloat16)
        target = comfy.supported_models_base.ClipTarget(
            tokenizer=tokenizer_cls,
            clip=encoder_cls,
        )

        clip = comfy.sd.CLIP(target)
        return (clip,)