-
Notifications
You must be signed in to change notification settings - Fork 2
/
config.py
executable file
·213 lines (178 loc) · 8.91 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# -*- coding: utf-8 -*-
# Description : gpu, model, Parallelism, data, train and inference config definition
import json
import math
from dataclasses import dataclass
from enum import Enum
from functools import total_ordering
from typing import Optional

from constants import *
class ActivationRecomputation(Enum):
    """Strategy for recomputing activations during the backward pass.

    Higher values trade extra compute for lower activation memory.
    """

    # No recomputation at all: every activation is kept, so this needs
    # the most memory.
    NONE = 0

    # Recompute only the parts of each transformer layer that are
    # memory-heavy but cheap to redo: the Q/K/V matrix multiplies,
    # the QK^T matrix multiply, softmax, softmax dropout, and the
    # attention over V.
    SELECTIVE = 1

    # Keep only each transformer layer's input (sharded across the
    # tensor-parallel group, which would cost an extra all-gather per
    # layer — ignored for now) and rerun the whole forward pass;
    # least memory, but requires an extra forward pass.
    FULL = 2
@total_ordering
class DSZeRO(Enum):
    """DeepSpeed ZeRO optimization stage.

    Stages are totally ordered by value: a higher stage shards more
    training state across the data-parallel group and therefore needs
    less memory per GPU.
    """

    # No DeepSpeed ZeRO; requires the most memory.
    NONE = 0
    # Stage 1 shards the optimizer states across the data-parallel group.
    STAGE_1 = 1
    # Stage 2 shards the optimizer states and gradients across the
    # data-parallel group.
    STAGE_2 = 2
    # Stage 3 shards the optimizer states, gradients, and model weights
    # across the data-parallel group.
    STAGE_3 = 3

    def __lt__(self, other):
        # @total_ordering derives <=, >, >= from __lt__ and __eq__.
        if other.__class__ is self.__class__:
            return self.value < other.value
        return NotImplemented

    def __eq__(self, other):
        if isinstance(other, DSZeRO):
            return self.value == other.value
        return NotImplemented

    # Defining __eq__ implicitly sets __hash__ to None, which would make
    # members unhashable (unusable as dict keys or set members).
    # Restore Enum's identity-based hash explicitly.
    __hash__ = Enum.__hash__
@dataclass
class GPUEfficiencyConfig:
    """Efficiency factors applied to a GPU's peak hardware numbers.

    Each factor scales the corresponding peak figure (FLOPS or
    bandwidth) down to what is realistically achievable; 1.0 means
    "assume peak performance".
    """
    flops_efficiency: float = 1.0  # achievable fraction of peak TFLOPS
    hbm_memory_efficiency: float = 1.0  # achievable fraction of peak HBM bandwidth
    intra_node_memory_efficiency: float = 1.0  # achievable fraction of intra-node bandwidth
    inter_node_memory_efficiency: float = 1.0  # achievable fraction of inter-node bandwidth
@dataclass
class InferenceConfig:
    """Inference configuration.

    When `context_len` is not given, it defaults to the full decoded
    length `seq_len + generate_len` (see `__post_init__`).
    """
    batch_size_per_gpu: Optional[int] = None  # batch size per GPU
    seq_len: int = 522  # input (prompt) sequence length in tokens
    generate_len: int = 1526  # number of tokens to generate
    context_len: Optional[int] = None  # total context length; derived when None
    use_kv_cache: bool = True  # whether to use a key/value cache
    bytes_per_param: int = BYTES_FP16  # bytes per model weight element
    layernorm_dtype_bytes: int = BYTES_FP16  # bytes per element for layernorm data
    kv_cache_dtype_bytes: int = BYTES_FP16  # bytes per element in the key/value cache

    def __post_init__(self):
        # Default the context window to prompt length + generated tokens.
        if self.context_len is None:
            self.context_len = self.seq_len + self.generate_len
@dataclass
class ParallelismConfig:
    """Parallelism degrees along each model/data sharding dimension."""
    tp_size: int = 1  # tensor parallelism size (Megatron-LM tensor parallelism implementation)
    pp_size: int = 1  # pipeline parallelism size (Megatron-LM pipeline parallelism implementation)
    dp_size: int = 1  # data parallelism size (DeepSpeed ZeRO parallelism implementation)
    sp_size: int = 1  # sequence parallelism size (Megatron-LM sequence parallelism implementation)
@dataclass
class ModelConfig:
    """Transformer architecture hyper-parameters.

    `num_key_value_heads` defaults to `n_head` (standard multi-head
    attention) and `ffn_embed_dim` defaults to `4 * hidden_dim` when
    not provided (see `__post_init__`).
    """
    num_layers: int  # number of transformer layers (blocks)
    n_head: int  # number of attention heads
    hidden_dim: int  # hidden dimension
    vocab_size: int  # vocabulary size
    num_key_value_heads: Optional[int] = None  # KV heads (GQA/MQA); defaults to n_head
    max_seq_len: Optional[int] = None  # max sequence length
    ffn_embed_dim: Optional[int] = None  # FFN hidden dimension; defaults to 4 * hidden_dim
    model_type: Optional[str] = None  # model type as tagged on Hugging Face (e.g., gpt2, opt, llama.)
    model_name: Optional[str] = None  # model name as tagged on Hugging Face (e.g., gpt2-xl, opt, llama-13b.)

    def __post_init__(self):
        # Fall back to vanilla multi-head attention when no KV-head
        # count is given.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.n_head
        # Conventional 4x FFN expansion when not specified.
        if self.ffn_embed_dim is None:
            self.ffn_embed_dim = self.hidden_dim * 4
@dataclass
class GPUConfig:
    """GPU hardware characteristics.

    NOTE: `__post_init__` derates the peak TFLOPS figures by
    FLOPS_EFFICIENCY and the HBM / intra-node bandwidths by
    HBM_MEMORY_EFFICIENCY *in place*, so a constructed instance holds
    effective rather than raw peak values.
    """
    # 1. GPU model and memory capacity
    name: str  # GPU config name
    memory_GPU_in_GB: float  # memory per GPU in GB
    # 2. HBM bandwidth, intra-node bandwidth, inter-node bandwidth
    hbm_bandwidth_in_GB_per_sec: float  # GPU HBM bandwidth in GB/s
    intra_node_bandwidth_in_GB_per_sec: float  # intra node GPU bandwidth in GB/s (PCIe/NVLink)
    intra_node_min_message_latency: float  # minimum intra node message latency in seconds
    inter_node_bandwidth_in_GB_per_sec: float = 200  # inter node bandwidth in GB/s, assuming Mellanox 200Gbps HDR Infiniband
    # 3. Peak Tensor-core compute performance at different precisions
    peak_fp32_TFLOPS: Optional[float] = None  # derived as ceil(fp16 / 2) when None
    peak_fp16_TFLOPS: Optional[float] = None  # must be supplied if any other peak is None
    peak_int8_TFLOPS: Optional[float] = None  # derived as 2 * fp16 when None
    peak_int4_TFLOPS: Optional[float] = None  # derived as 4 * fp16 when None

    # Class-level derating constants (no annotation, so NOT dataclass fields).
    FLOPS_EFFICIENCY = 0.7
    HBM_MEMORY_EFFICIENCY = 0.9

    def __post_init__(self):
        """Derive missing peak numbers from fp16 and apply efficiency derating.

        NOTE(review): assumes `peak_fp16_TFLOPS` is provided whenever a
        derived peak is needed; if it is None this raises a TypeError.
        """
        if self.peak_fp32_TFLOPS is None:
            self.peak_fp32_TFLOPS = math.ceil(self.peak_fp16_TFLOPS / 2)
        if self.peak_int8_TFLOPS is None:
            self.peak_int8_TFLOPS = 2 * self.peak_fp16_TFLOPS
        if self.peak_int4_TFLOPS is None:
            self.peak_int4_TFLOPS = 4 * self.peak_fp16_TFLOPS
        # Scale peaks down to realistically achievable values.
        if self.FLOPS_EFFICIENCY:
            self.peak_fp32_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_fp16_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int8_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int4_TFLOPS *= self.FLOPS_EFFICIENCY
        if self.HBM_MEMORY_EFFICIENCY:
            self.hbm_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
            self.intra_node_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
class LLMConfigs(object):
    """Bundle of every configuration object needed for LLM analysis:
    GPU, model, parallelism, inference, and GPU-efficiency settings.
    """

    def __init__(self, gpu_config: GPUConfig,
                 model_config: ModelConfig,
                 parallelism_config: Optional[ParallelismConfig] = None,
                 inference_config: Optional[InferenceConfig] = None,
                 gpu_efficiency_config: Optional[GPUEfficiencyConfig] = None
                 ) -> None:
        # The optional configs previously defaulted to module-level
        # dataclass instances (e.g. `= ParallelismConfig()`), which are
        # mutable and would be shared by every LLMConfigs instance —
        # the classic mutable-default-argument pitfall. Build a fresh
        # default per call instead.
        if parallelism_config is None:
            parallelism_config = ParallelismConfig()
        if inference_config is None:
            inference_config = InferenceConfig()
        if gpu_efficiency_config is None:
            gpu_efficiency_config = GPUEfficiencyConfig()

        self.model_config = model_config
        self.gpu_config = gpu_config
        self.parallelism_config = parallelism_config
        self.inference_config = inference_config  # user-specified configuration
        self.gpu_efficiency_config = gpu_efficiency_config  # user-specified configuration
def get_model_and_gpu_config_by_name(model_name="llama-13b", gpu_name="v100-pcie-32gb") -> tuple:
    """Read model and GPU configs from the bundled JSON files.

    Args:
        model_name: key into configs/model_configs.json.
        gpu_name: key into configs/gpu_configs.json.

    Returns:
        tuple: (ModelConfig, GPUConfig) built from the matching entries.
        (The original annotation said `dict`, but a tuple is returned.)

    Raises:
        ValueError: if a requested name is missing from its config file.
            (Was an `assert`, which is stripped under `python -O`.)
    """
    config_files = ["configs/model_configs.json", "configs/gpu_configs.json"]
    model_config, gpu_config = {}, {}
    for config_filename in config_files:
        with open(config_filename, "r") as f:
            config_json = json.load(f)
        if "model" in config_filename:
            if model_name not in config_json:
                raise ValueError(f"model name {model_name} not found in {config_filename}")
            model_config = ModelConfig(**config_json[model_name])
        elif "gpu" in config_filename:
            if gpu_name not in config_json:
                raise ValueError(f"gpu name {gpu_name} not found in {config_filename}")
            gpu_config = GPUConfig(**config_json[gpu_name])
        else:
            # Unreachable with the hard-coded file list above; kept as a
            # guard in case the list is ever extended.
            raise ValueError(f"unknown config file type: {config_filename}")
    return model_config, gpu_config
def get_TFLOPS_per_gpu(gpu_config: GPUConfig, data_type="fp16", flops_efficiency=1.0) -> float:
    """Get the expected TFLOPS per GPU for the specified data type,
    adjusted by `flops_efficiency`.

    Args:
        gpu_config: GPU hardware description.
        data_type: "fp16" or "int8".
        flops_efficiency: achievable fraction of peak FLOPS.

    Returns:
        float: TFLOPS per GPU and unit is T.

    Raises:
        ValueError: if `data_type` is neither "fp16" nor "int8".
            (The original only printed a message and then crashed with
            an UnboundLocalError on the misspelled `gemm_TFOPS`.)
    """
    if data_type == "int8":
        gemm_TFLOPS = gpu_config.peak_int8_TFLOPS
    elif data_type == "fp16":
        gemm_TFLOPS = gpu_config.peak_fp16_TFLOPS
    else:
        raise ValueError("weight_bits and activation_bits must be 8, or 16!")
    return gemm_TFLOPS * flops_efficiency
def get_gpu_hbm_bandwidth(gpu_config: GPUConfig, hbm_memory_efficiency=1.0) -> float:
    """Effective GPU HBM bandwidth in GB/s, derated by `hbm_memory_efficiency`."""
    return gpu_config.hbm_bandwidth_in_GB_per_sec * hbm_memory_efficiency
def get_intra_node_bandwidth(gpu_config: GPUConfig, intra_node_memory_efficiency=1.0) -> float:
    """Effective intra-node (PCIe/NVLink) bandwidth in GB/s, derated by
    `intra_node_memory_efficiency`."""
    return gpu_config.intra_node_bandwidth_in_GB_per_sec * intra_node_memory_efficiency
def get_inter_node_bandwidth(gpu_config: GPUConfig, inter_node_memory_efficiency=1.0) -> float:
    """Effective inter-node bandwidth in GB/s, derated by
    `inter_node_memory_efficiency`."""
    return gpu_config.inter_node_bandwidth_in_GB_per_sec * inter_node_memory_efficiency