-
Notifications
You must be signed in to change notification settings - Fork 2
/
config.py
executable file
·213 lines (178 loc) · 8.91 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# -*- coding: utf-8 -*-
# Description : gpu, model, Parallelism, data, train and inference config definition
import json
import math
from dataclasses import dataclass
from enum import Enum
from functools import total_ordering
from typing import Optional

from constants import *
class ActivationRecomputation(Enum):
    """Strategy for recomputing activations during the backward pass.

    Higher values trade extra compute for lower activation memory.
    """

    # No recomputation at all: every activation is kept, so this needs
    # the most memory.
    NONE = 0

    # Recompute only the parts of each transformer layer that are
    # memory-heavy but cheap to redo: the Q/K/V matrix multiplies,
    # the QK^T matrix multiply, softmax, softmax dropout, and the
    # attention over V.
    SELECTIVE = 1

    # Keep only each transformer layer's input (sharded across the
    # tensor-parallel group, which would cost an extra all-gather per
    # layer — ignored for now) and rerun the whole forward pass;
    # least memory, but requires an extra forward pass.
    FULL = 2
@total_ordering
class DSZeRO(Enum):
    """DeepSpeed ZeRO optimization stage.

    Stages are totally ordered by value: a higher stage shards more
    training state across the data-parallel group and therefore needs
    less memory per GPU.
    """

    # No DeepSpeed ZeRO; requires the most memory.
    NONE = 0
    # Stage 1 shards the optimizer states across the data-parallel group.
    STAGE_1 = 1
    # Stage 2 shards the optimizer states and gradients across the
    # data-parallel group.
    STAGE_2 = 2
    # Stage 3 shards the optimizer states, gradients, and model weights
    # across the data-parallel group.
    STAGE_3 = 3

    def __lt__(self, other):
        # @total_ordering derives <=, >, >= from __lt__ and __eq__.
        if other.__class__ is self.__class__:
            return self.value < other.value
        return NotImplemented

    def __eq__(self, other):
        if isinstance(other, DSZeRO):
            return self.value == other.value
        return NotImplemented

    # Defining __eq__ implicitly sets __hash__ to None, which would make
    # members unhashable (unusable as dict keys or set members).
    # Restore Enum's identity-based hash explicitly.
    __hash__ = Enum.__hash__
@dataclass
class GPUEfficiencyConfig:
    """Efficiency factors applied to a GPU's peak hardware numbers.

    Each factor scales the corresponding peak figure (FLOPS or
    bandwidth) down to what is realistically achievable; 1.0 means
    "assume peak performance".
    """
    flops_efficiency: float = 1.0  # achievable fraction of peak TFLOPS
    hbm_memory_efficiency: float = 1.0  # achievable fraction of peak HBM bandwidth
    intra_node_memory_efficiency: float = 1.0  # achievable fraction of intra-node bandwidth
    inter_node_memory_efficiency: float = 1.0  # achievable fraction of inter-node bandwidth
@dataclass
class InferenceConfig:
    """Inference configuration.

    When `context_len` is not given, it defaults to the full decoded
    length `seq_len + generate_len` (see `__post_init__`).
    """
    batch_size_per_gpu: Optional[int] = None  # batch size per GPU
    seq_len: int = 522  # input (prompt) sequence length in tokens
    generate_len: int = 1526  # number of tokens to generate
    context_len: Optional[int] = None  # total context length; derived when None
    use_kv_cache: bool = True  # whether to use a key/value cache
    bytes_per_param: int = BYTES_FP16  # bytes per model weight element
    layernorm_dtype_bytes: int = BYTES_FP16  # bytes per element for layernorm data
    kv_cache_dtype_bytes: int = BYTES_FP16  # bytes per element in the key/value cache

    def __post_init__(self):
        # Default the context window to prompt length + generated tokens.
        if self.context_len is None:
            self.context_len = self.seq_len + self.generate_len
@dataclass
class ParallelismConfig:
    """Parallelism degrees along each model/data sharding dimension."""
    tp_size: int = 1  # tensor parallelism size (Megatron-LM tensor parallelism implementation)
    pp_size: int = 1  # pipeline parallelism size (Megatron-LM pipeline parallelism implementation)
    dp_size: int = 1  # data parallelism size (DeepSpeed ZeRO parallelism implementation)
    sp_size: int = 1  # sequence parallelism size (Megatron-LM sequence parallelism implementation)
@dataclass
class ModelConfig:
    """Transformer architecture hyper-parameters.

    `num_key_value_heads` defaults to `n_head` (standard multi-head
    attention) and `ffn_embed_dim` defaults to `4 * hidden_dim` when
    not provided (see `__post_init__`).
    """
    num_layers: int  # number of transformer layers (blocks)
    n_head: int  # number of attention heads
    hidden_dim: int  # hidden dimension
    vocab_size: int  # vocabulary size
    num_key_value_heads: Optional[int] = None  # KV heads (GQA/MQA); defaults to n_head
    max_seq_len: Optional[int] = None  # max sequence length
    ffn_embed_dim: Optional[int] = None  # FFN hidden dimension; defaults to 4 * hidden_dim
    model_type: Optional[str] = None  # model type as tagged on Hugging Face (e.g., gpt2, opt, llama.)
    model_name: Optional[str] = None  # model name as tagged on Hugging Face (e.g., gpt2-xl, opt, llama-13b.)

    def __post_init__(self):
        # Fall back to vanilla multi-head attention when no KV-head
        # count is given.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.n_head
        # Conventional 4x FFN expansion when not specified.
        if self.ffn_embed_dim is None:
            self.ffn_embed_dim = self.hidden_dim * 4
@dataclass
class GPUConfig:
    """GPU hardware characteristics.

    NOTE: `__post_init__` derates the peak TFLOPS figures by
    FLOPS_EFFICIENCY and the HBM / intra-node bandwidths by
    HBM_MEMORY_EFFICIENCY *in place*, so a constructed instance holds
    effective rather than raw peak values.
    """
    # 1. GPU model and memory capacity
    name: str  # GPU config name
    memory_GPU_in_GB: float  # memory per GPU in GB
    # 2. HBM bandwidth, intra-node bandwidth, inter-node bandwidth
    hbm_bandwidth_in_GB_per_sec: float  # GPU HBM bandwidth in GB/s
    intra_node_bandwidth_in_GB_per_sec: float  # intra node GPU bandwidth in GB/s (PCIe/NVLink)
    intra_node_min_message_latency: float  # minimum intra node message latency in seconds
    inter_node_bandwidth_in_GB_per_sec: float = 200  # inter node bandwidth in GB/s, assuming Mellanox 200Gbps HDR Infiniband
    # 3. Peak Tensor-core compute performance at different precisions
    peak_fp32_TFLOPS: Optional[float] = None  # derived as ceil(fp16 / 2) when None
    peak_fp16_TFLOPS: Optional[float] = None  # must be supplied if any other peak is None
    peak_int8_TFLOPS: Optional[float] = None  # derived as 2 * fp16 when None
    peak_int4_TFLOPS: Optional[float] = None  # derived as 4 * fp16 when None

    # Class-level derating constants (no annotation, so NOT dataclass fields).
    FLOPS_EFFICIENCY = 0.7
    HBM_MEMORY_EFFICIENCY = 0.9

    def __post_init__(self):
        """Derive missing peak numbers from fp16 and apply efficiency derating.

        NOTE(review): assumes `peak_fp16_TFLOPS` is provided whenever a
        derived peak is needed; if it is None this raises a TypeError.
        """
        if self.peak_fp32_TFLOPS is None:
            self.peak_fp32_TFLOPS = math.ceil(self.peak_fp16_TFLOPS / 2)
        if self.peak_int8_TFLOPS is None:
            self.peak_int8_TFLOPS = 2 * self.peak_fp16_TFLOPS
        if self.peak_int4_TFLOPS is None:
            self.peak_int4_TFLOPS = 4 * self.peak_fp16_TFLOPS
        # Scale peaks down to realistically achievable values.
        if self.FLOPS_EFFICIENCY:
            self.peak_fp32_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_fp16_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int8_TFLOPS *= self.FLOPS_EFFICIENCY
            self.peak_int4_TFLOPS *= self.FLOPS_EFFICIENCY
        if self.HBM_MEMORY_EFFICIENCY:
            self.hbm_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
            self.intra_node_bandwidth_in_GB_per_sec *= self.HBM_MEMORY_EFFICIENCY
class LLMConfigs(object):
    """Bundle of every configuration object needed for LLM analysis:
    GPU, model, parallelism, inference, and GPU-efficiency settings.
    """

    def __init__(self, gpu_config: GPUConfig,
                 model_config: ModelConfig,
                 parallelism_config: Optional[ParallelismConfig] = None,
                 inference_config: Optional[InferenceConfig] = None,
                 gpu_efficiency_config: Optional[GPUEfficiencyConfig] = None
                 ) -> None:
        # The optional configs previously defaulted to module-level
        # dataclass instances (e.g. `= ParallelismConfig()`), which are
        # mutable and would be shared by every LLMConfigs instance —
        # the classic mutable-default-argument pitfall. Build a fresh
        # default per call instead.
        if parallelism_config is None:
            parallelism_config = ParallelismConfig()
        if inference_config is None:
            inference_config = InferenceConfig()
        if gpu_efficiency_config is None:
            gpu_efficiency_config = GPUEfficiencyConfig()

        self.model_config = model_config
        self.gpu_config = gpu_config
        self.parallelism_config = parallelism_config
        self.inference_config = inference_config  # user-specified configuration
        self.gpu_efficiency_config = gpu_efficiency_config  # user-specified configuration
def get_model_and_gpu_config_by_name(model_name="llama-13b", gpu_name="v100-pcie-32gb") -> tuple:
    """Read model and GPU configs from the bundled JSON files.

    Args:
        model_name: key into configs/model_configs.json.
        gpu_name: key into configs/gpu_configs.json.

    Returns:
        tuple: (ModelConfig, GPUConfig) built from the matching entries.
        (The original annotation said `dict`, but a tuple is returned.)

    Raises:
        ValueError: if a requested name is missing from its config file.
            (Was an `assert`, which is stripped under `python -O`.)
    """
    config_files = ["configs/model_configs.json", "configs/gpu_configs.json"]
    model_config, gpu_config = {}, {}
    for config_filename in config_files:
        with open(config_filename, "r") as f:
            config_json = json.load(f)
        if "model" in config_filename:
            if model_name not in config_json:
                raise ValueError(f"model name {model_name} not found in {config_filename}")
            model_config = ModelConfig(**config_json[model_name])
        elif "gpu" in config_filename:
            if gpu_name not in config_json:
                raise ValueError(f"gpu name {gpu_name} not found in {config_filename}")
            gpu_config = GPUConfig(**config_json[gpu_name])
        else:
            # Unreachable with the hard-coded file list above; kept as a
            # guard in case the list is ever extended.
            raise ValueError(f"unknown config file type: {config_filename}")
    return model_config, gpu_config
def get_TFLOPS_per_gpu(gpu_config: GPUConfig, data_type="fp16", flops_efficiency=1.0) -> float:
    """Get the expected TFLOPS per GPU for the specified data type,
    adjusted by `flops_efficiency`.

    Args:
        gpu_config: GPU hardware description.
        data_type: "fp16" or "int8".
        flops_efficiency: achievable fraction of peak FLOPS.

    Returns:
        float: TFLOPS per GPU and unit is T.

    Raises:
        ValueError: if `data_type` is neither "fp16" nor "int8".
            (The original only printed a message and then crashed with
            an UnboundLocalError on the misspelled `gemm_TFOPS`.)
    """
    if data_type == "int8":
        gemm_TFLOPS = gpu_config.peak_int8_TFLOPS
    elif data_type == "fp16":
        gemm_TFLOPS = gpu_config.peak_fp16_TFLOPS
    else:
        raise ValueError("weight_bits and activation_bits must be 8, or 16!")
    return gemm_TFLOPS * flops_efficiency
def get_gpu_hbm_bandwidth(gpu_config: GPUConfig, hbm_memory_efficiency=1.0) -> float:
    """Effective GPU HBM bandwidth in GB/s, derated by `hbm_memory_efficiency`."""
    return gpu_config.hbm_bandwidth_in_GB_per_sec * hbm_memory_efficiency
def get_intra_node_bandwidth(gpu_config: GPUConfig, intra_node_memory_efficiency=1.0) -> float:
    """Effective intra-node (PCIe/NVLink) bandwidth in GB/s, derated by
    `intra_node_memory_efficiency`."""
    return gpu_config.intra_node_bandwidth_in_GB_per_sec * intra_node_memory_efficiency
def get_inter_node_bandwidth(gpu_config: GPUConfig, inter_node_memory_efficiency=1.0) -> float:
    """Effective inter-node bandwidth in GB/s, derated by
    `inter_node_memory_efficiency`."""
    return gpu_config.inter_node_bandwidth_in_GB_per_sec * inter_node_memory_efficiency