|
@@ -14,11 +14,12 @@ class ModelTrainer:
|
|
|
初始化 ModelTrainer 类,加载配置参数。
|
|
|
:param config: 配置对象,包含模型训练所需的参数
|
|
|
"""
|
|
|
- self.config = config
|
|
|
+ self.config:Config = config
|
|
|
self.model_name = config.model_name
|
|
|
self.max_seq_length = config.max_seq_length
|
|
|
self.dtype = torch.float16 if config.dtype == "float16" else torch.bfloat16
|
|
|
self.load_in_4bit = config.load_in_4bit
|
|
|
+ self.fast_inference=config.fast_inference
|
|
|
self.lora_rank = config.lora_rank
|
|
|
self.gpu_memory_utilization=config.gpu_memory_utilization
|
|
|
|
|
@@ -32,9 +33,9 @@ class ModelTrainer:
|
|
|
max_seq_length=self.max_seq_length,
|
|
|
load_in_4bit=self.load_in_4bit,
|
|
|
dtype=self.dtype,
|
|
|
- fast_inference=False,
|
|
|
+ fast_inference=self.fast_inference,
|
|
|
max_lora_rank=self.lora_rank,
|
|
|
- gpu_memory_utilization=0.6,
|
|
|
+ gpu_memory_utilization=config.gpu_memory_utilization,
|
|
|
)
|
|
|
|
|
|
model = model.to_empty(device='cuda')
|
|
@@ -91,7 +92,7 @@ class ModelTrainer:
|
|
|
torch.cuda.empty_cache()
|
|
|
|
|
|
training_args = GRPOConfig(
|
|
|
- use_vllm=False,
|
|
|
+ use_vllm=self.config.use_vllm,
|
|
|
learning_rate=self.config.learning_rate,
|
|
|
adam_beta1=self.config.adam_beta1,
|
|
|
adam_beta2=self.config.adam_beta2,
|
|
@@ -233,8 +234,14 @@ if __name__ == "__main__":
|
|
|
os.environ['MASTER_PORT'] = '12345'
|
|
|
|
|
|
|
|
|
- # 初始化进程组
|
|
|
- dist.init_process_group(backend='nccl', init_method='env://')
|
|
|
+ # Pick the backend by operating system: 'gloo' on Windows (os.name == 'nt'), 'nccl' otherwise
|
|
|
+ backend = 'gloo' if os.name == 'nt' else 'nccl'
|
|
|
+
|
|
|
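+ # Use file-based initialization (file:// URL); NOTE(review): file:// init normally also needs rank= and world_size= passed to init_process_group - confirm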
|
|
|
+ init_method = 'file:///tmp/shared_file' # 文件路径需要所有进程都能访问
|
|
|
+ dist.init_process_group(backend=backend, init_method=init_method)
|
|
|
+
|
|
|
+ print(f"Initialized distributed training with backend: {backend}")
|
|
|
|
|
|
# 初始化 ModelTrainer
|
|
|
trainer = ModelTrainer(config)
|