|
@@ -114,31 +114,31 @@ class ModelTrainer:
|
|
|
self.load_in_4bit = load_in_4bit # load_in_4bit: 是否以4位精度加载模型,用于节省显存
|
|
|
self.lora_rank=lora_rank #Larger rank = smarter, but slower
|
|
|
|
|
|
- def load_model(self):
|
|
|
+ def load_model(self,lora_rank=64):
|
|
|
# 加载预训练模型和分词器
|
|
|
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
|
model_name=self.model_name,
|
|
|
max_seq_length=self.max_seq_length,
|
|
|
load_in_4bit=self.load_in_4bit, # 值为True 以 4 bit量化进行微调,为False LoRA 16bit。这将内存使用量减少了 4 倍,使我们能够在免费的 16GB 内存 GPU 中实际进行微调。4 位量化本质上将权重转换为一组有限的数字以减少内存使用量。这样做的缺点是准确度会下降 1-2%。如果您想要这种微小的额外准确度,请在较大的 GPU(如 H100)上将其设置为 False。
|
|
|
dtype=self.dtype,
|
|
|
- fast_inference = True, # Enable vLLM fast inference
|
|
|
+ fast_inference = False, # Enable vLLM fast inference
|
|
|
max_lora_rank = lora_rank,
|
|
|
- gpu_memory_utilization=0.1, # 0.6 # Reduce if out of memory
|
|
|
+ gpu_memory_utilization=0.6, # 0.6 # Reduce if out of memory
|
|
|
)
|
|
|
|
|
|
- # 将模型移动到设备上
|
|
|
- model = model.to_empty(device='cuda') # 使用 to_empty 而不是 to
|
|
|
+ # # 将模型移动到设备上
|
|
|
+ # model = model.to_empty(device='cuda') # 使用 to_empty 而不是 to
|
|
|
|
|
|
- # 初始化模型的权重
|
|
|
- for param in model.parameters():
|
|
|
- if param.is_meta:
|
|
|
- param.data = torch.randn_like(param) # 随机初始化
|
|
|
+ # # 初始化模型的权重
|
|
|
+ # for param in model.parameters():
|
|
|
+ # if param.is_meta:
|
|
|
+ # param.data = torch.randn_like(param) # 随机初始化
|
|
|
|
|
|
# 添加 LoRA 适配器
|
|
|
model = FastLanguageModel.get_peft_model(
|
|
|
model,
|
|
|
max_seq_length=self.max_seq_length, # 最大上下文(序列)长度
|
|
|
- r=8, # 16 # LoRA 的秩,控制适配器的复杂度
|
|
|
+ r=lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
|
|
|
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
|
|
|
"gate_proj", "up_proj", "down_proj"], # 应用 LoRA 的目标模块
|
|
|
lora_alpha=16, # LoRA 的 alpha 参数,控制适配器的缩放
|
|
@@ -154,7 +154,8 @@ class ModelTrainer:
|
|
|
|
|
|
def load_data(self, train_data_path):
    """Load the training split from a JSON Lines file.

    Args:
        train_data_path: Path to the training data, in JSONL format.

    Returns:
        The "train" split returned by ``load_dataset``.
    """
    # The path is handed straight to load_dataset; the file handle that was
    # previously opened around this call was never used, so it is dropped.
    train_dataset = load_dataset(
        "json",
        data_files={"train": train_data_path},
        split="train",
    )
    return train_dataset
|
|
@@ -166,16 +167,16 @@ class ModelTrainer:
|
|
|
print(f"Reserved memory: {torch.cuda.memory_reserved()}")
|
|
|
print(f"Allocated memory: {torch.cuda.memory_allocated()}")
|
|
|
|
|
|
- # 启用 pin_memory 2025年3月7日未能验证通过
|
|
|
- train_loader = torch.utils.data.DataLoader(
|
|
|
- train_dataset, batch_size=1, shuffle=True, pin_memory=True
|
|
|
- )
|
|
|
+ # # 启用 pin_memory 2025年3月10日未能验证通过
|
|
|
+ # train_loader = torch.utils.data.DataLoader(
|
|
|
+ # train_dataset, batch_size=1, shuffle=True, pin_memory=True
|
|
|
+ # )
|
|
|
|
|
|
- # 释放未使用的显存
|
|
|
- torch.cuda.empty_cache()
|
|
|
+ # # 释放未使用的显存
|
|
|
+ # torch.cuda.empty_cache()
|
|
|
|
|
|
training_args = GRPOConfig(
|
|
|
- use_vllm = True, # use vLLM for fast inference!
|
|
|
+ use_vllm = False, # use vLLM for fast inference!
|
|
|
learning_rate = 5e-6,
|
|
|
adam_beta1 = 0.9,
|
|
|
adam_beta2 = 0.99,
|
|
@@ -188,12 +189,12 @@ class ModelTrainer:
|
|
|
fp16 = not is_bfloat16_supported(),
|
|
|
per_device_train_batch_size = 1,
|
|
|
gradient_accumulation_steps = 1, # Increase to 4 for smoother training
|
|
|
- num_generations = 128, # 256 # 每次生成 输出个数,值范围: 1 - 256
|
|
|
- max_prompt_length = 128, # 256 # 输入提示的最大长度
|
|
|
- max_completion_length = 128,# 200 # 生成内容的最大长度
|
|
|
+ num_generations = 8, # 8 # 每次生成 输出 个数
|
|
|
+ max_prompt_length = 256, # 256 # 输入提示的最大长度
|
|
|
+ max_completion_length = 200,# 200 # 生成内容的最大长度
|
|
|
num_train_epochs = 1, # Set to 1 for a full training run
|
|
|
- max_steps = 10, # 250
|
|
|
- save_steps = 10, # 250
|
|
|
+ max_steps = 250, # 250
|
|
|
+ save_steps = 250, # 250
|
|
|
max_grad_norm = 0.1,
|
|
|
report_to = "none", # Can use Weights & Biases
|
|
|
output_dir = os.path.join('..', 'models',"outputs"),
|
|
@@ -229,10 +230,10 @@ if __name__ == "__main__":
|
|
|
# 配置参数
|
|
|
model_name = os.path.join('..', 'models', 'pretrained', 'DeepSeek-R1-Distill-Qwen-1.5B')
|
|
|
# model_name: 预训练模型的路径
|
|
|
- max_seq_length = 512 # 单次会话(single session) 的最大 token 长度,一个token大约3-4 字节(Byte)
|
|
|
+ max_seq_length = 6144 # 单次会话(single session) 的最大 token 长度,一个token大约3-4 字节(Byte)
|
|
|
dtype = torch.float16 # 数据类型
|
|
|
load_in_4bit = True # 是否以4位精度加载模型
|
|
|
- lora_rank=16
|
|
|
+ lora_rank=64
|
|
|
|
|
|
# 定义训练集和测试集路径
|
|
|
train_data_path = os.path.join('..', 'data', 'processed', 'train.jsonl')
|
|
@@ -241,10 +242,10 @@ if __name__ == "__main__":
|
|
|
try:
|
|
|
# 设置环境变量
|
|
|
# 单机多卡
|
|
|
- os.environ['RANK'] = '0' # 第一张卡的 rank
|
|
|
- os.environ['WORLD_SIZE'] = '1' # 总共有 1 张卡
|
|
|
- os.environ['MASTER_ADDR'] = 'localhost'
|
|
|
- os.environ['MASTER_PORT'] = '12345'
|
|
|
+ # os.environ['RANK'] = '0' # 第一张卡的 rank
|
|
|
+ # os.environ['WORLD_SIZE'] = '1' # 总共有 1 张卡
|
|
|
+ # os.environ['MASTER_ADDR'] = 'localhost'
|
|
|
+ # os.environ['MASTER_PORT'] = '12345'
|
|
|
# 多机多卡
|
|
|
# export RANK=0 # 第一台机器的 rank
|
|
|
# export WORLD_SIZE=4 # 总共有 4 台机器
|
|
@@ -252,12 +253,12 @@ if __name__ == "__main__":
|
|
|
# export MASTER_PORT=12345
|
|
|
|
|
|
# 初始化进程组
|
|
|
- dist.init_process_group(backend='nccl', init_method='env://')
|
|
|
+ # dist.init_process_group(backend='nccl', init_method='env://')
|
|
|
# 初始化 ModelTrainer
|
|
|
trainer = ModelTrainer(model_name, max_seq_length, dtype, load_in_4bit,lora_rank)
|
|
|
|
|
|
# 加载模型和分词器
|
|
|
- model, tokenizer = trainer.load_model()
|
|
|
+ model, tokenizer = trainer.load_model(lora_rank)
|
|
|
|
|
|
# 加载数据集
|
|
|
train_dataset = trainer.load_data(train_data_path)
|
|
@@ -269,6 +270,7 @@ if __name__ == "__main__":
|
|
|
save_path = os.path.join('..', 'models', 'trained', 'DeepSeek-R1-Distill-Qwen-1.5B-GRPO')
|
|
|
trainer.save_model(model, tokenizer, save_path)
|
|
|
finally:
|
|
|
- # 确保进程组被销毁
|
|
|
- if dist.is_initialized():
|
|
|
- dist.destroy_process_group()
|
|
|
+ # # 确保进程组被销毁
|
|
|
+ # if dist.is_initialized():
|
|
|
+ # dist.destroy_process_group()
|
|
|
+ print("train finally")
|