@@ -39,7 +39,7 @@ class ModelTrainer:
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=self.model_name,
             max_seq_length=self.max_seq_length,
-            load_in_4bit=self.load_in_4bit,
+            load_in_4bit=self.load_in_4bit,  # False for LoRA 16bit
             dtype=self.dtype,
             fast_inference=self.fast_inference,
             max_lora_rank=self.lora_rank,
@@ -57,16 +57,17 @@ class ModelTrainer:
         model = FastLanguageModel.get_peft_model(
             model,
             max_seq_length=self.max_seq_length,
-            r=self.lora_rank,
+            r=self.lora_rank,  # Choose any number > 0! Suggested: 8, 16, 32, 64, 128
             target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
-                            "gate_proj", "up_proj", "down_proj"],
+                            "gate_proj", "up_proj", "down_proj"],  # Remove QKVO if out of memory
             lora_alpha=16,
-            lora_dropout=0,
-            bias="none",
-            use_gradient_checkpointing="unsloth",
+            lora_dropout=0,  # Supports any, but = 0 is optimized
+            bias="none",  # Supports any, but = "none" is optimized
+            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+            use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
             random_state=3407,
-            use_rslora=False,
-            loftq_config=None,
+            use_rslora=False,  # We support rank-stabilized LoRA
+            loftq_config=None,  # And LoftQ
         )

         return model, tokenizer
@@ -79,6 +80,7 @@ class ModelTrainer:
         """
         with open(train_data_path, 'r') as f:
             train_dataset = load_dataset("json", data_files={"train": train_data_path}, split="train")
+            print("train_dataset", train_dataset)
             return train_dataset

     def train(self, model, tokenizer, train_dataset):
@@ -124,22 +126,26 @@ class ModelTrainer:
             output_dir=self.config.output_dir,
         )

+        """
+        The PyTorch distributed process group has been initialized, but parallel_mode is not ParallelMode.DISTRIBUTED.
+        To use PyTorch distributed data parallel (DDP), launch this script with python -m torch.distributed.launch.
+        """
+
         trainer = GRPOTrainer(
             model=model,
-            processing_class=tokenizer,
+            processing_class=tokenizer,  # tokenizer that converts the input text into token IDs the model understands
             reward_funcs=[
-                self.xmlcount_reward_func,
-                self.soft_format_reward_func,
-                # self.strict_format_reward_func,
-                self.int_reward_func,
-                self.correctness_reward_func,
-                self.strict_format_reward_func,
-                self.semantic_correctness_reward_func,
-                self.reasoning_quality_reward_func,
-                self.combined_reward_func,
-            ],
-            args=training_args,
-            train_dataset=train_dataset,
+                self.xmlcount_reward_func,              # reward based on XML tag counts
+                self.soft_format_reward_func,           # soft format reward
+                self.strict_format_reward_func,         # strict format reward
+                self.int_reward_func,                   # integer-answer reward
+                self.correctness_reward_func,           # reward for answer correctness
+                self.semantic_correctness_reward_func,  # semantic correctness reward
+                self.reasoning_quality_reward_func,     # reasoning quality reward
+                self.combined_reward_func,              # combined reward
+            ],  # list of reward functions that score each output; in GRPO training they judge output quality
+            args=training_args,  # the training hyperparameters defined above
+            train_dataset=train_dataset,  # the training dataset
         )

         trainer.train()
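
For context on the reward_funcs list in the hunk above: TRL's GRPOTrainer calls each entry with the sampled completions (plus prompts and any dataset columns as keyword arguments) and expects one float score back per completion; bound methods such as self.int_reward_func work because they are ordinary callables. The sketch below shows only that shape, under the assumption of a chat-style dataset and an <answer>...</answer> output format; it is not the implementation behind the methods referenced in this diff, and the 0.5 weight is arbitrary.

    import re

    def int_reward_func(completions, **kwargs) -> list[float]:
        # With a conversational dataset, each completion is a list of message
        # dicts; take the generated text of the first (assistant) message.
        responses = [completion[0]["content"] for completion in completions]
        scores = []
        for text in responses:
            # Reward completions whose <answer> block parses as an integer.
            match = re.search(r"<answer>\s*(.*?)\s*</answer>", text, re.DOTALL)
            answer = match.group(1).strip() if match else ""
            scores.append(0.5 if answer.lstrip("-").isdigit() else 0.0)
        return scores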
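A note on the translated docstring in the last hunk: it reads like the Hugging Face Trainer message raised when a distributed process group is already initialized but DDP is not active, and the remedy it points to is starting training through PyTorch's distributed launcher rather than plain python, for example python -m torch.distributed.launch --nproc_per_node=<num_gpus> <training_script>.py (recent PyTorch versions replace this launcher with torchrun). The script name and GPU count are placeholders; the diff does not show the actual entry point.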