@@ -114,7 +114,7 @@ class ModelTrainer:
         self.load_in_4bit = load_in_4bit  # load_in_4bit: whether to load the model in 4-bit precision to save GPU memory
         self.lora_rank=lora_rank  # Larger rank = smarter, but slower

-    def load_model(self):
+    def load_model(self,lora_rank=64):
         # Load the pretrained model and tokenizer
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=self.model_name,
@@ -138,7 +138,7 @@ class ModelTrainer:
         model = FastLanguageModel.get_peft_model(
             model,
             max_seq_length=self.max_seq_length,  # maximum context (sequence) length
-            r=16,  # LoRA rank, controls the adapter's complexity
+            r=lora_rank,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
             target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                             "gate_proj", "up_proj", "down_proj"],  # target modules to apply LoRA to
             lora_alpha=16,  # LoRA alpha, controls the adapter's scaling
@@ -189,7 +189,7 @@ class ModelTrainer:
             fp16 = not is_bfloat16_supported(),
             per_device_train_batch_size = 1,
             gradient_accumulation_steps = 1,  # Increase to 4 for smoother training
-            num_generations = 256,  # number of outputs generated per prompt
+            num_generations = 8,  # number of outputs generated per prompt
             max_prompt_length = 256,  # maximum length of the input prompt
             max_completion_length = 200,  # maximum length of the generated content
             num_train_epochs = 1,  # Set to 1 for a full training run
@@ -233,7 +233,7 @@ if __name__ == "__main__":
     max_seq_length = 6144  # maximum token length for a single session; a token is roughly 3-4 bytes
     dtype = torch.float16  # data type
     load_in_4bit = True  # whether to load the model in 4-bit precision
-    lora_rank=16
+    lora_rank=64

     # Define the training and test set paths
     train_data_path = os.path.join('..', 'data', 'processed', 'train.jsonl')
@@ -258,7 +258,7 @@ if __name__ == "__main__":
     trainer = ModelTrainer(model_name, max_seq_length, dtype, load_in_4bit,lora_rank)

     # Load the model and tokenizer
-    model, tokenizer = trainer.load_model()
+    model, tokenizer = trainer.load_model(lora_rank)

     # Load the dataset
     train_dataset = trainer.load_data(train_data_path)
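
For reference, here is a minimal sketch of how the new lora_rank argument threads from the __main__ block through load_model into the LoRA adapter after this patch. It assumes the unsloth FastLanguageModel API already used in this file; the model name shown is a placeholder, not the one configured in the repository.

import torch
from unsloth import FastLanguageModel


class ModelTrainer:
    def __init__(self, model_name, max_seq_length, dtype, load_in_4bit, lora_rank):
        self.model_name = model_name
        self.max_seq_length = max_seq_length
        self.dtype = dtype
        self.load_in_4bit = load_in_4bit  # load in 4-bit precision to save GPU memory
        self.lora_rank = lora_rank        # larger rank = more capacity, but slower

    def load_model(self, lora_rank=64):
        # Load the pretrained model and tokenizer.
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_name,
            max_seq_length=self.max_seq_length,
            dtype=self.dtype,
            load_in_4bit=self.load_in_4bit,
        )
        # Attach LoRA adapters; r controls adapter capacity (8/16/32/64/128 are common).
        model = FastLanguageModel.get_peft_model(
            model,
            max_seq_length=self.max_seq_length,
            r=lora_rank,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            lora_alpha=16,
        )
        return model, tokenizer


if __name__ == "__main__":
    lora_rank = 64
    trainer = ModelTrainer(
        model_name="unsloth/Qwen2.5-3B-Instruct",  # placeholder model name
        max_seq_length=6144,
        dtype=torch.float16,
        load_in_4bit=True,
        lora_rank=lora_rank,
    )
    model, tokenizer = trainer.load_model(lora_rank)

The num_generations change is independent of the rank change: it only lowers how many completions GRPO samples per prompt at each step (from 256 to 8), which reduces generation time and memory per step.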