|
@@ -180,15 +180,15 @@ class ModelTrainer:
|
|
|
weight_decay = 0.1,
|
|
|
warmup_ratio = 0.1,
|
|
|
lr_scheduler_type = "cosine",
|
|
|
- optim = "adamw_8bit",
|
|
|
+ optim = "adamw_8bit", # "adamw_8bit" if device == "cuda" else "adamw_torch", # CPU 使用 adamw_torch
|
|
|
logging_steps = 1,
|
|
|
bf16 = is_bfloat16_supported(),
|
|
|
fp16 = not is_bfloat16_supported(),
|
|
|
per_device_train_batch_size = 1,
|
|
|
gradient_accumulation_steps = 1, # Increase to 4 for smoother training
|
|
|
- num_generations = 4, # 每次生成 4 个输出
|
|
|
- max_prompt_length = 256, # 输入提示的最大长度
|
|
|
- max_completion_length = 200, # 生成内容的最大长度
|
|
|
+ num_generations = 128, # 256 # 每次生成 128 个输出
|
|
|
+ max_prompt_length = 128, # 256 # 输入提示的最大长度
|
|
|
+ max_completion_length = 128, # 200 # 生成内容的最大长度
|
|
|
num_train_epochs = 1, # Set to 1 for a full training run
|
|
|
max_steps = 10, # 250
|
|
|
save_steps = 10, # 250
|
|
@@ -227,7 +227,7 @@ if __name__ == "__main__":
|
|
|
# 配置参数
|
|
|
model_name = os.path.join('..', 'models', 'pretrained', 'DeepSeek-R1-Distill-Qwen-1.5B')
|
|
|
# model_name: 预训练模型的路径
|
|
|
- max_seq_length = 2048 # 单次会话(single session) 的最大 token 长度,一个token大约3-4 字节(Byte)
|
|
|
+ max_seq_length = 512 # 单次会话(single session) 的最大 token 长度,一个token大约3-4 字节(Byte)
|
|
|
dtype = torch.float16 # 数据类型
|
|
|
load_in_4bit = True # 是否以4位精度加载模型
|
|
|
lora_rank=16
|