model_trainer_v2.2_no.py

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the torch optimizer
from torch.amp import autocast, GradScaler  # use the newer torch.amp module
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup, DataCollatorWithPadding
from src.config import Config
import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
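
# src.config.Config is not shown in this file; the commented sketch below lists
# the attributes this trainer assumes it provides. The names are taken from
# their usage in this module, and the values are illustrative placeholders only.
#
#     class Config:
#         PRETRAINED_MODEL_DIR = "models/pretrained"        # HF model id or local dir
#         TRAINED_MODEL_DIR = "models/trained"               # where train() saves outputs
#         PROCESSED_DATA_PATH = "data/processed.txt"         # one training sample per line
#         DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#         SEQ_LENGTH = 512
#         BATCH_SIZE = 8
#         NUM_WORKERS = 2
#         EPOCHS = 3
#         LEARNING_RATE = 5e-5
#         GRADIENT_ACCUMULATION_STEPS = 4
#         USE_FP16 = True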

class TextDataset(Dataset):
    """Custom dataset: one training sample per line of the input file."""

    def __init__(self, file_path, tokenizer, seq_length):
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        with open(file_path, "r", encoding="utf-8") as f:
            self.lines = f.read().splitlines()

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]
        # Return a dict so DataCollatorWithPadding can pad input_ids and
        # attention_mask dynamically per batch; truncation caps the length.
        encoding = self.tokenizer(
            line,
            max_length=self.seq_length,
            truncation=True,
        )
        # Lists stay on the CPU; the training loop moves each batch to the device.
        return {
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
        }
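
# Quick usage sketch (hypothetical paths and sizes, not part of this pipeline):
# a dataset item is a dict of Python lists, which DataCollatorWithPadding pads
# and converts to tensors when it builds a batch.
#
#     tokenizer = AutoTokenizer.from_pretrained("gpt2")
#     ds = TextDataset("data/processed.txt", tokenizer, seq_length=128)
#     item = ds[0]             # {"input_ids": [...], "attention_mask": [...]}
#     len(item["input_ids"])   # <= 128 after truncation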

class ModelTrainer:
    def __init__(self):
        """Initialize the model, tokenizer, and target device."""
        self.model_id = Config.PRETRAINED_MODEL_DIR
        self.device = Config.DEVICE
        logger.info(f"Using device: {self.device}")
        try:
            # Load the tokenizer and model
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
            self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
            # Causal-LM tokenizers (e.g. GPT-2) often lack a pad token, which
            # DataCollatorWithPadding needs; fall back to the EOS token.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.to(self.device)  # move the model to the target device
            logger.info("Pretrained model and tokenizer loaded.")
        except Exception as e:
            logger.error(f"Failed to load pretrained model: {e}")
            raise
    def train(self):
        """Train the model."""
        try:
            # Load the dataset
            dataset = TextDataset(Config.PROCESSED_DATA_PATH, self.tokenizer, Config.SEQ_LENGTH)
            data_collator = DataCollatorWithPadding(self.tokenizer)  # pad each batch dynamically
            dataloader = DataLoader(
                dataset,
                batch_size=Config.BATCH_SIZE,
                shuffle=True,
                num_workers=Config.NUM_WORKERS,
                collate_fn=data_collator,
            )

            # Initialize the optimizer and learning-rate scheduler
            optimizer = AdamW(self.model.parameters(), lr=Config.LEARNING_RATE, weight_decay=0.01)  # with weight decay
            # The scheduler steps once per optimizer update, not once per batch,
            # so account for gradient accumulation when counting training steps.
            total_steps = (len(dataloader) // Config.GRADIENT_ACCUMULATION_STEPS) * Config.EPOCHS
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

            # Mixed-precision training
            scaler = GradScaler(enabled=Config.USE_FP16)

            # Training loop
            self.model.train()
            for epoch in range(Config.EPOCHS):
                for i, batch in enumerate(dataloader):
                    # Move the batch to the target device
                    inputs = batch["input_ids"].to(self.device)
                    attention_mask = batch["attention_mask"].to(self.device)
                    # Mask padded positions out of the loss
                    labels = inputs.clone()
                    labels[attention_mask == 0] = -100

                    # Forward pass under autocast (mixed precision)
                    with autocast(device_type=Config.DEVICE, enabled=Config.USE_FP16):
                        outputs = self.model(inputs, attention_mask=attention_mask, labels=labels)
                        loss = outputs.loss / Config.GRADIENT_ACCUMULATION_STEPS  # normalize for accumulation

                    # Backward pass and optimization
                    scaler.scale(loss).backward()  # scale the loss and backpropagate
                    if (i + 1) % Config.GRADIENT_ACCUMULATION_STEPS == 0:
                        scaler.step(optimizer)  # update the parameters
                        scaler.update()         # update the loss scaler
                        optimizer.zero_grad()
                        scheduler.step()        # advance the learning rate

                logger.info(f"Epoch {epoch + 1}/{Config.EPOCHS}, Loss: {loss.item()}")

            # Save the trained model and tokenizer
            os.makedirs(Config.TRAINED_MODEL_DIR, exist_ok=True)
            self.model.save_pretrained(Config.TRAINED_MODEL_DIR)
            self.tokenizer.save_pretrained(Config.TRAINED_MODEL_DIR)
            logger.info(f"Model saved to {Config.TRAINED_MODEL_DIR}")
        except Exception as e:
            logger.error(f"Training failed: {e}")
            raise
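
# Minimal entry-point sketch: assumes Config points at a valid pretrained model
# directory and a processed data file, as outlined in the Config sketch above.
if __name__ == "__main__":
    trainer = ModelTrainer()
    trainer.train()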