@@ -1,12 +1,10 @@
+# model_trainer.py
 import torch
 from torch.utils.data import Dataset, DataLoader
-from torch.optim import AdamW
-from torch.quantization import quantize_dynamic  # import the quantization utilities
-from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
+from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
 from src.config import Config
 import logging
 import os
-from torch.amp import autocast, GradScaler
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -35,15 +33,13 @@ class TextDataset(Dataset):
 class ModelTrainer:
     def __init__(self):
         """Initialize the model, tokenizer, and optimizer."""
-        self.modelId = Config.PRETRAINED_MODEL_DIR
         self.device = Config.DEVICE
         logger.info(f"Using device: {self.device}")
-        logger.info(f"model id: {self.modelId}")
 
         try:
             # Load the tokenizer and model
-            self.tokenizer = AutoTokenizer.from_pretrained(self.modelId)
-            self.model = AutoModelForCausalLM.from_pretrained(self.modelId)
+            self.tokenizer = AutoTokenizer.from_pretrained(Config.PRETRAINED_MODEL_DIR)
+            self.model = AutoModelForCausalLM.from_pretrained(Config.PRETRAINED_MODEL_DIR)
             self.model.to(self.device)  # move the model to the device
             logger.info("Pretrained model and tokenizer loaded.")
         except Exception as e:
@@ -57,72 +53,33 @@ class ModelTrainer:
             dataset = TextDataset(Config.PROCESSED_DATA_PATH, self.tokenizer, Config.SEQ_LENGTH)
             dataloader = DataLoader(dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=Config.NUM_WORKERS)
 
-            # Initialize the optimizer and learning-rate scheduler
+            # Initialize the optimizer
             optimizer = AdamW(self.model.parameters(), lr=Config.LEARNING_RATE)
-            total_steps = len(dataloader) * Config.EPOCHS
-            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)
-
-            # Mixed-precision training
-            scaler = GradScaler(enabled=Config.USE_FP16)  # device_type argument removed
 
             # Training loop
             self.model.train()
             for epoch in range(Config.EPOCHS):
-                for i, batch in enumerate(dataloader):
+                for batch in dataloader:
                     # Move the input data to the device
                     inputs = batch.to(self.device)
 
                     # Forward pass
-                    with autocast(device_type="cuda", enabled=Config.USE_FP16):  # use the new autocast API
-                        outputs = self.model(inputs, labels=inputs)
-                        loss = outputs.loss  # / Config.GRADIENT_ACCUMULATION_STEPS  # normalize the loss
+                    outputs = self.model(inputs, labels=inputs)
+                    loss = outputs.loss
 
                     # Backward pass and optimization
-                    scaler.scale(loss).backward()  # scale the loss and backpropagate
-                    if (i + 1) % Config.GRADIENT_ACCUMULATION_STEPS == 0:
-                        scaler.step(optimizer)  # update the parameters
-                        scaler.update()  # update the scaler
-                        optimizer.zero_grad()
-                        scheduler.step()  # update the learning rate
+                    loss.backward()
+                    optimizer.step()
+                    optimizer.zero_grad()
 
                 logger.info(f"Epoch {epoch + 1}/{Config.EPOCHS}, Loss: {loss.item()}")
 
             # Save the trained model
             os.makedirs(Config.TRAINED_MODEL_DIR, exist_ok=True)
-            # self.model.save_pretrained(Config.TRAINED_MODEL_DIR)
-            # self.tokenizer.save_pretrained(Config.TRAINED_MODEL_DIR)
-            # logger.info(f"Model saved to {Config.TRAINED_MODEL_DIR}")
-
-            # Quantize the model
-            self.quantize_model()
+            self.model.save_pretrained(Config.TRAINED_MODEL_DIR)
+            self.tokenizer.save_pretrained(Config.TRAINED_MODEL_DIR)
+            logger.info(f"Model saved to {Config.TRAINED_MODEL_DIR}")
 
         except Exception as e:
            logger.error(f"Training failed: {e}")
            raise
-
-    def quantize_model(self):
-        """Quantize the model."""
-        try:
-            logger.info("Quantizing model...")
-            # Move the model to the CPU (quantization must run on the CPU)
-            self.model.to("cpu")
-
-            # Dynamic quantization (int8)
-            quantized_model = quantize_dynamic(
-                self.model,          # the original model
-                {torch.nn.Linear},   # the layer types to quantize
-                dtype=torch.qint8    # the quantization dtype (int8)
-            )
-
-            # Save the quantized model
-            # quantized_model_dir = os.path.join(Config.TRAINED_MODEL_DIR, "quantized")
-
-            quantized_model.save_pretrained(Config.TRAINED_MODEL_DIR)
-            quantized_model.save_pretrained(Config.TRAINED_MODEL_DIR)
-            # os.makedirs(quantized_model_dir, exist_ok=True)
-            # torch.save(quantized_model.state_dict(), os.path.join(quantized_model_dir, "quantized_model.pth"))
-            logger.info(f"Quantized model saved to {Config.TRAINED_MODEL_DIR}")
-
-        except Exception as e:
-            logger.error(f"Quantization failed: {e}")
-            raise
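
For reference, everything the trainer reads comes from `src.config.Config`. A minimal sketch of the fields this file uses is below; the field names are taken from the diff, but every value is an illustrative placeholder, not the project's actual configuration (`USE_FP16` and `GRADIENT_ACCUMULATION_STEPS` are referenced only by the code this diff removes, so they are omitted).

# src/config.py -- hypothetical sketch; field names come from model_trainer.py,
# all values are placeholders, not the project's real settings
import torch

class Config:
    PRETRAINED_MODEL_DIR = "models/pretrained"  # placeholder path
    TRAINED_MODEL_DIR = "models/trained"        # placeholder path
    PROCESSED_DATA_PATH = "data/processed.txt"  # placeholder path
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEQ_LENGTH = 512      # placeholder
    BATCH_SIZE = 8        # placeholder
    NUM_WORKERS = 2       # placeholder
    LEARNING_RATE = 5e-5  # placeholder
    EPOCHS = 3            # placeholder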
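
The training loop also assumes that `TextDataset` (defined in the context elided between the hunks above) yields plain token-id tensors, since each batch is moved to the device with `batch.to(...)` and fed to the model as both inputs and labels. A minimal sketch consistent with the call `TextDataset(Config.PROCESSED_DATA_PATH, self.tokenizer, Config.SEQ_LENGTH)` follows; it is an assumption for illustration, not the repository's actual implementation.

# Hypothetical TextDataset sketch; the real class lives in the elided part of model_trainer.py
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, path, tokenizer, seq_length):
        with open(path, encoding="utf-8") as f:
            text = f.read()
        # Tokenize the whole corpus once, then cut it into fixed-length blocks
        ids = tokenizer(text, return_tensors="pt").input_ids[0]
        n_blocks = len(ids) // seq_length
        self.blocks = ids[: n_blocks * seq_length].view(n_blocks, seq_length)

    def __len__(self):
        return len(self.blocks)

    def __getitem__(self, idx):
        # A LongTensor of token ids; DataLoader stacks these into a (batch, seq_length) tensor
        return self.blocks[idx]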