Browse source

Refactor train_model_grpo.py following object-oriented design

zhouyang.xie 9 months ago
parent
commit
73a5634a5e
26 changed files with 63 additions and 45 deletions
  1. README.MD (+16 -6)
  2. conf/conf_train.yaml (+15 -15)
  3. resources/images/计算资源及报价-源自-AI智算云.png (+0 -0)
  4. resources/images/计算资源及报价-源自-华为云.png (+0 -0)
  5. resources/images/计算资源及报价-源自-李明星.png (BIN)
  6. src/inference.py (+3 -1)
  7. src/train_model_grpo_v1.1.py (+27 -21)
  8. src/unsloth_compiled_cache/UnslothAlignPropTrainer.py (+1 -1)
  9. src/unsloth_compiled_cache/UnslothDDPOTrainer.py (+1 -1)
  10. src/unsloth_compiled_cache/__pycache__/UnslothAlignPropTrainer.cpython-311.pyc (BIN)
  11. src/unsloth_compiled_cache/__pycache__/UnslothBCOTrainer.cpython-311.pyc (BIN)
  12. src/unsloth_compiled_cache/__pycache__/UnslothCPOTrainer.cpython-311.pyc (BIN)
  13. src/unsloth_compiled_cache/__pycache__/UnslothDDPOTrainer.cpython-311.pyc (BIN)
  14. src/unsloth_compiled_cache/__pycache__/UnslothDPOTrainer.cpython-311.pyc (BIN)
  15. src/unsloth_compiled_cache/__pycache__/UnslothGKDTrainer.cpython-311.pyc (BIN)
  16. src/unsloth_compiled_cache/__pycache__/UnslothGRPOTrainer.cpython-311.pyc (BIN)
  17. src/unsloth_compiled_cache/__pycache__/UnslothKTOTrainer.cpython-311.pyc (BIN)
  18. src/unsloth_compiled_cache/__pycache__/UnslothNashMDTrainer.cpython-311.pyc (BIN)
  19. src/unsloth_compiled_cache/__pycache__/UnslothORPOTrainer.cpython-311.pyc (BIN)
  20. src/unsloth_compiled_cache/__pycache__/UnslothOnlineDPOTrainer.cpython-311.pyc (BIN)
  21. src/unsloth_compiled_cache/__pycache__/UnslothPPOTrainer.cpython-311.pyc (BIN)
  22. src/unsloth_compiled_cache/__pycache__/UnslothPRMTrainer.cpython-311.pyc (BIN)
  23. src/unsloth_compiled_cache/__pycache__/UnslothRLOOTrainer.cpython-311.pyc (BIN)
  24. src/unsloth_compiled_cache/__pycache__/UnslothRewardTrainer.cpython-311.pyc (BIN)
  25. src/unsloth_compiled_cache/__pycache__/UnslothSFTTrainer.cpython-311.pyc (BIN)
  26. src/unsloth_compiled_cache/__pycache__/UnslothXPOTrainer.cpython-311.pyc (BIN)

+ 16 - 6
README.MD

@@ -280,6 +280,11 @@ DeepSeek-R1-Distill-Qwen-32B is built by applying knowledge distillation to the original model (e.g. Qwen-
 
 ### vLLM 
 vLLM (its official full English name is not given) is a high-performance inference engine/library designed specifically for Large Language Model (LLM) inference. By optimizing VRAM management, the attention mechanism (KV cache), and tensor computation, it significantly speeds up large-model inference while reducing VRAM usage.  
+vLLM is an open-source LLM inference engine from Ion Stoica's group at UC Berkeley. It was first released on 2023-06-20, and version 0.4.0 followed on 2024-03-30. Thanks to its strong performance, ease of use, and clean code structure, it has attracted wide attention, with more than 17k stars on GitHub.
+
+vLLM was the first to propose and implement PagedAttention; beyond that, it also provides continuous batching, quantization (GPTQ, AWQ, SqueezeLLM, FP8 KV cache, etc.), tensor parallelism, and high-performance CUDA kernels.
+
+vLLM is built on top of PyTorch, transformers, xformers, Ray, fastapi, triton, sentencepiece, and other libraries.
 
 ---
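For reference, a minimal offline-inference sketch with the vllm Python package is shown below; the model name, prompt, and gpu_memory_utilization value are illustrative placeholders, not taken from this repository.

```python
# Minimal sketch of offline inference with vLLM.
# Model name, prompt, and gpu_memory_utilization are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", gpu_memory_utilization=0.6)
sampling_params = SamplingParams(temperature=0.7, max_tokens=128)

outputs = llm.generate(["What is PagedAttention?"], sampling_params)
print(outputs[0].outputs[0].text)  # first prompt, first sampled completion
```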
 
@@ -708,25 +713,30 @@ GPU: VRAM ≥ 192 GB (164 GB); disk: NVMe SSD, ≥ 8 TB, read speed ≥
 **References:**
 [fine-tuning-vram-requirements]:  https://docs.unsloth.ai/get-started/beginner-start-here/unsloth-requirements#fine-tuning-vram-requirements "GPU VRAM requirements for training and fine-tuning models with Unsloth"
 
-## Compute resource cost assessment
+## Compute resource cost assessment
 
 **AI智算云 plan**
 Ascend 910B (compute node) ×2 ≈ CNY 614,000/year
 Network (100 Mbps): CNY 66,000/year
-Total: CNY 680,000/year
+Total cost: CNY 680,000/year
 
 Compute resource configuration and quotation are shown in the figures below:
 <div align=center><img src="./resources/images/计算资源-单节点-源自-AI智算云.png"></div>
-<div align=center><img src="./resources/images/报价-源自-AI智算云.png"></div>
+<div align=center><img src="./resources/images/计算资源及报价-源自-AI智算云.png"></div>
 
 **Huawei Cloud plan**
-Ascend cloud service
+Ascend cloud service, total cost ≥ CNY 1.2 million
 
-<div align=center><img src="./resources/images/报价-源自-华为云.png"></div>
+<div align=center><img src="./resources/images/计算资源及报价-源自-华为云.png"></div>
 Notes:
 1. No compute resource plan for large-model training or fine-tuning is provided.
-2. The "AI professional services" offering includes training and fine-tuning solutions, but does not provide compute resources (they must be purchased separately).
+2. The "AI professional services" offering includes training and fine-tuning solutions, but **does not provide compute resources (they must be purchased separately)**.
 
+**李明星**
+Ascend cloud service, total cost: CNY 163,000
+
+Compute resource configuration and quotation are shown in the figure below:
+<div align=center><img src="./resources/images/计算资源及报价-源自-李明星.png"></div>
 
 
 ## Assessment of compute resources required for large models

+ 15 - 15
conf/conf_train.yaml

@@ -4,29 +4,29 @@ max_seq_length: 6144  # 2048 maximum token length for a single session
 dtype: "float16"  # data type, either "float16" or "bfloat16"
 load_in_4bit: True  # whether to load the model in 4-bit precision
 fast_inference: False # Enable vLLM fast inference
-lora_rank: 64  # LoRA rank
+lora_rank: 64  # LoRA rank; choose any number > 0, suggested 8, 16, 32, 64, 128
 gpu_memory_utilization: 0.6 # GPU VRAM utilization
 
 # Training configuration
 use_vllm: False # use vLLM for fast inference!
-learning_rate: 1e-5  # 5e-6 learning rate
+learning_rate: 1e-5  # 5e-6 learning rate, typical range 1e-4 (0.0001) to 5e-5 (0.00005)
 adam_beta1: 0.9  # beta1 parameter of the Adam optimizer
 adam_beta2: 0.99  # beta2 parameter of the Adam optimizer
-weight_decay: 0.1  # weight decay
-warmup_ratio: 0.1  # learning-rate warmup ratio
-lr_scheduler_type: "cosine"  # learning-rate scheduler type
-optim: "adamw_8bit"  # optimizer type
+weight_decay: 0.1  # weight decay, helps prevent overfitting; set to 0.1
+warmup_ratio: 0.1  # learning-rate warmup ratio, used early in training to avoid instability from a too-large learning rate
+lr_scheduler_type: "cosine"  # learning-rate decay schedule; "cosine" means cosine decay
+optim: "adamw_8bit"  # optimizer type; adamw_8bit is AdamW with 8-bit state to reduce memory usage
 logging_steps: 1  # log every N steps
-per_device_train_batch_size: 1  # training batch size per device
-gradient_accumulation_steps: 1  # gradient accumulation steps
-num_generations: 8  # 8 number of outputs generated each time
-max_prompt_length: 256  # maximum length of the input prompt
-max_completion_length: 200  # maximum length of the generated content
+per_device_train_batch_size: 2  # 1 training batch size per device
+gradient_accumulation_steps: 4  # 1 gradient accumulation steps, used to simulate a larger batch with a small per-device batch size
+num_generations: 6  # 8 number of candidate outputs generated per training step
+max_prompt_length: 256  # maximum length of the model input (prompt)
+max_completion_length: 200  # maximum length of the generated completion
 num_train_epochs: 1  # number of training epochs
-max_steps: 250  # maximum number of training steps
-save_steps: 250  # save the model every N steps
-max_grad_norm: 0.1  # maximum gradient norm
-report_to: "none"  # reporting tool, e.g. Weights & Biases
+max_steps: 250  # maximum number of training steps
+save_steps: 250  # checkpoint interval (save the model every N steps)
+max_grad_norm: 0.1  # gradient-clipping threshold, prevents exploding gradients
+report_to: "none"  # reporting tool, e.g. Weights & Biases; "none" means results are not reported to any external tool
 output_dir: "../models/outputs"  # output directory
 
 # Data configuration
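With the batch settings changed above, the optimizer sees an effective batch of per_device_train_batch_size × gradient_accumulation_steps = 2 × 4 = 8 samples per device per update. A minimal sketch of checking this from the config, assuming PyYAML is installed and the relative path below points at the file:

```python
# Minimal sketch: read conf_train.yaml and compute the effective batch size.
# The relative path and the single-GPU assumption are illustrative only.
import yaml

with open("conf/conf_train.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

effective_batch = cfg["per_device_train_batch_size"] * cfg["gradient_accumulation_steps"]
print(f"effective batch size per device: {effective_batch}")  # 2 * 4 = 8 with the values above
```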

+ 0 - 0
resources/images/报价-源自-AI智算云.png → resources/images/计算资源及报价-源自-AI智算云.png


+ 0 - 0
resources/images/报价-源自-华为云.png → resources/images/计算资源及报价-源自-华为云.png


BIN
resources/images/计算资源及报价-源自-李明星.png


+ 3 - 1
src/inference.py

@@ -1,6 +1,7 @@
 import os
 import torch
 from unsloth import FastLanguageModel
+from transformers import TextStreamer
 
 class ModelInference:
     def __init__(self, model_path, max_seq_length, dtype, load_in_4bit):
@@ -44,7 +45,8 @@ class ModelInference:
             
             # Generate the model's reply
             with torch.no_grad():
-                outputs = self.model.generate(**inputs, max_length=self.max_seq_length, pad_token_id=self.tokenizer.eos_token_id)
+                text_streamer = TextStreamer(self.tokenizer, skip_prompt=True)
+                outputs = self.model.generate(**inputs, streamer=text_streamer, max_length=self.max_seq_length, pad_token_id=self.tokenizer.eos_token_id)
             
             # Decode the model's output
             model_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
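The streaming change above relies on transformers' TextStreamer, which prints tokens to stdout as they are generated; skip_prompt=True suppresses echoing the prompt. A minimal standalone sketch, with a placeholder model name and prompt that are not part of this repository:

```python
# Minimal sketch of streamed generation with transformers' TextStreamer.
# The model name and prompt are placeholders, not from this repository.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("Explain the KV cache in one sentence.", return_tensors="pt")
streamer = TextStreamer(tokenizer, skip_prompt=True)  # print only newly generated tokens

with torch.no_grad():
    model.generate(**inputs, streamer=streamer, max_new_tokens=64,
                   pad_token_id=tokenizer.eos_token_id)
```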

+ 27 - 21
src/train_model_grpo_v1.1.py

@@ -39,7 +39,7 @@ class ModelTrainer:
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=self.model_name,
             max_seq_length=self.max_seq_length,
-            load_in_4bit=self.load_in_4bit,
+            load_in_4bit=self.load_in_4bit, # False for LoRA 16bit
             dtype=self.dtype,
             fast_inference=self.fast_inference,
             max_lora_rank=self.lora_rank,
@@ -57,16 +57,17 @@ class ModelTrainer:
         model = FastLanguageModel.get_peft_model(
             model,
             max_seq_length=self.max_seq_length,
-            r=self.lora_rank,
+            r=self.lora_rank, # Choose any number > 0! Suggested: 8, 16, 32, 64, 128
             target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
-                          "gate_proj", "up_proj", "down_proj"],
+                          "gate_proj", "up_proj", "down_proj"],  # Remove QKVO if out of memory
             lora_alpha=16,
-            lora_dropout=0,
-            bias="none",
-            use_gradient_checkpointing="unsloth",
+            lora_dropout=0, # Supports any, but = 0 is optimized
+            bias="none", # Supports any, but = "none" is optimized
+            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+            use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
             random_state=3407,
-            use_rslora=False,
-            loftq_config=None,
+            use_rslora=False,  # We support rank stabilized LoRA
+            loftq_config=None,  # And LoftQ
         )
 
         return model, tokenizer
@@ -79,6 +80,7 @@ class ModelTrainer:
         """
         with open(train_data_path, 'r') as f:
             train_dataset = load_dataset("json", data_files={"train": train_data_path}, split="train")
+            print("train_dataset", train_dataset)
         return train_dataset
 
     def train(self, model, tokenizer, train_dataset):
@@ -124,22 +126,26 @@ class ModelTrainer:
             output_dir=self.config.output_dir,
         )
 
+        """
+        PyTorch's distributed process group has been initialized, but parallel_mode != ParallelMode.DISTRIBUTED.
+        To use PyTorch's DistributedDataParallel (DDP), launch this script with python -m torch.distributed.launch.
+        """
+
         trainer = GRPOTrainer(
             model=model,
-            processing_class=tokenizer,
+            processing_class=tokenizer, # tokenizer used to process the input text; it converts text into a numeric format the model can understand
             reward_funcs=[
-                self.xmlcount_reward_func,
-                self.soft_format_reward_func,
-                # self.strict_format_reward_func,
-                self.int_reward_func,
-                self.correctness_reward_func,
-                self.strict_format_reward_func,
-                self.semantic_correctness_reward_func,
-                self.reasoning_quality_reward_func,
-                self.combined_reward_func,
-            ],
-            args=training_args,
-            train_dataset=train_dataset,
+                self.xmlcount_reward_func, # reward function based on counting XML tags
+                self.soft_format_reward_func, # soft format-matching reward function
+                self.strict_format_reward_func, # strict format-matching reward function
+                self.int_reward_func,  # integer-answer reward function
+                self.correctness_reward_func, # reward function based on output correctness
+                self.semantic_correctness_reward_func, # semantic correctness reward function
+                self.reasoning_quality_reward_func,  # reasoning quality reward function
+                self.combined_reward_func,  # combined reward function
+            ],   # list of reward functions that decides how good an output is; in GRPO training, reward functions evaluate the quality of model outputs
+            args=training_args, # training hyperparameters defined above
+            train_dataset=train_dataset, # training dataset
         )
 
         trainer.train()
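The reward functions wired into GRPOTrainer above follow TRL's GRPO interface: each one receives the generated completions and returns one score per completion. A minimal sketch of such a function is shown below; the <reasoning>/<answer> tags are an assumption for illustration, not necessarily this repository's exact output format.

```python
# Minimal sketch of a GRPO-style reward function (TRL interface:
# it receives the generated completions and returns one float per completion).
# The <reasoning>/<answer> tags are illustrative assumptions.
import re

def soft_format_reward(completions, **kwargs):
    """Score 0.5 if a completion contains <reasoning>...</reasoning> followed by <answer>...</answer>."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    scores = []
    for completion in completions:
        # With conversational datasets each completion is a list of chat messages.
        text = completion[0]["content"] if isinstance(completion, list) else completion
        scores.append(0.5 if re.search(pattern, text, re.DOTALL) else 0.0)
    return scores
```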

+ 1 - 1
src/unsloth_compiled_cache/UnslothAlignPropTrainer.py

@@ -120,7 +120,7 @@ class UnslothAlignPropConfig(AlignPropConfig):
     )
     def __init__(
         self,
-        exp_name = 'train_model_grpo_v1',
+        exp_name = 'inference',
         run_name = '',
         seed = 3407,
         log_with = None,

+ 1 - 1
src/unsloth_compiled_cache/UnslothDDPOTrainer.py

@@ -136,7 +136,7 @@ class UnslothDDPOConfig(DDPOConfig):
     )
     def __init__(
         self,
-        exp_name = 'train_model_grpo_v1',
+        exp_name = 'inference',
         run_name = '',
         seed = 3407,
         log_with = None,

BIN
src/unsloth_compiled_cache/__pycache__/UnslothAlignPropTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothBCOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothCPOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothDDPOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothDPOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothGKDTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothGRPOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothKTOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothNashMDTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothORPOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothOnlineDPOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothPPOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothPRMTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothRLOOTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothRewardTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothSFTTrainer.cpython-311.pyc


BIN
src/unsloth_compiled_cache/__pycache__/UnslothXPOTrainer.cpython-311.pyc