import torch
from unsloth import FastLanguageModel
from transformers import TextStreamer
from conf_train import load_config
class ModelInference:
    def __init__(self, model_path, max_seq_length, dtype, load_in_4bit):
        self.model_path = model_path
        self.max_seq_length = max_seq_length
        self.dtype = dtype
        self.load_in_4bit = load_in_4bit
        self.model = None
        self.tokenizer = None
        self.lora_rank = 64
    def load_model(self):
        # Load the fine-tuned model and tokenizer
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_path,
            max_seq_length=self.max_seq_length,
            # True: load with 4-bit quantization, which cuts memory use roughly 4x and
            # fits on a free 16 GB GPU, at the cost of ~1-2% accuracy. False: LoRA in
            # 16-bit; use this on a larger GPU (e.g. H100) if you want that extra accuracy.
            load_in_4bit=self.load_in_4bit,
            dtype=self.dtype,
            fast_inference=False,           # set True to enable vLLM fast inference
            max_lora_rank=self.lora_rank,
            gpu_memory_utilization=0.6,     # reduce if you run out of GPU memory
        )

        # Switch the model into inference mode
        self.model = FastLanguageModel.for_inference(self.model)

        print("Model and tokenizer loaded successfully.")
    def chat(self):
        # Interactive loop with the model
        print("Start chatting with the model (type 'exit' to stop)!")
        while True:
            user_input = input("You: ")
            if user_input.lower() == "exit":
                print("Exiting chat.")
                break

            # Encode the user input and move it to the GPU
            inputs = self.tokenizer(user_input, return_tensors="pt",
                                    max_length=self.max_seq_length, truncation=True)
            inputs = inputs.to("cuda")

            # Generate a reply, streaming tokens to stdout as they are produced
            with torch.no_grad():
                text_streamer = TextStreamer(self.tokenizer, skip_prompt=True)
                outputs = self.model.generate(**inputs, streamer=text_streamer,
                                              max_length=self.max_seq_length,
                                              pad_token_id=self.tokenizer.eos_token_id)

            # Decode only the newly generated tokens (skip the echoed prompt)
            prompt_length = inputs["input_ids"].shape[1]
            model_response = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
            print(f"AI: {model_response}")
if __name__ == "__main__":
    # Load configuration
    config = load_config()

    # Inference parameters
    model_path = config.save_path
    max_seq_length = 2048
    dtype = torch.float16
    load_in_4bit = True

    # Initialize ModelInference
    inference = ModelInference(model_path, max_seq_length, dtype, load_in_4bit)

    # Load the model and tokenizer, then start the interactive chat
    inference.load_model()
    inference.chat()
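
# ---------------------------------------------------------------------------
# Note: the script above imports load_config from a local conf_train module
# that is not shown here and only reads config.save_path from it. Below is a
# minimal sketch of what that module might look like; the dataclass name and
# the default save_path are assumptions for illustration, not the original
# project's actual configuration.
# --- conf_train.py (separate module, hypothetical minimal version) ---------

from dataclasses import dataclass

@dataclass
class TrainConfig:
    # Directory where the fine-tuned model / LoRA adapters were saved (illustrative default)
    save_path: str = "outputs/finetuned_model"

def load_config() -> TrainConfig:
    # The real project may read this from a YAML/JSON file; a constant is enough here
    return TrainConfig()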