import os

import torch
from unsloth import FastLanguageModel


class ModelQuantizer:
    """Load a fine-tuned causal LM and apply post-training dynamic quantization.

    Dynamic (weight-only int8) quantization targets the ``nn.Linear`` layers,
    which dominate parameter count in transformer models.
    """

    def __init__(self, model_path):
        """Remember the path of the model to quantize.

        Args:
            model_path: Directory or hub identifier accepted by
                ``FastLanguageModel.from_pretrained``.
        """
        self.model_path = model_path

    def quantize(self):
        """Load the model and return an int8 dynamically-quantized copy.

        Returns:
            tuple: ``(quantized_model, tokenizer)``.
        """
        model, tokenizer = FastLanguageModel.from_pretrained(self.model_path)

        # Dynamic quantization is CPU-only and assumes inference mode:
        # move the model off any accelerator and freeze dropout/batch-norm
        # behavior before converting.
        model = model.cpu().eval()

        # torch.ao.quantization is the current home of the quantization API
        # (torch.quantization is a deprecated alias).
        quantized_model = torch.ao.quantization.quantize_dynamic(
            model,
            {torch.nn.Linear},  # quantize only the Linear layers (weight-only int8)
            dtype=torch.qint8,
        )
        return quantized_model, tokenizer

    def save_quantized_model(self, model, save_path, tokenizer=None):
        """Persist the quantized model (and optionally its tokenizer).

        Args:
            model: The quantized model to save.
            save_path: Destination directory.
            tokenizer: If given, saved to the same directory so the artifact
                is self-contained. Defaults to None for backward compatibility.

        NOTE(review): ``save_pretrained`` on a dynamically-quantized model may
        not round-trip the quantized weights — verify the saved artifact loads
        correctly, or consider ``torch.save(model.state_dict(), ...)`` instead.
        """
        model.save_pretrained(save_path)
        if tokenizer is not None:
            tokenizer.save_pretrained(save_path)


if __name__ == "__main__":
    model_path = os.path.join('..', 'models', 'deepseek-r1-distill-1.5B-finetuned')
    quantizer = ModelQuantizer(model_path)
    quantized_model, tokenizer = quantizer.quantize()
    # Save tokenizer alongside the model so the output directory is usable on its own.
    quantizer.save_quantized_model(
        quantized_model,
        os.path.join('..', 'models', 'deepseek-r1-distill-1.5B-quantized'),
        tokenizer=tokenizer,
    )