    output_dir="./distilled",
    temperature=2.0,   # softens the probability distributions
    alpha_ce=0.5,      # weight of the cross-entropy (soft-target) loss
    alpha_mse=0.5      # weight of the hidden-layer MSE loss
)

trainer = DistillationTrainer(
    teacher=teacher,
    student=student,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer
)
trainer.train()
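The DistillationTrainer and its argument class are not part of the transformers library, so below is a minimal sketch of what they might look like, assuming the parameter names shown above (temperature, alpha_ce, alpha_mse) and a loss that mixes a temperature-softened KL term with a last-hidden-state MSE term; the article's actual implementation may differ.

# Sketch only: class/field names and the loss combination are assumptions
# inferred from the call site above, not the library's or the article's API.
import torch
import torch.nn.functional as F
from dataclasses import dataclass, field
from transformers import Trainer, TrainingArguments


@dataclass
class DistillationTrainingArguments(TrainingArguments):
    temperature: float = field(default=2.0)   # softens teacher/student distributions
    alpha_ce: float = field(default=0.5)      # weight of the soft cross-entropy (KL) term
    alpha_mse: float = field(default=0.5)     # weight of the hidden-state MSE term


class DistillationTrainer(Trainer):
    def __init__(self, teacher=None, student=None, **kwargs):
        super().__init__(model=student, **kwargs)
        # Teacher stays frozen; move it to the same device the Trainer will use
        self.teacher = teacher.eval().to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        T = self.args.temperature
        # Student forward pass (hidden states are needed for the MSE term)
        student_out = model(**inputs, output_hidden_states=True)
        with torch.no_grad():
            teacher_out = self.teacher(**inputs, output_hidden_states=True)

        # Soft-target loss: KL divergence between temperature-softened distributions,
        # scaled by T^2 to keep gradient magnitudes comparable
        kd_loss = F.kl_div(
            F.log_softmax(student_out.logits / T, dim=-1),
            F.softmax(teacher_out.logits / T, dim=-1),
            reduction="batchmean",
        ) * (T ** 2)

        # Hidden-state loss: MSE on the last hidden layer
        # (assumes teacher and student share the same hidden size)
        mse_loss = F.mse_loss(
            student_out.hidden_states[-1], teacher_out.hidden_states[-1]
        )

        loss = self.args.alpha_ce * kd_loss + self.args.alpha_mse * mse_loss
        return (loss, student_out) if return_outputs else loss

A higher temperature spreads the teacher's probability mass over more classes, which is why the KL term is rescaled by T^2; setting alpha_ce and alpha_mse to 0.5 each, as in the snippet above, weights the two signals equally.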