import argparse
import torch
from openmind import is_torch_npu_available
from transformers import AutoTokenizer, AutoModelForCausalLM
def parse_args(argv=None):
    """Parse the script's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the script entry point relies on.

    Returns:
        argparse.Namespace: Holds ``model_name_or_path`` (a string, or
        ``None`` when the flag is not supplied).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to model",
        default=None,
    )
    return parser.parse_args(argv)
def main():
    """Generate and print a short sampled completion from a causal LM.

    The model and tokenizer are loaded from ``--model_name_or_path`` when it
    is given, otherwise from the hub id ``Rose/Qwen2.5-14B-Instruct-4bit``.
    The model is placed on ``npu:0`` when ``is_torch_npu_available()`` is
    true and on the CPU otherwise. Up to 48 new tokens are sampled from the
    prompt ``"Gra"`` and the decoded sequence is printed to stdout.
    """
    args = parse_args()

    # Honor the CLI flag: previously the resolved path was computed but never
    # passed to from_pretrained, so the hub id below was always loaded.
    model_path = args.model_name_or_path or "Rose/Qwen2.5-14B-Instruct-4bit"
    device = "npu:0" if is_torch_npu_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

    # Move the encoded prompt to wherever the model actually lives.
    encoded = tokenizer("Gra", return_tensors="pt").to(model.device)
    input_ids = encoded["input_ids"]

    output = model.generate(
        input_ids,
        max_new_tokens=48,
        do_sample=True,
        temperature=0.7,
    )
    print(tokenizer.decode(output[0]))
# Script entry point: run the generation demo only when executed directly.
if __name__ == "__main__":
    main()

# NOTE: The model mlx-community/Qwen2.5-14B-Instruct-4bit was converted to MLX
# format from Qwen/Qwen2.5-14B-Instruct using mlx-lm version 0.18.1.
# Usage with mlx-lm (install first):
#
#     pip install mlx-lm
#
# Then, in Python:
#
#     from mlx_lm import load, generate
#     model, tokenizer = load("mlx-community/Qwen2.5-14B-Instruct-4bit")
#     response = generate(model, tokenizer, prompt="hello", verbose=True)