CvT-13模型在ImageNet-1k数据集上以224x224分辨率进行预训练。该模型由Wu等人在论文[CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808)中提出,并首次在[microsoft/CvT仓库](https://github.com/microsoft/CvT)中发布。
免责声明:发布CvT的团队未为此模型编写模型卡片,因此本模型卡片由Hugging Face团队编写。
以下示例展示如何使用此模型,将COCO 2017数据集中的一张图像分类为1000个ImageNet类别之一:
from openmind import AutoFeatureExtractor
from transformers import CvtForImageClassification
from openmind import is_torch_npu_available
from openmind_hub import snapshot_download
from PIL import Image
import requests
import warnings
import argparse
def parse_args(argv=None):
    """Parse the command-line options of the CvT-13 classification demo.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which
            matches the behaviour of calling this function with no arguments.

    Returns:
        argparse.Namespace with one attribute, ``model_name_or_path``, holding
        the string given on the command line or ``None`` when the option is
        omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to model",
        default=None,
    )
    return parser.parse_args(argv)
def main():
    """Classify one COCO 2017 validation image with CvT-13 and print the label.

    Steps:
        1. Parse the command line (``--model_name_or_path``).
        2. Download the sample image over HTTP.
        3. Pick the NPU when ``is_torch_npu_available()`` reports one,
           otherwise the CPU.
        4. Load the feature extractor and model from the given path, or from
           a snapshot of ``ChongqingAscend/cvt-13`` when no path is given.
        5. Run a single forward pass and print the predicted class name.

    Raises:
        requests.HTTPError: If the image server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Imported here so the file's top-level import block stays untouched.
    import io

    import torch

    warnings.filterwarnings("ignore")

    # Parse the CLI before touching the network so that `--help` or an invalid
    # argument exits immediately instead of after the image download.
    args = parse_args()

    url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
    # A timeout keeps the script from hanging forever on a stalled connection;
    # raise_for_status() stops an HTML error page from reaching PIL.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Decode from the fully downloaded body (rather than the raw socket
    # stream) and normalise to 3-channel RGB.
    image = Image.open(io.BytesIO(response.content)).convert("RGB")

    device = "npu" if is_torch_npu_available() else "cpu"

    if args.model_name_or_path:
        model_path = args.model_name_or_path
    else:
        # No local path given: fetch the model snapshot from the hub.
        # The ignore patterns skip weight files with these extensions
        # (presumably non-PyTorch checkpoint formats -- TODO confirm).
        model_path = snapshot_download(
            "ChongqingAscend/cvt-13",
            revision="main",
            resume_download=True,
            ignore_patterns=["*.h5", "*.ot", "*.msgpack"],
        )

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)
    model = CvtForImageClassification.from_pretrained(model_path).to(device)
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    # model predicts one of the 1000 ImageNet classes
    predicted_class_idx = logits.argmax(-1).item()
    print("Predicted class:", model.config.id2label[predicted_class_idx])
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()