HuggingFace镜像/swinv2_tiny_window16_256.ms_in1k
模型介绍 | 文件和版本 | 分析
下载使用量:0

swinv2_tiny_window16_256.ms_in1k 模型卡片

一个 Swin Transformer V2 图像分类模型。由论文作者在 ImageNet-1k 上进行预训练。

模型详情

  • 模型类型: 图像分类 / 特征骨干网络
  • 模型统计:
    • 参数(M):28.3
    • GMACs:6.7
    • 激活值(M):39.0
    • 图像尺寸:256 x 256
  • 论文:
    • Swin Transformer V2: Scaling Up Capacity and Resolution(Swin Transformer V2:提升容量与分辨率):https://arxiv.org/abs/2111.09883
  • 原始地址: https://github.com/microsoft/Swin-Transformer
  • 数据集: ImageNet-1k

模型使用

图像分类

import torch
import torch_npu
import argparse
from openmind import pipeline, is_torch_npu_available
from PIL import Image
import requests
import timm

def parse_args():
    """Parse the command-line options of this example script.

    Returns:
        argparse.Namespace: carries ``model_name_or_path``, the value given
        to ``--model_name_or_path`` (``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="Path to model",
    )
    return cli.parse_args()

def main():
    """Classify a sample COCO image and print the top-5 class indices.

    The model name/path comes from ``--model_name_or_path``. Inference runs
    on ``npu:0`` when ``is_torch_npu_available()`` reports an NPU and on the
    CPU otherwise.

    Raises:
        ValueError: if ``--model_name_or_path`` was not supplied.
        requests.HTTPError: if downloading the sample image fails.
    """
    args = parse_args()
    model_path = args.model_name_or_path
    if not model_path:
        # Fail early with a clear message instead of an opaque error from timm.
        raise ValueError("--model_name_or_path is required")

    device = "npu:0" if is_torch_npu_available() else "cpu"

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # A timeout keeps the example from hanging on a stalled connection, and
    # raise_for_status() surfaces HTTP errors before PIL tries to decode them.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    img = Image.open(response.raw)

    model = timm.create_model(model_path, pretrained=True).to(device)
    model = model.eval()

    # get model specific transforms (normalization, resize)
    data_config = timm.data.resolve_model_data_config(model)
    transforms = timm.data.create_transform(**data_config, is_training=False)

    # unsqueeze single image into batch of 1
    batch = transforms(img).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        output = model(batch)

    # Softmax scaled to percentages; topk returns (values, indices).
    top5_probabilities, top5_class_indices = torch.topk(output.softmax(dim=1) * 100, k=5)
    print(top5_class_indices)

# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

特征图提取

import torch
import torch_npu
import argparse
from openmind import pipeline, is_torch_npu_available
from PIL import Image
import requests
import timm

def parse_args():
    """Parse the command-line options of this example script.

    Returns:
        argparse.Namespace: carries ``model_name_or_path``, the value given
        to ``--model_name_or_path`` (``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="Path to model",
    )
    return cli.parse_args()



def main():
    """Extract feature maps from a sample COCO image and print their shapes.

    The model name/path comes from ``--model_name_or_path``. The model is
    created with ``features_only=True`` so that calling it yields one tensor
    per feature level instead of classification logits. Inference runs on
    ``npu:0`` when ``is_torch_npu_available()`` reports an NPU and on the CPU
    otherwise.

    Raises:
        ValueError: if ``--model_name_or_path`` was not supplied.
        requests.HTTPError: if downloading the sample image fails.
    """
    args = parse_args()
    model_path = args.model_name_or_path
    if not model_path:
        # Fail early with a clear message instead of an opaque error from timm.
        raise ValueError("--model_name_or_path is required")

    device = "npu:0" if is_torch_npu_available() else "cpu"

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # A timeout keeps the example from hanging on a stalled connection, and
    # raise_for_status() surfaces HTTP errors before PIL tries to decode them.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    img = Image.open(response.raw)

    # features_only=True is required here: without it the model returns a
    # single logits tensor and the loop below would not see feature maps.
    model = timm.create_model(
        model_path,
        pretrained=True,
        features_only=True,
    ).to(device)
    model = model.eval()

    # get model specific transforms (normalization, resize)
    data_config = timm.data.resolve_model_data_config(model)
    transforms = timm.data.create_transform(**data_config, is_training=False)

    # unsqueeze single image into batch of 1
    batch = transforms(img).unsqueeze(0).to(device)
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        output = model(batch)

    for o in output:
        # print shape of each feature map in output
        # e.g. for swin_base_patch4_window7_224 (NHWC output)
        #  torch.Size([1, 56, 56, 128])
        #  torch.Size([1, 28, 28, 256])
        #  torch.Size([1, 14, 14, 512])
        #  torch.Size([1, 7, 7, 1024])
        # e.g. for swinv2_cr_small_ns_224 (NCHW output)
        #  torch.Size([1, 96, 56, 56])
        #  torch.Size([1, 192, 28, 28])
        #  torch.Size([1, 384, 14, 14])
        #  torch.Size([1, 768, 7, 7])
        print(o.shape)


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

图像嵌入

import torch
import torch_npu
import argparse
from openmind import pipeline, is_torch_npu_available
from PIL import Image
import requests
import timm

def parse_args():
    """Parse the command-line options of this example script.

    Returns:
        argparse.Namespace: carries ``model_name_or_path``, the value given
        to ``--model_name_or_path`` (``None`` when the flag is omitted).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help="Path to model",
    )
    return cli.parse_args()



def main():
    """Compute image embeddings for a sample COCO image.

    The model name/path comes from ``--model_name_or_path``. The model is
    created with ``num_classes=0`` so that calling it returns pooled features
    rather than class logits; the same result is then recomputed through
    ``forward_features`` followed by ``forward_head(..., pre_logits=True)``.
    Inference runs on ``npu:0`` when ``is_torch_npu_available()`` reports an
    NPU and on the CPU otherwise.

    Raises:
        ValueError: if ``--model_name_or_path`` was not supplied.
        requests.HTTPError: if downloading the sample image fails.
    """
    args = parse_args()
    model_path = args.model_name_or_path
    if not model_path:
        # Fail early with a clear message instead of an opaque error from timm.
        raise ValueError("--model_name_or_path is required")

    device = "npu:0" if is_torch_npu_available() else "cpu"

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # A timeout keeps the example from hanging on a stalled connection, and
    # raise_for_status() surfaces HTTP errors before PIL tries to decode them.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    img = Image.open(response.raw)

    model = timm.create_model(
        model_path,
        pretrained=True,
        num_classes=0,  # remove classifier nn.Linear
    ).to(device)
    model = model.eval()

    # get model specific transforms (normalization, resize)
    data_config = timm.data.resolve_model_data_config(model)
    transforms = timm.data.create_transform(**data_config, is_training=False)

    # unsqueeze single image into batch of 1; built once and placed on the
    # model's device so every forward call below sees a matching input
    batch = transforms(img).unsqueeze(0).to(device)

    with torch.no_grad():  # inference only: skip autograd bookkeeping
        output = model(batch)
        # output is (batch_size, num_features) shaped tensor

        # or equivalently (without needing to set num_classes=0)

        output = model.forward_features(batch)
        # output is unpooled, i.e. a (batch_size, H, W, num_features) tensor for swin / swinv2
        # or (batch_size, num_features, H, W) for swinv2_cr

        output = model.forward_head(output, pre_logits=True)
        # output is (batch_size, num_features) tensor


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

模型对比

在 timm 的模型结果(model results:https://github.com/huggingface/pytorch-image-models/tree/main/results)中探索此模型的数据集和运行时指标。

引用

@inproceedings{liu2021swinv2,
  title={Swin Transformer V2: Scaling Up Capacity and Resolution}, 
  author={Ze Liu and Han Hu and Yutong Lin and Zhuliang Yao and Zhenda Xie and Yixuan Wei and Jia Ning and Yue Cao and Zheng Zhang and Li Dong and Furu Wei and Baining Guo},
  booktitle={International Conference on Computer Vision and Pattern Recognition (CVPR)},
  year={2022}
}
@misc{rw2019timm,
  author = {Ross Wightman},
  title = {PyTorch Image Models},
  year = {2019},
  publisher = {GitHub},
  journal = {GitHub repository},
  doi = {10.5281/zenodo.4414861},
  howpublished = {\url{https://github.com/huggingface/pytorch-image-models}}
}