表 1 版本配套表
| 配套 | 版本 | 环境准备指导 |
|---|---|---|
| Python | 3.11.10 | - |
| torch | 2.6.0 | - |
# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构,{soc}表示昇腾AI处理器的版本。
chmod +x ./Ascend-cann-toolkit_{version}_linux-{arch}.run
chmod +x ./Ascend-cann-kernels-{soc}_{version}_linux.run
# 校验软件包安装文件的一致性和完整性
./Ascend-cann-toolkit_{version}_linux-{arch}.run --check
./Ascend-cann-kernels-{soc}_{version}_linux.run --check
# 安装
./Ascend-cann-toolkit_{version}_linux-{arch}.run --install
./Ascend-cann-kernels-{soc}_{version}_linux.run --install
# 设置环境变量
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构。
chmod +x ./Ascend-mindie_${version}_linux-${arch}.run
./Ascend-mindie_${version}_linux-${arch}.run --check
# 方式一:默认路径安装
./Ascend-mindie_${version}_linux-${arch}.run --install
# 设置环境变量
cd /usr/local/Ascend/mindie && source set_env.sh
# 方式二:指定路径安装
./Ascend-mindie_${version}_linux-${arch}.run --install-path=${AieInstallPath}
# 设置环境变量
cd ${AieInstallPath}/mindie && source set_env.sh
下载 pytorch_v{pytorchversion}_py{pythonversion}.tar.gz
tar -xzvf pytorch_v{pytorchversion}_py{pythonversion}.tar.gz
# 解压后,会有whl包
pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl

| 模型 | 链接 |
|---|---|
| Qwen-Image | 🤗 huggingface |
| Qwen-Image-2512 | 🤗 huggingface |
| Qwen-Image-Edit | 🤗 huggingface |
| Qwen-Image-Edit-2509 | 🤗 huggingface |
| Qwen-Image-Edit-2511 | 🤗 huggingface |
| Qwen-Image-Layered | 🤗 huggingface |

| 模型 | 链接 |
|---|---|
| Qwen-Image | 🤖 ModelScope |
| Qwen-Image-2512 | 🤖 ModelScope |
| Qwen-Image-Edit | 🤖 ModelScope |
| Qwen-Image-Edit-2509 | 🤖 ModelScope |
| Qwen-Image-Edit-2511 | 🤖 ModelScope |
| Qwen-Image-Layered | 🤖 ModelScope |

| 模型 | 链接 |
|---|---|
| Qwen-Image-Lightning | 🤗 huggingface |
| Qwen-Image-2512-Lightning | 🤗 huggingface |
| Qwen-Image-Edit-2509-Lightning | 🤗 huggingface |
| Qwen-Image-Edit-2511-Lightning | 🤗 huggingface |

| 模型 | 链接 |
|---|---|
| Qwen-Image-Lightning | 🤖 ModelScope |
| Qwen-Image-2512-Lightning | 🤖 ModelScope |
| Qwen-Image-Edit-2509-Lightning | 🤖 ModelScope |
| Qwen-Image-Edit-2511-Lightning | 🤖 ModelScope |

git clone https://modelers.cn/MindIE/Qwen-Image-series.git
cd Qwen-Image-series
pip3 install -r requirements.txt
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# ====================== 任务配置 ======================
TASK="Qwen-Image-2512"
MODEL_PATH="/weights/Qwen-Image-2512"
DEVICE_IDS="0" # 单卡:0;多卡:0,1
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--prompt '''A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition''' \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
--seed 42 \
--output_file "./output/text_to_image_2512.png" \
--vae_tiling \
--vae_slicing
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
export COND_CACHE=1 # 条件缓存
export UNCOND_CACHE=1 # 无条件缓存
export CACHE_STEP_START=10 # 缓存开始步骤
export CACHE_STEP_INTERVAL=3 # 缓存步骤间隔
export CACHE_STEP_END=35 # 缓存结束步骤
export CACHE_BLOCK_START=10 # 缓存开始块
export CACHE_BLOCK_END=50 # 缓存结束块
# ====================== 任务配置 ======================
TASK="Qwen-Image-2512"
MODEL_PATH="/weights/Qwen-Image-2512"
DEVICE_IDS="0" # 单卡:0;多卡:0,1
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--prompt '''A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition''' \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
--seed 42 \
--output_file "./output/text_to_image_2512.png" \
--vae_tiling \
--vae_slicing
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-2512"
MODEL_PATH="/weights/Qwen-Image-2512"
DEVICE_IDS="0,1"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 2卡并行(cfg_size=2 ulysses_size=1 优于 cfg_size=1 ulysses_size=2 )
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--prompt '''A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition''' \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
--seed 42 \
--output_file "./output/text_to_image_2512.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 1
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-2512"
MODEL_PATH="/weights/Qwen-Image-2512"
DEVICE_IDS="0,1,2,3"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 4卡并行(cfg_size=2 ulysses_size=2 优于 cfg_size=1 ulysses_size=4)
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--prompt '''A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition''' \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
--seed 42 \
--output_file "./output/text_to_image_2512.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 2
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-2512"
MODEL_PATH="/weights/Qwen-Image-2512"
DEVICE_IDS="0,1,2,3,4,5,6,7"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 8卡并行(cfg_size=2 ulysses_size=4 优于 cfg_size=1 ulysses_size=8)
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--prompt '''A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition''' \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
--seed 42 \
--output_file "./output/text_to_image_2512.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 4
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-2512"
MODEL_PATH="/weights/Qwen-Image-2512"
DEVICE_IDS="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--prompt '''A coffee shop entrance features a chalkboard sign reading "Qwen Coffee 😊 $2 per cup," with a neon light beside it displaying "通义千问". Next to it hangs a poster showing a beautiful Chinese woman, and beneath the poster is written "π≈3.1415926-53589793-23846264-33832795-02384197". Ultra HD, 4K, cinematic composition''' \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
--seed 42 \
--output_file "./output/text_to_image_2512.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 8
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# ====================== 任务配置 ======================
TASK="Qwen-Image-Edit-2511"
MODEL_PATH="/weights/Qwen-Image-Edit-2511"
DEVICE_IDS="0" # 单卡:0;多卡:0,1
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/yarn-art-pikachu.png" \
--prompt "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant color" \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 40 \
--seed 42 \
--output_file "./output/image_edit_2511.png" \
--vae_tiling \
--vae_slicing
参数说明:
img1 img2
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
export COND_CACHE=1 # 条件缓存
export UNCOND_CACHE=1 # 无条件缓存
export CACHE_STEP_START=10 # 缓存开始步骤
export CACHE_STEP_INTERVAL=3 # 缓存步骤间隔
export CACHE_STEP_END=35 # 缓存结束步骤
export CACHE_BLOCK_START=10 # 缓存开始块
export CACHE_BLOCK_END=50 # 缓存结束块
# ====================== 任务配置 ======================
TASK="Qwen-Image-Edit-2511"
MODEL_PATH="/weights/Qwen-Image-Edit-2511"
DEVICE_IDS="0" # 单卡:0;多卡:0,1
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/yarn-art-pikachu.png" \
--prompt "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant color" \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 40 \
--seed 42 \
--output_file "./output/image_edit_2511.png" \
--vae_tiling \
--vae_slicing
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Edit-2511"
MODEL_PATH="/weights/Qwen-Image-Edit-2511"
DEVICE_IDS="0,1"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 2卡并行(cfg_size=2 ulysses_size=1 优于 cfg_size=1 ulysses_size=2 )
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/yarn-art-pikachu.png" \
--prompt "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant color" \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 40 \
--seed 42 \
--output_file "./output/image_edit_2511_.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 1
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Edit-2511"
MODEL_PATH="/weights/Qwen-Image-Edit-2511"
DEVICE_IDS="0,1,2,3"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 4卡并行(cfg_size=2 ulysses_size=2 优于 cfg_size=1 ulysses_size=4)
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/yarn-art-pikachu.png" \
--prompt "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant color" \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 40 \
--seed 42 \
--output_file "./output/image_edit_2511_.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 2
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Edit-2511"
MODEL_PATH="/weights/Qwen-Image-Edit-2511"
DEVICE_IDS="0,1,2,3,4,5,6,7"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 8卡并行(cfg_size=2 ulysses_size=4 优于 cfg_size=1 ulysses_size=8)
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/yarn-art-pikachu.png" \
--prompt "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant color" \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 40 \
--seed 42 \
--output_file "./output/image_edit_2511_.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 4
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Edit-2511"
MODEL_PATH="/weights/Qwen-Image-Edit-2511"
DEVICE_IDS="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/yarn-art-pikachu.png" \
--prompt "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant color" \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 40 \
--seed 42 \
--output_file "./output/image_edit_2511_.png" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 8
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# ====================== 任务配置 ======================
TASK="Qwen-Image-Layered"
MODEL_PATH="/weights/Qwen-Image-Layered"
DEVICE_IDS="0" # 单卡:0;多卡:0,1
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/2.png" \
--negative_prompt " " \
--num_inference_steps 50 \
--seed 42 \
--layers 3 \
--resolution 640 \
--color_format "RGBA" \
--output_file "./image_layered" \
--vae_tiling \
--vae_slicing
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
export COND_CACHE=1 # 条件缓存
export UNCOND_CACHE=1 # 无条件缓存
export CACHE_STEP_START=10 # 缓存开始步骤
export CACHE_STEP_INTERVAL=3 # 缓存步骤间隔
export CACHE_STEP_END=35 # 缓存结束步骤
export CACHE_BLOCK_START=10 # 缓存开始块
export CACHE_BLOCK_END=50 # 缓存结束块
# ====================== 任务配置 ======================
TASK="Qwen-Image-Layered"
MODEL_PATH="/weights/Qwen-Image-Layered"
DEVICE_IDS="0" # 单卡:0;多卡:0,1
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/2.png" \
--negative_prompt " " \
--num_inference_steps 50 \
--seed 42 \
--layers 3 \
--resolution 640 \
--color_format "RGBA" \
--output_file "./image_layered" \
--vae_tiling \
--vae_slicing
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Layered"
MODEL_PATH="/weights/Qwen-Image-Layered"
DEVICE_IDS="0,1"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 2卡并行(cfg_size=2 ulysses_size=1 优于 cfg_size=1 ulysses_size=2 )
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/2.png" \
--negative_prompt " " \
--num_inference_steps 50 \
--seed 42 \
--layers 3 \
--resolution 640 \
--color_format "RGBA" \
--output_file "./image_layered" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 1
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Layered"
MODEL_PATH="/weights/Qwen-Image-Layered"
DEVICE_IDS="0,1,2,3"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 4卡并行(cfg_size=2 ulysses_size=2 优于 cfg_size=1 ulysses_size=4 )
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/2.png" \
--negative_prompt " " \
--num_inference_steps 50 \
--seed 42 \
--layers 3 \
--resolution 640 \
--color_format "RGBA" \
--output_file "./image_layered" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 2
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Layered"
MODEL_PATH="/weights/Qwen-Image-Layered"
DEVICE_IDS="0,1,2,3,4,5,6,7"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
# 8卡并行(cfg_size=2 ulysses_size=4 优于 cfg_size=1 ulysses_size=8 )
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/2.png" \
--negative_prompt " " \
--num_inference_steps 50 \
--seed 42 \
--layers 3 \
--resolution 640 \
--color_format "RGBA" \
--output_file "./image_layered" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 4
参数说明:
执行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关,0:关闭,1:开启
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# export COND_CACHE=1 # 条件缓存(按需开启)
# export UNCOND_CACHE=1 # 无条件缓存(按需开启)
# ====================== 任务配置 ======================
TASK="Qwen-Image-Layered"
MODEL_PATH="/weights/Qwen-Image-Layered"
DEVICE_IDS="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15"
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/2.png" \
--negative_prompt " " \
--num_inference_steps 50 \
--seed 42 \
--layers 3 \
--resolution 640 \
--color_format "RGBA" \
--output_file "./image_layered" \
--vae_tiling \
--vae_slicing \
--cfg_size 2 \
--ulysses_size 8
参数说明:
新增Qwen-Image、Qwen-Image-2512、Qwen-Image-Edit、Qwen-Image-Edit-2509、Qwen-Image-Edit-2511、Qwen-Image-Layered模型的量化支持:支持权重 8 位(w8)与激活 8 位 / 16 位(a8/a16)的量化组合,针对DiT模型进行量化,以降低显存占用、提高模型推理性能。
git clone https://gitcode.com/Ascend/msit
cd msit/msmodelslim
bash install.sh
通过quant_qwenimage.py脚本生成量化模型及描述文件,需基于原始模型权重进行量化。
以Qwen-Image-Edit-2511模型为例
执行命令:
model_path="/weights/Qwen-Image-Edit-2511"
python quant_qwenimage.py \
--model_path ${model_path} \
--device_id 2 \
--quant_mode w8a8 \
--w_sym \
--is_dynamic \
--act_method 3 \
--quant_save_dir ./quant_w8a8_dynamic_withoutData_use_disable_quant_layers
参数说明:
执行后,quant_w8a8_dynamic_withoutData_use_disable_quant_layers目录下会生成两个文件:
- quant_model_description_w8a8_dynamic.json:量化配置描述文件(包含量化位宽、层映射等元信息)
- quant_model_weight_w8a8_dynamic.safetensors:量化后的权重文件(采用safetensors格式)

执行命令:
model_path="/weights/Qwen-Image-Edit-2511"
python quant_qwenimage.py \
--model_path ${model_path} \
--device_id 0 \
--quant_mode w8a16 \
--w_sym \
--act_method 3 \
--quant_save_dir ./quant_w8a16_withoutData_use_disable_quant_layers
参数说明:
执行后,quant_w8a16_withoutData_use_disable_quant_layers目录下会生成两个文件:
- quant_model_description_w8a16.json:量化配置描述文件(包含量化位宽、层映射等元信息)
- quant_model_weight_w8a16.safetensors:量化后的权重文件(采用safetensors格式)

以Qwen-Image-Edit-2511模型的w8a8量化为例,运行命令:
# ====================== 全局环境变量配置 ======================
export PYTORCH_NPU_ALLOC_CONF='expandable_segments:True'
export ALGO=1 # 0:普通, 1:FA, 2:LA
export OVERLAP=0 # 通信-计算重叠开关
export ROPE_FUSE=1 # RoPE算子融合
export ADALN_FUSE=1 # ADALN算子融合
# ====================== 任务配置 ======================
TASK="Qwen-Image-Edit-2511"
MODEL_PATH="/weights/Qwen-Image-Edit-2511"
DEVICE_IDS="0" # 单卡:0;多卡:0,1
NPROC_PER_NODE=$(echo $DEVICE_IDS | tr ',' '\n' | wc -l) # 自动计算进程数
MASTER_PORT=29508
# ====================== 设备配置 ======================
export ASCEND_RT_VISIBLE_DEVICES=${DEVICE_IDS}
# ====================== 执行推理 ======================
torchrun --nproc_per_node=${NPROC_PER_NODE} --master-port ${MASTER_PORT} generate.py \
--task ${TASK} \
--ckpt_dir ${MODEL_PATH} \
--image "./examples/yarn-art-pikachu.png" \
--prompt "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant color" \
--negative_prompt " " \
--width 1024 \
--height 1024 \
--num_inference_steps 40 \
--seed 42 \
--output_file "./output/image_edit_2511.png" \
--vae_tiling \
--vae_slicing \
--quant_dit_path ./quant_w8a8_dynamic_withoutData_use_disable_quant_layers
参数说明:
上面是我给我的代码工程写的readme,帮我看看有没有问题