表 1 版本配套表
| 配套 | 版本 | 环境准备指导 |
|---|---|---|
| Python | 3.10 or 3.11 | - |
| torch | 2.1.0 | - |
# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构,{soc}表示昇腾AI处理器的版本。
chmod +x ./Ascend-cann-toolkit_{version}_linux-{arch}.run
chmod +x ./Ascend-cann-kernels-{soc}_{version}_linux.run
# 校验软件包安装文件的一致性和完整性
./Ascend-cann-toolkit_{version}_linux-{arch}.run --check
./Ascend-cann-kernels-{soc}_{version}_linux.run --check
# 安装
./Ascend-cann-toolkit_{version}_linux-{arch}.run --install
./Ascend-cann-kernels-{soc}_{version}_linux.run --install
# 设置环境变量
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构。
chmod +x ./Ascend-mindie_${version}_linux-${arch}.run
./Ascend-mindie_${version}_linux-${arch}.run --check
# 方式一:默认路径安装
./Ascend-mindie_${version}_linux-${arch}.run --install
# 设置环境变量
cd /usr/local/Ascend/mindie && source set_env.sh
# 方式二:指定路径安装
./Ascend-mindie_${version}_linux-${arch}.run --install-path=${AieInstallPath}
# 设置环境变量
cd ${AieInstallPath}/mindie && source set_env.sh
下载 pytorch_v{pytorchversion}_py{pythonversion}.tar.gz
tar -xzvf pytorch_v{pytorchversion}_py{pythonversion}.tar.gz
# 解压后,会有whl包
pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl
https://huggingface.co/Qwen/Qwen-Image
# 1. 下载代码
git clone https://modelers.cn/MindIE/Qwen-Image.git && cd Qwen-Image
# 2. python相关依赖安装
pip install diffusers==0.35.1
pip install transformers==4.52.4
pip install yunchang==0.6.0
# 3. 用 Python 获取 diffusers 的安装目录
DIFFUSERS_PATH=$(python -c "import diffusers; import os; print(os.path.dirname(diffusers.__file__))")
# 4. 替换pipeline_qwenimage文件
cp -r pipeline_qwenimage.py "$DIFFUSERS_PATH/pipelines/qwenimage/pipeline_qwenimage.py" # 文生图场景
cp -r pipeline_qwenimage_img2img.py "$DIFFUSERS_PATH/pipelines/qwenimage/pipeline_qwenimage_img2img.py" # 图生图场景
cp -r pipeline_qwenimage_inpaint.py "$DIFFUSERS_PATH/pipelines/qwenimage/pipeline_qwenimage_inpaint.py" # inpainting场景
# 5. 替换transformer_qwenimage文件
cp -r transformer_qwenimage.py "$DIFFUSERS_PATH/models/transformers/transformer_qwenimage.py"
文生图执行命令:
export ROPE_FUSE=1
export ADALN_FUSE=1
python run.py \
--model_path ./Qwen-Image \
--device_id 0 \
--aspect_ratio "16:9" \
--num_inference_steps 50 \
--prompt_file "prompts.txt" \
--prompt_lang "en"
参数说明:
文生图执行命令:
export ROPE_FUSE=1
export ADALN_FUSE=1
export COND_CACHE=1
export UNCOND_CACHE=1
python run.py \
--model_path ./Qwen-Image \
--device_id 0 \
--aspect_ratio "16:9" \
--num_inference_steps 50 \
--prompt_file "prompts.txt" \
--prompt_lang "en"
参数说明:
执行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the quantization description JSON; no leading space inside the quotes, otherwise the value is no longer an absolute path.
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
export LCCL_DETERMINISTIC=true
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=true
export CLOSE_MATMUL_K_SHIFT=true
# 等价优化 需要时开启
# export ROPE_FUSE=1
# export ADALN_FUSE=1
# cache算法优化 需要时开启
# export COND_CACHE=1
# export UNCOND_CACHE=1
# 8卡 cfg=2 ulysses=4
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
torchrun --nproc_per_node=8 --master-port 29508 run_cfg_usp.py \
--model_path ${model_path} \
--prompt_file prompts.txt \
--num_inference_steps 50 \
--seed 42 \
--output_dir "./base/baseline_cfg2_ulysses4_optimize_1+2" \
--ulysses_size 4 \
--cfg_size 2 \
# --quant_desc_path ${quant_desc_path}
参数说明:
执行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the quantization description JSON; no leading space inside the quotes, otherwise the value is no longer an absolute path.
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
export LCCL_DETERMINISTIC=true
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=true
export CLOSE_MATMUL_K_SHIFT=true
# 等价优化 需要时开启
# export ROPE_FUSE=1
# export ADALN_FUSE=1
# cache算法优化 需要时开启
# export COND_CACHE=1
# export UNCOND_CACHE=1
# 16卡 cfg=2 ulysses=8
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
# 16-card text-to-image run: 16 devices are visible and cfg_size (2) x ulysses_size (8) = 16,
# so torchrun launches 16 processes, just as the 8-card example launches 8 for cfg=2 x ulysses=4.
torchrun --nproc_per_node=16 --master-port 29508 run_cfg_usp.py \
  --model_path "${model_path}" \
  --prompt_file prompts.txt \
  --prompt_lang "en" \
  --aspect_ratio "16:9" \
  --num_inference_steps 50 \
  --seed 42 \
  --output_dir "./base/baseline_cfg2_ulysses8_optimize_1+2" \
  --ulysses_size 8 \
  --cfg_size 2 \
  # --quant_desc_path "${quant_desc_path}"
参数说明:
执行命令:
export ROPE_FUSE=1
export ADALN_FUSE=1
python run_img2img.py \
--model_path ./Qwen-Image \
--device_id 0
参数说明:
执行命令:
export ROPE_FUSE=1
export ADALN_FUSE=1
export COND_CACHE=1
export UNCOND_CACHE=1
python run_img2img.py \
--model_path ./Qwen-Image \
--device_id 0
参数说明:
执行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the w8a8 quantization description JSON; the directory matches the
# --quant_save_dir used by the w8a8 quantization command (quant_w8a8_dynamic_...).
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
export LCCL_DETERMINISTIC=true
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=true
export CLOSE_MATMUL_K_SHIFT=true
# 等价优化 需要时开启
# export ROPE_FUSE=1
# export ADALN_FUSE=1
# cache算法优化 需要时开启
# export COND_CACHE=1
# export UNCOND_CACHE=1
# 8卡 cfg=2 ulysses=4
# export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ASCEND_RT_VISIBLE_DEVICES=8,9,10,11,12,13,14,15
torchrun --nproc_per_node=8 --master-port 29508 run_img2img_cfg_usp.py \
--model_path ${model_path} \
--img_path ./sketch-mountains-input.jpg \
--prompt_file ./img2img_prompts.txt \
--num_inference_steps 50 \
--seed 42 \
--output_dir "./img2img_baseline_cfg2_ulysses4_optimize_1+2" \
--ulysses_size 4 \
--cfg_size 2 \
# --quant_desc_path ${quant_desc_path}
参数说明:
执行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the w8a8 quantization description JSON; the directory matches the
# --quant_save_dir used by the w8a8 quantization command (quant_w8a8_dynamic_...).
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
export LCCL_DETERMINISTIC=true
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=true
export CLOSE_MATMUL_K_SHIFT=true
# 等价优化 需要时开启
# export ROPE_FUSE=1
# export ADALN_FUSE=1
# cache算法优化 需要时开启
# export COND_CACHE=1
# export UNCOND_CACHE=1
# 16卡 cfg=2 ulysses=8
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
# 16-card image-to-image run: 16 devices are visible and cfg_size (2) x ulysses_size (8) = 16,
# so torchrun launches 16 processes, just as the 8-card example launches 8 for cfg=2 x ulysses=4.
torchrun --nproc_per_node=16 --master-port 29508 run_img2img_cfg_usp.py \
  --model_path "${model_path}" \
  --img_path ./sketch-mountains-input.jpg \
  --prompt_file ./img2img_prompts.txt \
  --num_inference_steps 50 \
  --seed 42 \
  --output_dir "./img2img_baseline_cfg2_ulysses8_optimize_1+2" \
  --ulysses_size 8 \
  --cfg_size 2 \
  # --quant_desc_path "${quant_desc_path}"
参数说明:
执行命令:
export ROPE_FUSE=1
export ADALN_FUSE=1
python run_inpainting.py \
--model_name ./Qwen-Image \
--device_id 0
参数说明:
执行命令:
export ROPE_FUSE=1
export ADALN_FUSE=1
export COND_CACHE=1
export UNCOND_CACHE=1
python run_inpainting.py \
--model_name ./Qwen-Image \
--device_id 0
参数说明:
执行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the w8a8 quantization description JSON; the directory matches the
# --quant_save_dir used by the w8a8 quantization command (quant_w8a8_dynamic_...).
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
export LCCL_DETERMINISTIC=true
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=true
export CLOSE_MATMUL_K_SHIFT=true
# 算子优化
export ROPE_FUSE=1
export ADALN_FUSE=1
# 算法优化
export COND_CACHE=1
export UNCOND_CACHE=1
# 8卡 cfg=2 ulysses=4
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# export ASCEND_RT_VISIBLE_DEVICES=8,9,10,11,12,13,14,15
torchrun --nproc_per_node=8 --master-port 29508 run_inpainting_cfg_usp.py \
--model_path ${model_path} \
--source_img_path ./img_url.png \
--mask_img_path ./mask_url.png \
--prompt_file ./inpainting_prompts.txt \
--num_inference_steps 50 \
--seed 42 \
--output_dir "./inpainting_baseline_cfg2_ulysses4_optimize_1+2" \
--ulysses_size 4 \
--cfg_size 2 \
# --quant_desc_path ${quant_desc_path}
参数说明:
执行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the w8a8 quantization description JSON; the directory matches the
# --quant_save_dir used by the w8a8 quantization command (quant_w8a8_dynamic_...).
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
export LCCL_DETERMINISTIC=true
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=true
export CLOSE_MATMUL_K_SHIFT=true
# 算子优化
export ROPE_FUSE=1
export ADALN_FUSE=1
# 算法优化
export COND_CACHE=1
export UNCOND_CACHE=1
# 16卡 cfg=2 ulysses=8
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
# 16-card inpainting run: 16 devices are visible and cfg_size (2) x ulysses_size (8) = 16,
# so torchrun launches 16 processes, just as the 8-card example launches 8 for cfg=2 x ulysses=4.
torchrun --nproc_per_node=16 --master-port 29508 run_inpainting_cfg_usp.py \
  --model_path "${model_path}" \
  --source_img_path ./img_url.png \
  --mask_img_path ./mask_url.png \
  --prompt_file ./inpainting_prompts.txt \
  --num_inference_steps 50 \
  --seed 42 \
  --output_dir "./inpainting_baseline_cfg2_ulysses8_optimize_1+2" \
  --ulysses_size 8 \
  --cfg_size 2 \
  # --quant_desc_path "${quant_desc_path}"
参数说明:
本项目新增量化功能,支持权重 8 位(w8)与激活 8 位 / 16 位(a8/a16)的量化组合,可减少模型显存占用并保持推理性能
参考官方README
通过quantization/quant.py脚本生成量化模型及描述文件,需基于原始模型权重进行量化。
执行命令:
export PYTHONPATH=".:$PYTHONPATH" # 添加当前目录到Python搜索路径
model_path="/mnt/weights/Qwen-Image"
# w8a8量化
python quantization/quant.py \
--model_path ${model_path} \
--device_id 0 \
--quant_mode w8a8 \
--w_sym \
--is_dynamic \
--act_method 3 \
--quant_save_dir ./quant_w8a8_dynamic_withoutData_use_disable_quant_layers
参数说明:
执行后,quant_w8a8_dynamic_withoutData_use_disable_quant_layers目录下会生成两个文件:
quant_model_description_w8a8_dynamic.json:量化配置描述文件(包含量化位宽、层映射等元信息)
quant_model_weight_w8a8_dynamic.safetensors:量化后的权重文件(采用safetensors格式,兼容Hugging Face生态)
执行命令:
export PYTHONPATH=".:$PYTHONPATH" # 添加当前目录到Python搜索路径
model_path="/mnt/weights/Qwen-Image"
# w8a16量化
python quantization/quant.py \
--model_name ${model_path} \
--device_id 0 \
--quant_mode w8a16 \
--w_sym \
--act_method 3 \
--quant_save_dir ./quant_w8a16_withoutData_use_disable_quant_layers
参数说明:
执行后,quant_w8a16_withoutData_use_disable_quant_layers目录下会生成两个文件:
quant_model_description_w8a16.json:量化配置描述文件(包含量化位宽、层映射等元信息)
quant_model_weight_w8a16.safetensors:量化后的权重文件(采用safetensors格式,兼容Hugging Face生态)
# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构。
# {version} and {arch} are placeholders to substitute by hand; braces are used instead of
# angle brackets because the shell would parse <version> / <arch> as redirections.
chmod +x ./Ascend-cann-nnal_{version}_linux-{arch}.run
# Install to the default path:
./Ascend-cann-nnal_{version}_linux-{arch}.run --install --torch_atb
# 配置环境变量:
source ${HOME}/Ascend/nnal/atb/set_env.sh
使用量化模型进行推理时,需在原有命令中添加--quant_desc_path参数,指向量化描述文件(quant_model_description_*.json)路径,该路径需要是绝对路径,其余参数与原生模型推理一致。
以w8a8量化为例子,运行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the quantization description JSON; no leading space inside the quotes, otherwise the value is no longer an absolute path.
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
# 算子优化 需要时开启
# export ROPE_FUSE=1
# export ADALN_FUSE=1
# cache 算法优化 需要时开启
# export COND_CACHE=1
# export UNCOND_CACHE=1
# 单卡
python run.py \
--device_id 0 \
--model_path ${model_path} \
--prompt_file "./prompts.txt" \
--prompt_lang "en" \
--aspect_ratio "16:9" \
--num_inference_steps 50 \
--seed 42 \
--output_dir "baseline" \
--quant_desc_path ${quant_desc_path}
以w8a8量化为例,运行命令:
export model_path="/home/weight/Qwen-Image/"
# Absolute path of the quantization description JSON; no leading space inside the quotes, otherwise the value is no longer an absolute path.
export quant_desc_path="/home/Qwen-Image/quant_w8a8_dynamic_withoutData_use_disable_quant_layers/quant_model_description_w8a8_dynamic.json"
export LCCL_DETERMINISTIC=true
export HCCL_DETERMINISTIC=true
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
export ATB_LLM_LCOC_ENABLE=true
export CLOSE_MATMUL_K_SHIFT=true
# 算子优化 需要时开启
# export ROPE_FUSE=1
# export ADALN_FUSE=1
# cache 算法优化 需要时开启
# export COND_CACHE=1
# export UNCOND_CACHE=1
# 8卡 cfg=2 ulysses=4
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# export ASCEND_RT_VISIBLE_DEVICES=8,9,10,11,12,13,14,15
torchrun --nproc_per_node=8 --master-port 29508 run_cfg_usp.py \
--model_path ${model_path} \
--prompt_file prompts.txt \
--prompt_lang "en" \
--aspect_ratio "16:9" \
--num_inference_steps 50 \
--seed 42 \
--output_dir "./base/baseline_cfg2_ulysses4_optimize_1+2" \
--ulysses_size 4 \
--cfg_size 2 \
--quant_desc_path ${quant_desc_path}
--quant_desc_path需指向完整的量化描述文件路径(即quant_model_description_*.json),且该路径要求填写绝对路径。量化权重文件(.safetensors)需与描述文件在同一目录下,否则会提示权重加载失败。
| 模型 | 分辨率 | 迭代次数 | 卡数 | E2E耗时 |
|---|---|---|---|---|
| Qwen-Image | 1664×928 | 50 | 1 | 78s |
| 模型 | 分辨率 | 迭代次数 | 卡数 | E2E耗时 |
|---|---|---|---|---|
| Qwen-Image | 1664×928 | 50 | 1 | 38s |
若使用torch2.2以下版本时,需要注释掉"$DIFFUSERS_PATH/models/attention_dispatch.py"中481行和490行的torch.library接口的使用,否则会由于兼容性问题报错。修改后的代码如下所示:
# Decorator commented out for torch < 2.2, where this torch.library usage fails with a compatibility error.
# @torch.library.custom_op("flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda")
def _wrapped_flash_attn_3_original(
    query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Call the underlying attention function, then swap the last two axes of lse.
    out, lse = flash_attn_3_func(query, key, value)
    lse = lse.permute(0, 2, 1)
    return out, lse


# Decorator commented out for torch < 2.2 for the same compatibility reason.
# @torch.library.register_fake("flash_attn_3::_flash_attn_forward")
def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    # Returns uninitialized tensors only: one shaped like query, one shaped
    # (batch_size, seq_len, num_heads) taken from query's first three dims.
    batch_size, seq_len, num_heads, head_dim = query.shape
    lse_shape = (batch_size, seq_len, num_heads)
    return torch.empty_like(query), query.new_empty(lse_shape)