from openmind import pipeline, is_torch_npu_available
from openmind import AutoTokenizer, AutoModelForCausalLM
from openmind_hub import snapshot_download
import torch.nn.functional as F
from torch import Tensor
import openmind
import torch
import argparse
import time
def parse_args(argv=None):
    """Parse the command-line options for the inference demo.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            calling ``parse_args()`` with no arguments behaves as before.

    Returns:
        argparse.Namespace: carries one attribute, ``model_name_or_path``,
        which defaults to ``"jeffding/ChatTCM-7B-Pretrain-openmind"``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        help="Path to model",
        default="jeffding/ChatTCM-7B-Pretrain-openmind",
    )
    return parser.parse_args(argv)
def main():
    """Load the model, generate text for one fixed prompt, and print timing.

    Reads the model location from the command line via ``parse_args()``,
    selects ``"npu:0"`` when ``is_torch_npu_available()`` is true and
    ``"cpu"`` otherwise, builds a ``text-generation`` pipeline, prints the
    generated text, then prints the device and the elapsed seconds.

    The timed region starts before the pipeline is constructed, so the
    reported duration covers pipeline construction plus generation.
    """
    args = parse_args()
    model_path = args.model_name_or_path

    device = "npu:0" if is_torch_npu_available() else "cpu"

    # NOTE(review): trust_remote_code=True presumably lets code shipped in the
    # model repository run locally - only point this at repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, trust_remote_code=True
    ).to(device)

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    text = "胃病的中药调理方法:"
    pipe = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, device=device
    )
    # The pipeline returns a list of results; only the first one is used.
    generated_text = pipe(
        text, max_length=50, do_sample=False, no_repeat_ngram_size=2
    )[0]
    print(generated_text['generated_text'])
    end_time = time.perf_counter()
    print(f"硬件环境:{device},推理执行时间:{end_time - start_time}秒")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
在2张A800-80G上,
基于Qwen/Qwen2-7B-Instruct,在LLaMA-Factory框架上,
使用SylvanL/Traditional-Chinese-Medicine-Dataset-Pretrain进行了2个epoch的继续预训练(Continued Pre-training)。
在保留模型原有通用能力的前提下,使模型熟悉、记住,并更倾向于输出以下内容:
epoch 1:
{
"num_input_tokens_seen": 442925056,
"total_flos": 885678736932864.0,
"train_loss": 1.658593576353242,
"train_runtime": 133293.1729,
"train_samples_per_second": 3.246,
"train_steps_per_second": 0.014
}
average_perplexity: 7.2646328377141005
epoch 2:
{
"num_input_tokens_seen": 442925056,
"total_flos": 885678736932864.0,
"train_loss": 1.3894652060929016,
"train_runtime": 139124.2076,
"train_samples_per_second": 3.11,
"train_steps_per_second": 0.014
}
average_perplexity: 5.800355962033688

# Training command: runs `llamafactory-cli train` in the pre-training stage
# (--stage pt) with full-parameter updates (--finetuning_type full), starting
# from Qwen/Qwen2-7B-Instruct, for 2 epochs in bf16 with a cosine LR schedule.
# Per-device batch size is 28 with 4 gradient-accumulation steps; packing is on.
# NOTE(review): {dataset_dir} and {output_dir} look like placeholders that must
# be replaced with real paths before running - confirm.
# NOTE(review): one --dataset entry is wrapped in braces
# ({shibing624/huatuo_medical_qa_sharegpt}); confirm whether the braces are
# intentional or an annotation that should be removed.
# NOTE(review): the DeepSpeed config is read from a relative path
# (cache/ds_z3_offload_config.json), so the working directory matters - verify.
llamafactory-cli train \
--stage pt \
--do_train True \
--model_name_or_path Qwen/Qwen2-7B-Instruct \
--preprocessing_num_workers 16 \
--finetuning_type full \
--template default \
--flash_attn auto \
--dataset_dir {dataset_dir} \
--dataset CPT_generalMedical_362420,{shibing624/huatuo_medical_qa_sharegpt},CPT_medicalRecord_source1_61486,CPT_medicalRecord_source2_15307,CPT_medicalRecord_source3_230000,CPT_tcmKnowledge_source1_17921,CPT_tcmKnowledge_source2_12889,CPT_tcmBooks_source1_146244 \
--cutoff_len 1024 \
--learning_rate 6e-06 \
--num_train_epochs 2.0 \
--max_samples 1000000 \
--per_device_train_batch_size 28 \
--gradient_accumulation_steps 4 \
--lr_scheduler_type cosine \
--max_grad_norm 1.0 \
--logging_steps 1 \
--save_steps 1000 \
--warmup_steps 0 \
--optim adamw_torch \
--packing True \
--report_to none \
--output_dir {output_dir} \
--bf16 True \
--plot_loss True \
--ddp_timeout 180000000 \
--include_num_input_tokens_seen True \
--deepspeed cache/ds_z3_offload_config.json