这是一个基于原始 RoBERTa-base 检查点、使用约 5800 万条推文进行训练的 RoBERTa-base 模型,其技术细节和评估结果详见 TweetEval 基准测试论文(Findings of EMNLP 2020)。
如需在推特特定数据上评估本模型及其他语言模型,请参阅 TweetEval 官方代码库。
请将用户名和链接替换为占位符:"@user" 和 "http"。
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``"@user"``; a token that starts with
    ``http`` becomes ``"http"``. Every other token is kept as is, and the
    tokens are joined back together with single spaces, so runs of spaces in
    the input are preserved.

    Args:
        text: Raw tweet text.

    Returns:
        The text with mentions and links replaced by the placeholders.
    """
    new_text = []
    for t in text.split(" "):
        if t.startswith('@') and len(t) > 1:
            # A bare "@" is not treated as a mention.
            t = '@user'
        elif t.startswith('http'):
            t = 'http'
        new_text.append(t)
    return " ".join(new_text)
import numpy as np
from transformers import pipeline, AutoTokenizer
# Checkpoint identifier passed to both the pipeline and the tokenizer below.
MODEL = "cardiffnlp/twitter-roberta-base"
# Fill-mask pipeline; it is called on each preprocessed example text further down.
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
# Separate tokenizer instance; print_candidates() uses it to decode token ids.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
def print_candidates():
    """Print up to five fill-mask predictions from the global ``candidates``.

    Reads the module-level ``candidates`` sequence (each entry is indexed with
    ``'token'`` and ``'score'``) and the module-level ``tokenizer``. One line
    is printed per entry in the form
    ``"<rank>) <decoded token> <score rounded to 4 decimals>"``, with ranks
    starting at 1.
    """
    # Slice instead of indexing 0..4 so fewer than five candidates do not
    # raise an IndexError.
    for rank, candidate in enumerate(candidates[:5], start=1):
        token = tokenizer.decode(candidate['token'])
        score = np.round(candidate['score'], 4)
        print(f"{rank}) {token} {score}")
# Two prompts that differ only in the trailing emoji.
texts = [
    "I am so <mask> 😊",
    "I am so <mask> 😢"
]
for text in texts:
    t = preprocess(text)
    print(f"{'-'*30}\n{t}")
    # print_candidates() reads this module-level name, so it must be assigned
    # before the call.
    candidates = fill_mask(t)
    print_candidates()
# Output:
------------------------------
I am so <mask> 😊
1) happy 0.402
2) excited 0.1441
3) proud 0.143
4) grateful 0.0669
5) blessed 0.0334
------------------------------
I am so <mask> 😢
1) sad 0.2641
2) sorry 0.1605
3) tired 0.138
4) sick 0.0278
5) hungry 0.0232
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np
from scipy.spatial.distance import cosine
from collections import defaultdict
# Checkpoint identifier. Defined before its first use so this example does not
# rely on an assignment made elsewhere in the file.
MODEL = "cardiffnlp/twitter-roberta-base"
# Tokenizer and encoder used by get_embedding().
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)
def get_embedding(text):
    """Return one vector for ``text`` by averaging the model's outputs.

    The text is normalised with ``preprocess``, tokenised with the
    module-level ``tokenizer`` (PyTorch tensors), and passed to the
    module-level ``model``. Element 0 of the model output is converted to a
    NumPy array, and the first item of that array is averaged over axis 0.

    Args:
        text: Raw tweet text.

    Returns:
        The NumPy array produced by the axis-0 mean.
    """
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    features = model(**encoded_input)
    # Presumably element 0 is the per-token hidden states with shape
    # (batch, tokens, hidden) -- TODO confirm against the transformers docs.
    features = features[0].detach().cpu().numpy()
    # Only one text is encoded, so take the first item and average over axis 0.
    features_mean = np.mean(features[0], axis=0)
    return features_mean
# Checkpoint identifier for this example.
MODEL = "cardiffnlp/twitter-roberta-base"

query = "The book was awesome"
tweets = ["I just ordered fried chicken 🐣",
          "The movie was great",
          "What time is the next game?",
          "Just finished reading 'Embeddings in NLP'"]

# The query embedding is the same for every tweet, so compute it once.
query_embedding = get_embedding(query)

# Map each tweet to 1 - cosine(query, tweet); higher means more similar.
d = defaultdict(int)
for tweet in tweets:
    sim = 1 - cosine(query_embedding, get_embedding(tweet))
    d[tweet] = sim

print('Most similar to: ',query)
print('----------------------------------------')
# Print the tweets from highest to lowest score with a 1-based rank.
for idx, x in enumerate(sorted(d.items(), key=lambda x: x[1], reverse=True)):
    print(idx+1,x[0])
# Output:
Most similar to: The book was awesome
----------------------------------------
1 The movie was great
2 Just finished reading 'Embeddings in NLP'
3 I just ordered fried chicken 🐣
4 What time is the next game?
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np
# Checkpoint identifier used for both the tokenizer and the model below.
MODEL = "cardiffnlp/twitter-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Example input; mentions and links are replaced by placeholders before encoding.
text = "Good night 😊"
text = preprocess(text)
# PyTorch variant
model = AutoModel.from_pretrained(MODEL)
encoded_input = tokenizer(text, return_tensors='pt')
features = model(**encoded_input)
# Element 0 of the model output, converted to a NumPy array.
# Presumably shaped (batch, tokens, hidden) -- TODO confirm.
features = features[0].detach().cpu().numpy()
# Average the first item over axis 0 to obtain a single feature vector.
features_mean = np.mean(features[0], axis=0)
# Alternative pooling (disabled): maximum over axis 0 instead of the mean.
#features_max = np.max(features[0], axis=0)
# TensorFlow variant (disabled); it mirrors the PyTorch steps above.
# # Tensorflow
# model = TFAutoModel.from_pretrained(MODEL)
# encoded_input = tokenizer(text, return_tensors='tf')
# features = model(encoded_input)
# features = features[0].numpy()
# features_mean = np.mean(features[0], axis=0)
# #features_max = np.max(features[0], axis=0)
若使用本模型,请引用参考论文。
@inproceedings{barbieri-etal-2020-tweeteval,
title = "{T}weet{E}val: Unified Benchmark and Comparative Evaluation for Tweet Classification",
author = "Barbieri, Francesco and
Camacho-Collados, Jose and
Espinosa Anke, Luis and
Neves, Leonardo",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.148",
doi = "10.18653/v1/2020.findings-emnlp.148",
pages = "1644--1650"
}