事前预备

  1. 抱抱脸 bert 文档 huggingface.co/transformer…
  2. 数据集 www.kaggle.com/c/jigsaw-to…

数据集分为六个类,多标签问题

import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

开始处理数据…

# Load the training CSV; every column from index 2 onward is treated as a label.
df = pd.read_csv("train.csv")
df.head()
# Collapse the per-label columns into a single list-valued column per row.
df['list'] = df[df.columns[2:]].values.tolist()
# Keep only the text and the label vector; copy so later edits do not touch df.
new_df = df[['comment_text', 'list']].copy()
new_df.head()

界说超参数

# Sequence length every comment is padded/truncated to.
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
# Same checkpoint name as the encoder loaded in the model class.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

检验一下分词器

# Encode a single comment to inspect what the tokenizer returns.
test_one_sent = new_df.comment_text[0]
print(tokenizer.encode_plus(
    test_one_sent,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    # Explicit padding/truncation replaces the deprecated pad_to_max_length flag.
    padding='max_length',
    truncation=True,
    return_token_type_ids=True,
))

运用 torch 的 Dataset 类界说数据集

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        dataframe: Frame with a ``comment_text`` column and a ``list`` column
            holding the per-row label values.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded comment and its targets as a dict of tensors.

        Keys: ``ids``, ``mask``, ``token_type_ids`` (all ``torch.long``) and
        ``targets`` (``torch.float``).
        """
        # Collapse runs of whitespace before tokenizing.
        comment_text = " ".join(str(self.comment_text[index]).split())
        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit padding/truncation replaces the deprecated
            # pad_to_max_length flag and keeps every item the same length.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
        }

运用 torch 的 DataLoader 界说迭代数据集

train_size = 0.8
# Seeded random split: sampled rows train the model, the remaining rows test it.
train_dataset = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
# To keep this walkthrough fast, only a small slice of the data is used
# (2000 training rows and 100 test rows).
train_dataset = train_dataset[:2000]
test_dataset = test_dataset[:100]
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0,
}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

建立网络 BERT + DROPOUT + LINEAR;损失函数为 BCEWithLogitsLoss

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear head with 6 outputs.

    ``forward`` returns raw logits; no sigmoid is applied here.
    """

    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        """Return the logits computed from the encoder's second output.

        Args:
            ids: Token ids, passed positionally to the encoder.
            mask: Passed to the encoder as ``attention_mask``.
            token_type_ids: Passed to the encoder as ``token_type_ids``.
        """
        encoder_out = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Index rather than tuple-unpack: the encoder may return either a plain
        # tuple or a ModelOutput, and unpacking a ModelOutput yields its field
        # names instead of tensors. Integer indexing works for both.
        pooled = encoder_out[1]
        dropped = self.l2(pooled)
        return self.l3(dropped)
# Build the classifier and move its parameters to the selected device.
model = BERTClass()
model.to(device)

BCEWithLogitsLoss 的说明

这个类结合了 sigmoid 和 BCE loss,具体如下:

$$\ell(x, y) = L = \{l_1, \dots, l_N\}^{\top}$$

$$l_n = -w_n \left[ y_n \cdot \log \sigma(x_n) + (1 - y_n) \cdot \log\left(1 - \sigma(x_n)\right) \right]$$

N 是 batch size,其实就是将多标签转换为多个二分类问题

def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between logits and targets.

    Args:
        outputs: Raw logits; the sigmoid is applied inside the loss.
        targets: Float tensor of the same shape as ``outputs``.

    Returns:
        A scalar tensor (default ``mean`` reduction).
    """
    # Functional form: same result as BCEWithLogitsLoss()(outputs, targets)
    # without constructing a module on every call.
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

检验一下 loss,能够看出它们是持平的

# Sanity check: loss_fn on raw logits should match BCE computed by hand.
y = torch.tensor([1., 1., 0.])
x = torch.tensor([1., 0.9, 0.1])
xy_loss = loss_fn(x, y)
print(xy_loss)
sigmoid_x = torch.sigmoid(x)
print(sigmoid_x)
# Hand-computed BCE using the rounded sigmoid values printed above.
print(-(np.log(0.7311) + np.log(0.7109) + np.log(1 - 0.5250)) / 3)

界说优化器

# Adam over all model parameters (encoder and classification head).
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

界说练习进程

from tqdm import tqdm


def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Args:
        epoch: Epoch number; used only in the progress message.
    """
    model.train()
    # Wrap the loader itself (not the enumerate object) so tqdm can show a total.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients before back-propagating this batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


for epoch in range(EPOCHS):
    train(epoch)

界说验证进程

def validation(epoch):
    """Run ``model`` over ``testing_loader`` without gradient tracking.

    Args:
        epoch: Unused; kept so the call signature mirrors ``train``.

    Returns:
        ``(fin_outputs, fin_targets)``: nested lists with the per-sample
        sigmoid probabilities and the matching target vectors.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            # The model emits logits; convert to probabilities before collecting.
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # Binarize the probabilities. Author's note: this should be 0.5;
    # 0.3 is the value actually used for the reported numbers.
    outputs = (np.array(outputs) >= 0.3).astype(int)
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

毕竟:

  • Accuracy Score = 0.89
  • F1 Score (Micro) = 0.3870967741935484
  • F1 Score (Macro) = 0.18253968253968253

效果很差,由于只取了一丁点数据