BERT + Loading Your Own Dataset with the transformers Library for BERT Pre-training (Plain Approach + Trai…)
1. A Brief Introduction to Word Embedding
In NLP tasks we need to encode text so that a computer can process it, and we want the encoding to preserve similarity between words. Word embedding does exactly that: it maps each word to a low-dimensional dense space in which semantically similar words have vectors that lie close together.
Shortcomings of word2vec:
1. Once training is finished, the vector for a word is fixed.
2. A word is treated as having the same meaning in every context (even skip-gram only learns a blend of a word's meanings across contexts).
BERT was designed to fix exactly these two shortcomings; the sketch below illustrates the second point.
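A hedged illustration of point 2 using the transformers library (the checkpoint name bert-base-uncased, the example sentences, and the word "bank" are my own choices for this demo): BERT gives the same surface word different context-dependent vectors, whereas word2vec would assign it one fixed vector.

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

sentences = ["I deposited cash at the bank.", "We sat on the bank of the river."]
batch = tokenizer(sentences, padding=True, return_tensors="pt")
with torch.no_grad():
    hidden = model(**batch).last_hidden_state        # shape: (2, seq_len, 768)

# Find the position of "bank" in each sentence and compare its two vectors
bank_id = tokenizer.convert_tokens_to_ids("bank")
positions = [(batch["input_ids"][i] == bank_id).nonzero()[0, 0] for i in range(2)]
vec_a, vec_b = hidden[0, positions[0]], hidden[1, positions[1]]
print(torch.cosine_similarity(vec_a, vec_b, dim=0))  # noticeably below 1.0: the contexts differ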
2. The Concept of BERT
Put simply, BERT is the encoder part of the Transformer. It needs no labels; a plain text corpus is enough to train it.
The BERT model can essentially be viewed as a new word2vec: for an existing task, treat BERT's output the way you would word2vec vectors and build your own model on top of it.
BERT architecture
An encoder-only Transformer, released in two sizes (the quick check below confirms the base configuration):
base: blocks = 12, hidden size = 768, heads = 12
large: blocks = 24, hidden size = 1024, heads = 16
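A quick way to verify these numbers with the transformers library (a small sketch; it assumes the bert-base-uncased checkpoint is available locally or downloadable):

from transformers import BertConfig

config = BertConfig.from_pretrained("bert-base-uncased")
# For the base model: 12 blocks, hidden size 768, 12 attention heads
print(config.num_hidden_layers, config.hidden_size, config.num_attention_heads)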
Changes to the input
- Each training example is a sentence pair.
- An extra segment embedding is added.
- The positional encoding is learnable.
<cls> marks the start of the input and <sep> marks the end of each sentence. The segment embedding is 0 for tokens of the first sentence and 1 for the second; the positional embedding is learned rather than fixed.
3. BERT Code
def get_tokens_and_segments(tokens_a, tokens_b=None):
    """Get the tokens of the input sequence and their segment indices."""
    tokens = ['<cls>'] + tokens_a + ['<sep>']
    # 0 and 1 mark segment A and segment B, respectively
    segments = [0] * (len(tokens_a) + 2)
    if tokens_b is not None:
        tokens += tokens_b + ['<sep>']
        segments += [1] * (len(tokens_b) + 1)
    return tokens, segments
The function takes the tokens of one or two sentences and builds the input in the format <cls> + tokens_a + <sep> + tokens_b + <sep>. When building the segment indices, the first part uses len(tokens_a) + 2 because <cls> and one <sep> were added by hand, while the second part uses len(tokens_b) + 1 because only a single <sep> was added.
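For example, with two short made-up token lists:

tokens, segments = get_tokens_and_segments(['this', 'movie', 'is', 'great'], ['i', 'like', 'it'])
print(tokens)
# ['<cls>', 'this', 'movie', 'is', 'great', '<sep>', 'i', 'like', 'it', '<sep>']
print(segments)
# [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]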
import torch
from torch import nn
from d2l import torch as d2l

class BERTEncoder(nn.Module):
    """BERT encoder."""
    def __init__(self, vocab_size, num_hiddens, norm_shape, ffn_num_input,
                 ffn_num_hiddens, num_heads, num_layers, dropout,
                 max_len=1000, key_size=768, query_size=768, value_size=768,
                 **kwargs):
        super(BERTEncoder, self).__init__(**kwargs)
        self.token_embedding = nn.Embedding(vocab_size, num_hiddens)
        self.segment_embedding = nn.Embedding(2, num_hiddens)
        self.blks = nn.Sequential()
        for i in range(num_layers):
            self.blks.add_module(f"{i}", d2l.EncoderBlock(
                key_size, query_size, value_size, num_hiddens, norm_shape,
                ffn_num_input, ffn_num_hiddens, num_heads, dropout, True))
        # In BERT, positional embeddings are learnable, so we create a
        # positional embedding parameter that is long enough
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, num_hiddens))

    def forward(self, tokens, segments, valid_lens):
        # In the code below, the shape of X stays
        # (batch size, max sequence length, num_hiddens)
        X = self.token_embedding(tokens) + self.segment_embedding(segments)
        X = X + self.pos_embedding.data[:, :X.shape[1], :]
        for blk in self.blks:
            X = blk(X, valid_lens)
        return X
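A minimal smoke test of the encoder above (the hyperparameter values here are illustrative rather than the real BERT-base configuration, apart from the hidden size; the expected shape in the comment assumes this setup):

vocab_size, num_hiddens, ffn_num_hiddens, num_heads = 10000, 768, 1024, 4
norm_shape, ffn_num_input, num_layers, dropout = [768], 768, 2, 0.2
encoder = BERTEncoder(vocab_size, num_hiddens, norm_shape, ffn_num_input,
                      ffn_num_hiddens, num_heads, num_layers, dropout)
tokens = torch.randint(0, vocab_size, (2, 8))            # 2 sequences of 8 token ids
segments = torch.tensor([[0, 0, 0, 0, 1, 1, 1, 1]] * 2)  # first half segment A, second half B
encoded_X = encoder(tokens, segments, None)
print(encoded_X.shape)                                   # torch.Size([2, 8, 768])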
4. Using transformers
import os
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "./model/bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
outputs = model(**batch)
print(outputs.logits)
loss = outputs.loss
print(loss.item())
loss.backward()
optimizer.step()
from_pretrained loads both the tokenizer and the model. The tokenizer splits the text, adds the cls and sep tokens, converts tokens to ids, and handles padding and truncation, finally returning a dict:
input_ids: the ids of the sentence after the [CLS] and [SEP] tokens have been added; 101 stands for [CLS] and 102 for [SEP].
token_type_ids: specific to BERT, it indicates which sentence of the input each token belongs to: 0 for the first sentence, 1 for the second (BERT can predict whether two sentences follow each other).
attention_mask: defines the attention range; 1 marks tokens from the original sentence and 0 marks padding.
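To see what the tokenizer actually returns, you can inspect the batch built above (a small check; the exact ids depend on the checkpoint's vocabulary):

print(batch.keys())
# dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
print(batch["input_ids"].shape)    # (2, length of the longest sequence in the batch)
print(batch["attention_mask"][1])  # 1 for real tokens, 0 for the padded positions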
A small text classification task (loading your own dataset into BERT for pre-training).
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import transformers as tfs
import numpy as np
import random
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, accuracy_score
def flat_accuracy(preds, labels):
    """A function for calculating accuracy scores"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)
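For instance, with two predictions of which only the first matches its label (made-up numbers, just to show the call):

preds = np.array([[0.1, 0.9],
                  [0.8, 0.2]])       # argmax along axis 1 gives predicted classes [1, 0]
labels = np.array([1, 1])
print(flat_accuracy(preds, labels))  # 0.5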
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
device = torch.device('cuda')
train_df = pd.read_csv('./train.tsv', delimiter='\t', header=None)
train_set = train_df[:3000]
model_class, tokenizer_class, pretrained_weights = (
    tfs.BertModel, tfs.BertTokenizer, '../model/bert/bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)
# Separate the data and the labels
sequence = list(train_set.iloc[0:, 0])
labels = list(train_set.iloc[0:, 1])
def encode_fn(text_list):
    all_input_ids = []
    for text in text_list:
        input_ids = tokenizer.encode(
            text,
            add_special_tokens=True,   # add the special tokens, i.e. CLS and SEP
            max_length=160,            # maximum text length
            pad_to_max_length=True,    # pad to the maximum length
            return_tensors='pt'        # return PyTorch tensors
        )
        all_input_ids.append(input_ids)
    all_input_ids = torch.cat(all_input_ids, dim=0)
    return all_input_ids
all_input_ids = encode_fn(sequence)
labels = torch.tensor(labels)
# print(all_input_ids.shape)
# torch.Size([3000, 160])
# print(labels.shape)
# torch.Size([3000])
epochs = 4
batch_size = 32

# Split data into train and validation
dataset = TensorDataset(all_input_ids, labels)
train_size = int(0.90 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create train and validation dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Load the pretrained BERT model
model = BertForSequenceClassification.from_pretrained(pretrained_weights, num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.cuda()

# Create optimizer and learning rate schedule
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * epochs
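The snippet stops at total_steps. A hedged sketch of how the warmup schedule and training loop typically continue from here (the loop structure below is my own, not taken from the original post; it reuses the imported get_linear_schedule_with_warmup and the flat_accuracy helper defined above):

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)

for epoch in range(epochs):
    # Training
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, batch_labels = batch[0].to(device), batch[1].to(device)
        model.zero_grad()
        outputs = model(input_ids, attention_mask=(input_ids > 0).long(),
                        labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Validation
    model.eval()
    total_val_accuracy = 0
    for batch in val_dataloader:
        input_ids, batch_labels = batch[0].to(device), batch[1].to(device)
        with torch.no_grad():
            logits = model(input_ids, attention_mask=(input_ids > 0).long()).logits
        total_val_accuracy += flat_accuracy(logits.detach().cpu().numpy(),
                                            batch_labels.cpu().numpy())
    print(f"epoch {epoch}: train loss {total_loss / len(train_dataloader):.4f}, "
          f"val acc {total_val_accuracy / len(val_dataloader):.4f}")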