BERT Code Walkthrough — Datawhale open-source course "Introduction to NLP with Transformers": the BERT code implementation
Preface
This article reproduces a chapter from the Datawhale tutorial on Transformers. It contains a large amount of source code together with commentary; the individual modules are separated by paragraphs and horizontal rules, and the original site provides a sidebar so that readers can jump quickly between subsections. The hope is that, after reading, you will come away with a solid understanding of BERT. It is also recommended to step through the BERT source code with a debugger in PyCharm, VS Code, or a similar tool, and to compare each module you reach against the explanation in this chapter.
This chapter is based on the HuggingFace Transformers library, and the complete code for the chapter is available with the course materials. Note that because the library is updated quickly there may be differences between versions; version 4.4.2 is the reference here.
HuggingFace is a chatbot startup headquartered in New York. It picked up on the BERT wave very early and set out to implement a PyTorch version of the BERT model. The project was initially named pytorch-pretrained-bert; it reproduced the original results while providing easy-to-use methods, making it convenient to experiment and do research on top of this powerful model.
As the number of users grew, the project developed into a sizeable open-source community: it incorporated a wide range of pretrained language models, added TensorFlow implementations, and was renamed Transformers in the second half of 2019. At the time of writing (March 30, 2021) the project has more than 43k stars, and Transformers can fairly be called the de facto basic toolkit for NLP.
This article analyzes the PyTorch BERT code in Transformers version 4.4.2 (released March 19, 2021) from the perspectives of code structure, concrete implementation and underlying principles, and usage.
1 - Tokenization - BertTokenizer
1.1 Tokenization code
The tokenizer used by BERT is implemented mainly in models/bert/tokenization_bert.py.
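The file defines BertTokenizer together with two helper classes: a BasicTokenizer, which cleans the text, optionally lower-cases it, and splits on whitespace and punctuation, and a WordpieceTokenizer, which further splits each word into sub-word pieces from the vocabulary. BertTokenizer chains the two, as the source below shows.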
import collections
import os
import unicodedata
from typing import List, Optional, Tuple

from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from transformers.utils import logging

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "bert-base-uncased": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "bert-base-uncased": {"do_lower_case": True},
}
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens
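# Example behavior of whitespace_tokenize above (illustrative):
# whitespace_tokenize("  Hello   world \n") returns ["Hello", "world"] --
# surrounding whitespace is stripped and the text is split on runs of whitespace.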
class BertTokenizer(PreTrainedTokenizer):

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs
    ):
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
    @property
    def do_lower_case(self):
        return self.basic_tokenizer.do_lower_case

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                # If the token is part of the never_split set
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens
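    # Illustrative example of the two-stage split in _tokenize (the exact sub-words depend on
    # the bert-base-uncased vocabulary): "He said unaffable" is first split by the BasicTokenizer
    # into ["he", "said", "unaffable"], and "unaffable" is then split by the WordpieceTokenizer
    # into ["un", "##aff", "##able"].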
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string
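    # Illustrative example for convert_tokens_to_string: ["he", "said", "un", "##aff", "##able"]
    # is joined with spaces and the " ##" markers are removed, giving "he said unaffable".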
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating
        and adding special tokens. A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep
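    # Illustrative example: in the bert-base-uncased vocabulary [CLS] has id 101 and [SEP] has
    # id 102, so build_inputs_with_special_tokens([10, 20], [30]) returns
    # [101, 10, 20, 102, 30, 102], i.e. "[CLS] A [SEP] B [SEP]".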
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]
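    # Illustrative example: for a pair with len(token_ids_0) == 2 and len(token_ids_1) == 3
    # (no special tokens added yet), get_special_tokens_mask returns [1, 0, 0, 1, 0, 0, 0, 1] --
    # a 1 for each position that will hold [CLS] or [SEP] and a 0 for each ordinary token.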
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
        pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
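    # Illustrative example: for len(token_ids_0) == 2 and len(token_ids_1) == 3,
    # create_token_type_ids_from_sequences returns [0, 0, 0, 0, 1, 1, 1, 1]:
    # four 0s covering "[CLS] A A [SEP]" and four 1s covering "B B B [SEP]".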
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
class BasicTokenizer(object):

    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
        WordPieceTokenizer.

        Args:
            **never_split**: (`optional`) list of str
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia.).
        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens
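    # Illustrative example: BasicTokenizer(do_lower_case=True).tokenize("Hello, World!")
    # lower-cases the text and splits on whitespace and punctuation, giving
    # ["hello", ",", "world", "!"]; sub-word splitting is left to the WordpieceTokenizer.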
    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if never_split is not None and text in never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                # A punctuation character always forms a token of its own.
                output.append([char])
                start_new_word = True
            else:
                # Other characters are appended to the current (possibly new) token.
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]
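Putting the pieces together, here is a minimal usage sketch. It assumes the bert-base-uncased checkpoint can be downloaded (or is already cached locally); the exact tokens and integer ids shown in the comments depend on that vocabulary and are illustrative only.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# _tokenize: the BasicTokenizer splits on whitespace/punctuation, then the WordpieceTokenizer
# splits each word into sub-word pieces, e.g. ['he', 'said', 'un', '##aff', '##able', '.'].
tokens = tokenizer.tokenize("He said unaffable.")
print(tokens)

# Calling the tokenizer adds [CLS]/[SEP] and builds token_type_ids and attention_mask
# via build_inputs_with_special_tokens and create_token_type_ids_from_sequences.
encoded = tokenizer("He said unaffable.", "A second sentence.")
print(encoded["input_ids"])
print(encoded["token_type_ids"])
print(encoded["attention_mask"])

# convert_tokens_to_string removes the WordPiece "##" markers again.
print(tokenizer.convert_tokens_to_string(tokens))  # -> "he said unaffable ."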