Text Classification with LSTM
Overview
I have written an LSTM-based text classifier before, but the code structure was a mess. After reading through the Bert source code, I decided to imitate its structure and rewrite the LSTM version from scratch. This is purely an exercise for getting familiar with the TensorFlow APIs and with a reasonably clean NLP model structure; I am not chasing higher accuracy.
The training corpus is a set of 60,000+ product reviews covering 10 product categories, already labeled with positive/negative sentiment. The original file is a csv with 3 fields: cat (the category), label (the sentiment), and review (the review text). It works as a binary sentiment-classification corpus, and can also (barely) serve as a 10-class product-category corpus.
The data has been split into training and test sets at roughly 8:2 and converted to tsv files.
Jieba is used as the word segmenter.
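For reference, this is roughly what jieba segmentation of a review looks like (the sample sentence below is made up, not taken from the corpus):

import jieba

print(jieba.lcut('这个商品的质量很好'))
# something like: ['这个', '商品', '的', '质量', '很', '好']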
1. Building the vocabulary txt file
Since I never found a suitable Chinese vocabulary file, and the vocabulary file in Bert uses single characters as its smallest unit, for now I simply tokenize the training corpus and deduplicate the tokens, which yields a fairly small vocabulary file.
import pandas as pd
import jieba

# RAW_DATA, VOCAB (and the commented-out STOPWORDS) are path constants defined elsewhere

def create_vocab():
    raw_df = pd.read_csv(RAW_DATA)  # read the raw csv into a dataframe
    # one of the water-heater rows is broken, so drop that category entirely
    raw_df = raw_df[raw_df.cat != '热水器']
    raw_document = raw_df['review'].tolist()  # the raw corpus as a list of sentences
    # load the stopword list
    # with open(STOPWORDS, 'r', encoding='utf-8') as s:
    #     stopwords = [word.strip() for word in s.readlines()]
    document_words = []  # the tokenized corpus
    for sentence in raw_document:
        cut_sentence = [word for word in jieba.lcut(sentence)]
        document_words.extend(cut_sentence)
    vocab_list = set(document_words)
    with open(VOCAB, 'w', encoding='utf-8') as f:
        f.write('[PAD]' + '\n')
        f.write('[UNK]' + '\n')
        for vocab in vocab_list:
            f.write(vocab + '\n')
2. Tokenization
This part is taken straight from the Bert source code, heavily trimmed down. It only covers the basics: converting to unicode, loading the vocabulary, tokenizing, and converting tokens to ids and ids back to tokens. Not much to say about it.
import collections
import tensorflow as tf
import jieba


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))


# build a collections.OrderedDict of (token, index) pairs from the vocab file
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with tf.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def convert_by_vocab(vocab, items):
    """Converts a sequence of [tokens|ids] using the vocab."""
    output = []
    for item in items:
        # tokens that are not in the vocab fall back to [UNK]
        output.append(vocab[item] if item in vocab else vocab['[UNK]'])
    return output
class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file):
        # build a (token, index) dict from the vocab file
        self.vocab = load_vocab(vocab_file)
        # and the inverse mapping, index: token
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    # split a sentence into a list of tokens
    @staticmethod
    def tokenize(text):
        split_tokens = jieba.lcut(text)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        return convert_by_vocab(self.inv_vocab, ids)
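A quick usage sketch (the vocab path and sample sentence are placeholders of my own, not from the original pipeline):

tokenizer = FullTokenizer(vocab_file='vocab.txt')  # hypothetical path to the file built in step 1
tokens = tokenizer.tokenize('这个商品很好用')
ids = tokenizer.convert_tokens_to_ids(tokens)      # out-of-vocabulary words map to the [UNK] id
print(tokens, ids, tokenizer.convert_ids_to_tokens(ids))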
3. Building the model structure
The model's mode argument takes one of three values: train, eval or predict. In eval mode only cost and accuracy are returned; in predict mode only the logits are returned. The rest needs no further explanation.

import tensorflow as tf
import json
import six
class LstmConfig(object):
    def __init__(self,
                 vocab_size,               # number of tokens in the vocabulary
                 hidden_size=128,
                 keep_prob=0.9,
                 embedding_keep_prob=0.9,  # fraction of embedding units NOT dropped out
                 max_grad_norm=5,
                 num_of_classes=2,         # number of classes
                 num_of_layers=2,          # number of LSTM layers
                 learning_rate=0.01,       # SGD learning rate (default value assumed here)
                 initializer_range=0.02):  # initializer range
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob
        self.embedding_keep_prob = embedding_keep_prob
        self.max_grad_norm = max_grad_norm
        self.num_of_classes = num_of_classes
        self.num_of_layers = num_of_layers
        self.learning_rate = learning_rate
        self.initializer_range = initializer_range

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `LstmConfig` from a Python dictionary of parameters."""
        config = LstmConfig(vocab_size=None)
        for (key, value) in six.iteritems(json_object):
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `LstmConfig` from a json file of parameters."""
        with tf.gfile.GFile(json_file, "r") as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))
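Configs can then be built directly or loaded Bert-style; for instance (the values here are only illustrative):

config = LstmConfig(vocab_size=30000, num_of_classes=2)
same_config = LstmConfig.from_dict({'vocab_size': 30000, 'num_of_classes': 2})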
# bidirectional LSTM model
class LstmModel(object):
    # build the network structure
    def __init__(self, config, mode):
        self.config = config
        self.mode = mode
        output_keep_prob = config.keep_prob if mode == 'train' else 1.0
        # word embeddings
        self.word_embedding = tf.get_variable(
            'word_emb', shape=[config.vocab_size, config.hidden_size])
        # LSTM network
        # forward cells
        lstm_cells_fw = [tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.BasicLSTMCell(config.hidden_size),
            output_keep_prob=output_keep_prob)
            for _ in range(config.num_of_layers)]
        self.lstm_fw = tf.nn.rnn_cell.MultiRNNCell(lstm_cells_fw)
        # backward cells
        lstm_cells_bw = [tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.BasicLSTMCell(config.hidden_size),
            output_keep_prob=output_keep_prob)
            for _ in range(config.num_of_layers)]
        self.lstm_bw = tf.nn.rnn_cell.MultiRNNCell(lstm_cells_bw)
        # softmax-layer variables
        self.weight = tf.get_variable(
            'weight', [config.hidden_size * 2, config.num_of_classes])
        self.bias = tf.get_variable('bias', [config.num_of_classes])
    # the forward computation graph
    def forward(self, src_input, src_size, label):
        # map the input token ids to word embeddings
        inputs = tf.nn.embedding_lookup(self.word_embedding, src_input)
        if self.mode == 'train':
            inputs = tf.nn.dropout(inputs, self.config.embedding_keep_prob)
        # bidirectional LSTM
        with tf.variable_scope('lstm'):
            outputs, states = tf.nn.bidirectional_dynamic_rnn(self.lstm_fw,
                                                              self.lstm_bw,
                                                              inputs,
                                                              dtype=tf.float32,
                                                              sequence_length=src_size)
            final_outputs = tf.concat(outputs, 2)
            final_outputs = final_outputs[:, -1, :]
            # alternatively, average over the time dimension
            # final_outputs = tf.reduce_mean(tf.concat(outputs, 2), 1)
        # fully connected layer
        with tf.variable_scope('fc'):
            logits = tf.matmul(final_outputs, self.weight) + self.bias
        if self.mode == 'predict':
            return logits
        # loss function
        with tf.variable_scope('loss'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label,
                                                                  logits=logits)
            cost = tf.reduce_mean(loss)
        # accuracy
        with tf.variable_scope('accuracy'):
            correct_prediction = tf.equal(tf.argmax(logits, 1), label)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        if self.mode == 'eval':
            return cost, accuracy
        # the backward pass
        trainable_variables = tf.trainable_variables()
        # clip the gradients, then define the optimizer and the train op
        grads = tf.gradients(cost, trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, self.config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.config.learning_rate)
        train_op = optimizer.apply_gradients(zip(grads, trainable_variables))
        return logits, cost, accuracy, train_op
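Putting the pieces together, wiring the model up for training looks roughly like this (a minimal sketch: the placeholder names and shapes are my own, and it assumes config is an LstmConfig like the one built earlier and batches of padded id sequences):

input_ids = tf.placeholder(tf.int32, shape=[None, None], name='input_ids')  # padded token ids
input_size = tf.placeholder(tf.int32, shape=[None], name='input_size')      # true sequence lengths
labels = tf.placeholder(tf.int64, shape=[None], name='labels')

model = LstmModel(config, mode='train')
logits, cost, accuracy, train_op = model.forward(input_ids, input_size, labels)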
4. The main classification script
I named it run_classifier, copied wholesale from Bert.
First come the tf.flags definitions, used to pass in parameters.
flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
flags.DEFINE_integer("num_train_epochs", 4, "Total epochs for training.")
flags.DEFINE_string(
    "data_dir", "E:/NLP/NLP_Deep_Learning_Summary/datasets",
    "The input data dir. Should contain the .tsv files (or other data files) for the task.")
flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint")
flags.DEFINE_string("vocab_file", "./", "The vocabulary file.")
flags.DEFINE_string("output_file", "./model1", "The output file for trained model.")
flags.DEFINE_bool("do_train", True, "Whether to run training.")
flags.DEFINE_bool("do_eval", True, "Whether to run eval on the dev set.")
flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.")
Defining the Example, Feature and DataProcessor classes
To turn the training/test data into tfrecord files, the data first has to pass through the Example and Feature classes. DataProcessor is almost identical to the one in the Bert source; its job is to read the tsv files and produce example-class data.
The tsv fields are: index (id), category (product category), polarity (sentiment, 0 or 1) and text (the raw review text).
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text, label=None):
        """Constructs a InputExample."""
        self.guid = guid
        self.text = text
        self.label = label
Since the model is an LSTM, the features of each example are: the token ids of the original text, its original length (i.e. the number of tokens) and its class label.
class InputFeatures(object):
    def __init__(self, input_ids, input_size, label):
        self.input_ids = input_ids
        self.input_size = input_size
        self.label = label
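The post stops here, but the remaining step is turning each InputExample into an InputFeatures with the tokenizer. A minimal sketch of that conversion, assuming a fixed max_seq_length and padding with the [PAD] id (0); the helper name and its details are my own, not from the original code:

def convert_single_example(example, tokenizer, max_seq_length):
    # hypothetical helper: tokenize, map to ids, record the true length,
    # then truncate/pad to max_seq_length with the [PAD] id (0)
    tokens = tokenizer.tokenize(example.text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)[:max_seq_length]
    input_size = len(input_ids)
    input_ids += [0] * (max_seq_length - input_size)
    return InputFeatures(input_ids=input_ids,
                         input_size=input_size,
                         label=int(example.label))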