GCN Graph Convolutional Network - Chinese Text Classification with PyTorch
This documents my process of learning GCN from scratch. I didn't understand a lot of the Python usage at first, so I've added comments to part of the code, hoping it helps beginners like me.
Questions beginners commonly run into
1. What exactly is being trained? In AXW, A is the graph structure, X is the feature matrix holding each node's features (analogous to a word embedding), and W is the layer's weight matrix, i.e. the values updated by gradient descent. In PyTorch this is defined as a weight Parameter; the code below initializes it with the normal_ method, although many lectures treat this particular initialization as optional. A minimal sketch of one propagation step is given right after this list.
2. In Kipf's paper, X is taken to be the identity matrix. In this text classification task, TF-IDF and PMI values are used as the edge weights of the adjacency matrix (document-word and word-word edges respectively), so edge information is folded into the learning. How well this fusion works is mathematically rather abstract to me, but in my own runs 300-500 epochs already give a decent result, with accuracy above 80%, which is pretty impressive; I don't understand the finer details or the harder theory myself.
3. What is the difference between Chinese and English tokenization? In Chinese the unit of meaning is the multi-character word rather than a space-separated token as in English, so the text must be segmented first. The code below uses jieba's cut function wrapped in a small helper; jieba really is great.
4. Where does GCN come from and how does it work? See the lecture video made by TA 姜成瀚 for Prof. 李宏毅's course at NTU; it is on Bilibili and explains things quite clearly.
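To make point 1 concrete, here is a minimal toy sketch of a single propagation step relu(A·X·W) in PyTorch. It is my own illustration, not part of the code below, and the sizes are made up: A_hat is a fixed normalized adjacency matrix, X holds the node features, and W is the only tensor that gradient descent updates.
import torch
import torch.nn as nn
import torch.nn.functional as F

A_hat = torch.eye(4)                                        # toy normalized adjacency for 4 nodes, fixed, never trained
X = torch.randn(4, 8)                                       # toy node features, 8 dimensions per node
W = nn.Parameter(torch.FloatTensor(8, 16).normal_(0, 0.1))  # the trainable weight, initialized with normal_

H = F.relu(A_hat @ X @ W)                                   # one GCN layer: relu(A X W), output shape (4, 16)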
# The key parts of the code below are annotated; if you find it helpful, please consider giving me a free like to make my day
# -*- coding: utf-8 -*-
"""
Created on Thu May  9 10:28:24 2019
@author: WT
"""
import os
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
import networkx as nx
from collections import OrderedDict
from itertools import combinations
import math
from tqdm import tqdm
import logging
import jieba
logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)  # configure logging output
logger = logging.getLogger(__file__)
stop = [line.strip() for line in open('', encoding='utf-8').readlines()]  # load the stopword list (the file path is left blank here)
def cut_words(text):
    """Segment Chinese text with jieba and drop stopwords."""
    text = str(text)
    words = list(jieba.cut(text))
    words = [w for w in words if w not in stop]  # remove stopwords
    return words
def load_pickle(filename):
    completeName = os.path.join("./data/", filename)
    with open(completeName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data
def save_as_pickle(filename, data):
    completeName = os.path.join("./data/", filename)
    with open(completeName, 'wb') as output:
        pickle.dump(data, output)
def nCr(n, r):
    f = math.factorial  # factorial
    return int(f(n) / (f(r) * f(n - r)))
# remove meaningless tokens and punctuation
def filter_tokens(tokens, stopwords):
    tokens1 = []
    for token in tokens:
        if (token not in stopwords) and (token not in [".", ",", ";", "&", "'s", ":", "?", "!", "(", ")", \
                                                       "'", "'m", "'no", "***", "--", "...", "[", "]"]):
            tokens1.append(token)
    return tokens1
def dummy_fun(doc):
    return doc
def word_word_edges(p_ij):  # build the word-word edges
    word_word = []  # list() converts an iterable into a list
    cols = list(p_ij.columns)
    cols = [str(w) for w in cols]  # str() converts an object into a string
    '''
    # old, inefficient but maybe more instructive code
    dum = []; counter = 0
    for w1 in tqdm(cols, total=len(cols)):
        for w2 in cols:
            #if (counter % 300000) == 0:
            #    print("Current Count: %d; %s %s" % (counter, w1, w2))
            if (w1 != w2) and ((w1,w2) not in dum) and (p_ij.loc[w1,w2] > 0):
                word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]})); dum.append((w2,w1))
            counter += 1
    '''
    for w1, w2 in tqdm(combinations(cols, 2), total=nCr(len(cols), 2)):
        if p_ij.loc[w1, w2] > 0:
            word_word.append((w1, w2, {"weight": p_ij.loc[w1, w2]}))
    return word_word
def generate_text_graph(window=10):
    """Generates a graph from the text corpus; window = sliding window size used to compute
    point-wise mutual information between words (word-word co-occurrence is counted within this window)."""
    logger.info("Loading data...")
    datafolder = "./data/"
    df = pd.read_csv(os.path.join(datafolder, "data3.csv"))  # basic pandas CSV read
    # in pandas, index labels the rows and columns label the columns; drop(..., axis=1) would delete a column
    # and inplace=True would modify df in place, but here we simply keep the columns we need
    # (the 'v' verse column is dropped because we don't care which verse a line belongs to)
    df = df[["t", "c", "b"]]
    # one chapter per document, labelled by book
    # column meanings: id = verse ID, b = Book, c = Chapter, v = Verse, t = Text
    # df_data = pd.DataFrame(columns=["c", "b"])   # create a DataFrame with columns c and b, i.e. chapter and book (only the text column is left out)
    # for book in df["b"].unique():                # unique() returns all distinct values of a column as a numpy.ndarray
    #    dum = pd.DataFrame(columns=["c", "b"])
    #    dum["c"] = df[df["b"] == book].groupby("c").apply(lambda x: (" ".join(x["t"])).lower())
    #    dum["b"] = book
    #    df_data = pd.concat([df_data, dum], ignore_index=True)   # concat is a basic way to merge DataFrames
    # del df
    df_data = pd.DataFrame(columns=["c", "b"])
    for book in df["b"].unique():
        dum = pd.DataFrame(columns=["c", "b"])
        dum["c"] = df[df["b"] == book].groupby("c").apply(lambda x: (" ".join(x["t"])))
        dum["c"] = dum["c"].apply(cut_words)  # segment each chapter into a word list with jieba
        dum["b"] = book
        df_data = pd.concat([df_data, dum], ignore_index=True)  # concat is a basic way to merge DataFrames
    del df
    save_as_pickle("df_data.pkl", df_data)
    # a tokenizer would normally split sentences into words; here the chapters are already word lists
    # TF-IDF weights each word by how important it is to a document
    logger.info("Calculating Tf-idf...")  # TfidfVectorizer from sklearn converts the raw documents into a TF-IDF feature matrix
    vectorizer = TfidfVectorizer(input="content", max_features=None, tokenizer=dummy_fun, preprocessor=dummy_fun)
    vectorizer.fit(df_data["c"])  # learn the vocabulary and IDF from the training set
    df_tfidf = vectorizer.transform(df_data["c"])  # transform the documents into a document-term matrix
    df_tfidf = df_tfidf.toarray()
    vocab = vectorizer.get_feature_names()  # mapping from feature integer indices to feature names; returns a list of feature names (use get_feature_names_out() on newer scikit-learn)
    vocab = np.array(vocab)
    df_tfidf = pd.DataFrame(df_tfidf, columns=vocab)  # DataFrame of the TF-IDF matrix with the vocabulary as column names
    # PMI between words: PMI = point-wise mutual information
    names = vocab
    # names holds the column names (the vocabulary) from the previous step
    n_i = OrderedDict((name, 0) for name in names)
    word2index = OrderedDict((name, index) for index, name in enumerate(names))
    # OrderedDict is Python's ordered dictionary; n_i and word2index are both ordered dicts
    occurrences = np.zeros((len(names), len(names)), dtype=np.int32)
    # Find the co-occurrences:
    no_windows = 0
    logger.info("Calculating co-occurrences...")
    for l in tqdm(df_data["c"], total=len(df_data["c"])):  # brute-force co-occurrence counting
        for i in range(len(l) - window):
            no_windows += 1
            d = set(l[i:(i + window)])  # the set of distinct words inside the current sliding window
            for w in d:
                n_i[w] += 1  # increment the single-word occurrence count
            for w1, w2 in combinations(d, 2):  # combinations(iterable, r) yields every length-r subsequence in input order, i.e. all word pairs in the window
                i1 = word2index[w1]
                i2 = word2index[w2]
                occurrences[i1][i2] += 1
                occurrences[i2][i1] += 1
    logger.info("Calculating PMI*...")
    # convert counts to PMI: PMI(i, j) = log( p(i, j) / (p(i) * p(j)) )
    p_ij = pd.DataFrame(occurrences, index=names, columns=names) / no_windows
    p_i = pd.Series(n_i, index=n_i.keys()) / no_windows
    del occurrences
    del n_i
    for col in p_ij.columns:
        p_ij[col] = p_ij[col] / p_i[col]
    for row in p_ij.index:
        p_ij.loc[row, :] = p_ij.loc[row, :] / p_i[row]
    p_ij = p_ij + 1E-9  # small constant to avoid log(0)
    for col in p_ij.columns:
        p_ij[col] = p_ij[col].apply(lambda x: math.log(x))
    # Build the graph with the networkx module
    logger.info("Building graph (No. of document, word nodes: %d, %d)..." % (len(df_tfidf.index), len(vocab)))
    G = nx.Graph()
    logger.info("Adding document nodes to graph...")
    G.add_nodes_from(df_tfidf.index)  # document nodes; add_nodes_from() adds a collection of nodes to G
    logger.info("Adding word nodes to graph...")
    G.add_nodes_from(vocab)  # word nodes
    # build edges between document-word pairs
    logger.info("Building document-word edges...")  # this step is fairly slow
    document_word = [(doc, w, {"weight": df_tfidf.loc[doc, w]}) for doc in
                     tqdm(df_tfidf.index, total=len(df_tfidf.index))
                     for w in df_tfidf.columns]  # document-word edges weighted by TF-IDF
    logger.info("Building word-word edges...")
    word_word = word_word_edges(p_ij)  # word-word edges weighted by PMI
    save_as_pickle("word_word_edges.pkl", word_word)
    logger.info("Adding document-word and word-word edges...")
    G.add_edges_from(document_word)  # add the edges built above
    G.add_edges_from(word_word)
    save_as_pickle("text_graph.pkl", G)  # save with pickle, ready to be fed to the text GCN model
    logger.info("Done and saved!")

if __name__ == "__main__":
    generate_text_graph()
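The model in the next section expects a normalized adjacency matrix A_hat = D^(-1/2) (A + I) D^(-1/2), as in Kipf & Welling. The training script that builds it is not included in this post, so the following is only a rough sketch of how the saved text_graph.pkl could be turned into A_hat (the function name build_A_hat and the use of networkx's to_numpy_array are my own choices here, not the original code); in the TextGCN setup X can then simply be an identity matrix over all document and word nodes.
# Rough sketch (mine, not the original training script): build A_hat from the saved graph
import numpy as np
import networkx as nx

def build_A_hat(G):
    A = nx.to_numpy_array(G, weight="weight")       # weighted adjacency matrix of the text graph
    A = A + np.eye(A.shape[0])                      # add self-loops: A + I
    degrees = A.sum(axis=1)                         # node degrees of A + I
    D_inv_sqrt = np.diag(1.0 / np.sqrt(degrees))    # D^(-1/2)
    return D_inv_sqrt @ A @ D_inv_sqrt              # A_hat = D^(-1/2) (A + I) D^(-1/2)

# X = np.eye(A_hat.shape[0])   # identity features, one row per (document or word) node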
Below is the network model code
# -*- coding: utf-8 -*-
"""
Created on Wed Jul  3 10:58:01 2019
@author: WT
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
class gcn(nn.Module):
    def __init__(self, X_size, A_hat, args, bias=True):  # X_size = num features
        super(gcn, self).__init__()  # inherit nn.Module's __init__()
        self.A_hat = torch.tensor(A_hat, requires_grad=False).float()  # the fixed, normalized graph
        self.weight = nn.parameter.Parameter(torch.FloatTensor(X_size, args.hidden_size_1))  # first-layer weight matrix
        var = 2. / (self.weight.size(1) + self.weight.size(0))
        self.weight.data.normal_(0, var)
        self.weight2 = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_1, args.hidden_size_2))
        var2 = 2. / (self.weight2.size(1) + self.weight2.size(0))
        # initialize the weights with the normal_ method:
        # normal_(mean=0, std=1, *, generator=None) fills the tensor with samples drawn
        # from a normal distribution with the given mean and standard deviation
        self.weight2.data.normal_(0, var2)
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_1))
            self.bias.data.normal_(0, var)
            self.bias2 = nn.parameter.Parameter(torch.FloatTensor(args.hidden_size_2))
            self.bias2.data.normal_(0, var2)
        else:
            self.bias = None
            self.bias2 = None
        self.fc1 = nn.Linear(args.hidden_size_2, args.num_classes)  # final fully connected classification layer
    def forward(self, X):  # 2-layer GCN architecture
        X = torch.mm(X, self.weight)  # torch.mm(a, b) is the matrix product of a and b
        if self.bias is not None:
            X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))  # relu(A_hat X W) is the first GCN layer's output H1, fed to the next layer as the new node representation
        X = torch.mm(X, self.weight2)
        if self.bias2 is not None:
            X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))
        return self.fc1(X)
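Finally, a rough usage sketch of how the pieces might be wired together. Again this is my own sketch, not the original training script: the hyperparameter values are made up, build_A_hat comes from the sketch above, and train_idx / train_labels are placeholders standing for the indices and class labels of the labelled document nodes.
# Rough usage sketch (mine, not the original training script)
import pickle
import torch
import torch.nn.functional as F
from argparse import Namespace

args = Namespace(hidden_size_1=330, hidden_size_2=130, num_classes=10)  # hypothetical sizes

with open("./data/text_graph.pkl", "rb") as f:
    G = pickle.load(f)
A_hat = build_A_hat(G)                              # normalized adjacency, see the earlier sketch
X = torch.eye(A_hat.shape[0]).float()               # identity features, as in TextGCN

model = gcn(X_size=X.shape[1], A_hat=A_hat, args=args)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(300):                            # 300-500 epochs already gave me decent accuracy
    optimizer.zero_grad()
    output = model(X)                               # logits for every node (documents and words)
    loss = F.cross_entropy(output[train_idx], train_labels)  # placeholders: labelled document nodes only
    loss.backward()
    optimizer.step()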
