import codecs
import copy
import random
import time

import numpy as np

# global id lookup tables filled by dataloader
entities2id = {}
relations2id = {}


def dataloader(file1, file2, file3):
    print("")
    entity = []
    relation = []
    with open(file2, 'r') as f1, open(file3, 'r') as f2:
        lines1 = f1.readlines()
        lines2 = f2.readlines()
        for line in lines1:
            line = line.strip().split('\t')
            if len(line) != 2:
                continue
            entities2id[line[0]] = line[1]
            entity.append(line[1])

        for line in lines2:
            line = line.strip().split('\t')
            if len(line) != 2:
                continue
            relations2id[line[0]] = line[1]
            relation.append(line[1])
    triple_list = []
    with codecs.open(file1, 'r') as f:
        content = f.readlines()
        for line in content:
            triple = line.strip().split("\t")
            if len(triple) != 3:
                continue
            h_ = entities2id[triple[0]]
            r_ = relations2id[triple[1]]
            t_ = entities2id[triple[2]]
            triple_list.append([h_, r_, t_])

    print("Complete load. entity : %d , relation : %d , triple : %d" % (
        len(entity), len(relation), len(triple_list)))

    return entity, relation, triple_list
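
# Expected input formats (inferred from the parsing above): file2 and file3 are
# tab-separated "<name>\t<id>" mappings for entities and relations, and file1
# holds the training triples as "<head>\t<relation>\t<tail>" lines.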
def norm_l1(h, r, t):
    return np.sum(np.fabs(h + r - t))


def norm_l2(h, r, t):
    return np.sum(np.square(h + r - t))
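
# TransE models a plausible triple (h, r, t) as a translation h + r ≈ t, so a
# triple is scored by the distance between h + r and t: norm_l1 is the L1
# distance and norm_l2 the squared L2 distance; smaller means more plausible.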
class TransE:
    def __init__(self, entity, relation, triple_list, embedding_dim=50, lr=0.01, margin=1.0, norm=1):
        self.entities = entity
        self.relations = relation
        self.triples = triple_list
        self.dimension = embedding_dim
        self.learning_rate = lr
        self.margin = margin
        self.norm = norm
        self.loss = 0.0
    def data_initialise(self):
        entityVectorList = {}
        relationVectorList = {}
        for entity in self.entities:
            entity_vector = np.random.uniform(-6.0 / np.sqrt(self.dimension), 6.0 / np.sqrt(self.dimension),
                                              self.dimension)
            entityVectorList[entity] = entity_vector

        for relation in self.relations:
            relation_vector = np.random.uniform(-6.0 / np.sqrt(self.dimension), 6.0 / np.sqrt(self.dimension),
                                                self.dimension)
            relation_vector = self.normalization(relation_vector)
            relationVectorList[relation] = relation_vector

        # replace the id lists with the {id: vector} dicts used during training
        self.entities = entityVectorList
        self.relations = relationVectorList
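
    # The uniform(-6/sqrt(k), 6/sqrt(k)) range is the initialisation prescribed in
    # the TransE paper (Bordes et al., 2013) for k-dimensional embeddings; relation
    # vectors are normalised once here, while entity vectors are re-normalised at
    # the start of every epoch in training_run.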
    def normalization(self, vector):
        return vector / np.linalg.norm(vector)
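
    # Dividing by np.linalg.norm projects a vector onto the unit L2 sphere, which
    # is how the paper enforces the ||e|| = 1 constraint on entity embeddings.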
    def training_run(self, epochs=1, nbatches=100, out_file_title=''):
        batch_size = int(len(self.triples) / nbatches)
        print("batch size: ", batch_size)
        for epoch in range(epochs):
            start = time.time()
            self.loss = 0.0

            # normalise the embeddings of the entities to 1
            for entity in self.entities.keys():
                self.entities[entity] = self.normalization(self.entities[entity])

            for batch in range(nbatches):
                batch_samples = random.sample(self.triples, batch_size)

                Tbatch = []
                for sample in batch_samples:
                    corrupted_sample = copy.deepcopy(sample)
                    pr = np.random.random(1)[0]
                    if pr > 0.5:
                        # change the head entity
                        corrupted_sample[0] = random.sample(list(self.entities.keys()), 1)[0]
                        while corrupted_sample[0] == sample[0]:
                            corrupted_sample[0] = random.sample(list(self.entities.keys()), 1)[0]
                    else:
                        # change the tail entity
                        corrupted_sample[2] = random.sample(list(self.entities.keys()), 1)[0]
                        while corrupted_sample[2] == sample[2]:
                            corrupted_sample[2] = random.sample(list(self.entities.keys()), 1)[0]

                    if (sample, corrupted_sample) not in Tbatch:
                        Tbatch.append((sample, corrupted_sample))

                self.update_triple_embedding(Tbatch)
            end = time.time()
            print("epoch: ", epoch, "cost time: %s" % (round((end - start), 3)))
            print("running loss: ", self.loss)
        with codecs.open(out_file_title + "TransE_entity_" + str(self.dimension) + "dim_batch" + str(batch_size), "w") as f1:
            for e in self.entities.keys():
                # f1.write("\t")
                # f1.write(e + "\t")
                f1.write(str(self.entities[e].tolist()))
                f1.write("\n")

        with codecs.open(out_file_title + "TransE_relation_" + str(self.dimension) + "dim_batch" + str(batch_size), "w") as f2:
            for r in self.relations.keys():
                # f2.write("\t")
                # f2.write(r + "\t")
                f2.write(str(self.relations[r].tolist()))
                f2.write("\n")
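
    # Note on the output files: with the id-writing lines left commented out (as in
    # the original), each line holds only one vector as a Python list literal, in
    # the dicts' insertion order; uncomment the "e + \t" / "r + \t" writes to keep
    # the ids alongside the vectors.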
    def update_triple_embedding(self, Tbatch):
        # deepcopy gives every nesting level its own address, even for lists nested
        # in lists, so the elements of copy_entity and self.entities are all
        # distinct objects
        copy_entity = copy.deepcopy(self.entities)
        copy_relation = copy.deepcopy(self.relations)

        for correct_sample, corrupted_sample in Tbatch:
            correct_copy_head = copy_entity[correct_sample[0]]
            correct_copy_tail = copy_entity[correct_sample[2]]
            relation_copy = copy_relation[correct_sample[1]]

            corrupted_copy_head = copy_entity[corrupted_sample[0]]
            corrupted_copy_tail = copy_entity[corrupted_sample[2]]

            correct_head = self.entities[correct_sample[0]]
            correct_tail = self.entities[correct_sample[2]]
            relation = self.relations[correct_sample[1]]

            corrupted_head = self.entities[corrupted_sample[0]]
            corrupted_tail = self.entities[corrupted_sample[2]]

            # calculate the distances of the correct and corrupted triples
            if self.norm == 1:
                correct_distance = norm_l1(correct_head, relation, correct_tail)
                corrupted_distance = norm_l1(corrupted_head, relation, corrupted_tail)
            else:
                correct_distance = norm_l2(correct_head, relation, correct_tail)
                corrupted_distance = norm_l2(corrupted_head, relation, corrupted_tail)

            loss = self.margin + correct_distance - corrupted_distance
            if loss > 0:
                self.loss += loss
                print(loss)

                correct_gradient = 2 * (correct_head + relation - correct_tail)
                corrupted_gradient = 2 * (corrupted_head + relation - corrupted_tail)

                # for the L1 norm the gradient reduces to the sign of each component
                if self.norm == 1:
                    for i in range(len(correct_gradient)):
                        if correct_gradient[i] > 0:
                            correct_gradient[i] = 1
                        else:
                            correct_gradient[i] = -1

                        if corrupted_gradient[i] > 0:
                            corrupted_gradient[i] = 1
                        else:
                            corrupted_gradient[i] = -1

                correct_copy_head -= self.learning_rate * correct_gradient
                relation_copy -= self.learning_rate * correct_gradient
                correct_copy_tail -= -1 * self.learning_rate * correct_gradient

                relation_copy -= -1 * self.learning_rate * corrupted_gradient
                if correct_sample[0] == corrupted_sample[0]:
                    # if the corrupted triple replaces the tail entity, the head entity's embedding is updated twice
                    correct_copy_head -= -1 * self.learning_rate * corrupted_gradient
                    corrupted_copy_tail -= self.learning_rate * corrupted_gradient
                elif correct_sample[2] == corrupted_sample[2]:
                    # if the corrupted triple replaces the head entity, the tail entity's embedding is updated twice
                    corrupted_copy_head -= -1 * self.learning_rate * corrupted_gradient
                    correct_copy_tail -= self.learning_rate * corrupted_gradient

                # normalise only the embedding vectors that were just updated,
                # instead of normalising all the embeddings together
                copy_entity[correct_sample[0]] = self.normalization(correct_copy_head)
                copy_entity[correct_sample[2]] = self.normalization(correct_copy_tail)
                if correct_sample[0] == corrupted_sample[0]:
                    # if the corrupted triple replaces the tail entity, update that tail entity's embedding
                    copy_entity[corrupted_sample[2]] = self.normalization(corrupted_copy_tail)
                elif correct_sample[2] == corrupted_sample[2]:
                    # if the corrupted triple replaces the head entity, update that head entity's embedding
                    copy_entity[corrupted_sample[0]] = self.normalization(corrupted_copy_head)

                # the paper mentions that relation embeddings do not need to be normalised
                copy_relation[correct_sample[1]] = relation_copy
                # copy_relation[correct_sample[1]] = self.normalization(relation_copy)

        self.entities = copy_entity
        self.relations = copy_relation
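
# Each Tbatch pair contributes the margin ranking loss
#     L = max(0, margin + d(h + r, t) - d(h' + r, t'))
# and for the squared L2 distance d = ||h + r - t||^2 the gradient with respect
# to h (and r) is 2 * (h + r - t), which is exactly correct_gradient above; the
# head, relation and tail of the correct triple move along -/-/+ that direction,
# and the signs flip for the corrupted triple because its distance enters the
# loss with a minus sign.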
if __name__ == '__main__':
    file1 = "/"
    file2 = "/"
    file3 = "/"

    entity_set, relation_set, triple_list = dataloader(file1, file2, file3)

    # modify the hyperparameters by yourself
    transE = TransE(entity_set, relation_set, triple_list, embedding_dim=30, lr=0.01, margin=1.0, norm=2)

    transE.data_initialise()
    transE.training_run()
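
    # --- Usage sketch (an addition, not part of the original script) ---
    # After training, transE.entities and transE.relations map ids to vectors, so
    # a link-prediction query (h, r, ?) can rank every entity as a candidate tail
    # with the same distance used in training. predict_tail is a hypothetical helper.
    def predict_tail(model, h, r, k=5):
        # rank every entity as a candidate tail by the training distance
        dist = norm_l1 if model.norm == 1 else norm_l2
        scores = []
        for e, t_vec in model.entities.items():
            scores.append((dist(model.entities[h], model.relations[r], t_vec), e))
        scores.sort()
        return [e for _, e in scores[:k]]

    # e.g. predict_tail(transE, some_entity_id, some_relation_id), where both ids
    # are placeholders for keys present in transE.entities / transE.relations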
