LTP提供了一系列中文自然语言处理工具,用户可以使用这些工具对中文文本进行分词、词性标注、句法分析等工作。
ltp的官方文档里演示了分词、句法分析、语义依存关系提取等简单demo。本文在此基础上,用提取出的语义依存关系在neo4j平台上构建知识图谱,同时也会演示怎么使用python在neo4j上创建图谱。neo4j的安装比较简单,请自行查阅。
用ltp创建知识图谱至少需要3个信息:
- 节点类型
- 节点名字
- 节点间的关系
使用ltp提取文本关系:
本文只是简单演示,分析的句子是:他叫汤姆去拿外衣。
你也可以随意替换它。
from ltp import LTP
def ltp_data(sentence="他叫汤姆去拿外衣。"):
    """Run LTP on one sentence and return its semantic dependency graph.

    Semantic role labelling (``ltp.srl``) and dependency parsing
    (``ltp.dep``) are not needed to build the graph, so they are not run.

    Args:
        sentence: The Chinese sentence to analyse. Defaults to the demo
            sentence used in this walkthrough.

    Returns:
        A ``(sdp, pos, seg)`` tuple: the semantic dependency arcs from
        ``ltp.sdp(..., mode='graph')``, the part-of-speech tags and the
        segmented words. Each is indexed by sentence; callers read
        element 0.
    """
    ltp = LTP()
    # Word segmentation; `hidden` is handed to every later stage.
    seg, hidden = ltp.seg([sentence])
    # Part-of-speech tagging.
    pos = ltp.pos(hidden)
    # Named-entity recognition. A sentence may contain no entity at all,
    # so iterate over the results instead of indexing ner[0][0].
    ner = ltp.ner(hidden)
    for tag, start, end in ner[0]:
        print(tag, ":", "".join(seg[0][start:end + 1]))
    # Semantic dependency parsing (graph mode).
    sdp = ltp.sdp(hidden, mode='graph')
    return sdp, pos, seg
这里我们看一下返回的结果:
if __name__ == '__main__':
    # Print the three results in the order ltp_data() returns them:
    # dependency arcs, POS tags, segmented words.
    for result in ltp_data():
        print(result)
out:
[[(1, 2, 'AGT'), (2, 0, 'Root'), (3, 2, 'DATV'), (3, 4, 'AGT'), (3, 5, 'AGT'), (4, 2, 'eSUCC'), (5, 2, 'eSUCC'), (5, 4, 'eSUCC'), (6, 5, 'PAT'), (7, 2, 'mPUNC')]]
[['r', 'v', 'nh', 'v', 'v', 'n', 'wp']]
[['他', '叫', '汤姆', '去', '拿', '外衣', '。']]
标注和关系的具体含义参考ltp附录。
提取节点和关系:
整理上一步返回的结果,从里面提取出节点和关系。
提取节点:
def node_extraction(seg, pos):
    """Return the node names and node types of the first sentence.

    Args:
        seg: Segmentation result; ``seg[0]`` holds the words.
        pos: Part-of-speech result; ``pos[0]`` holds one tag per word.

    Returns:
        A ``(names, types)`` tuple of two new lists of ``str``.
    """
    # Build fresh lists instead of writing back into seg[0] / pos[0], so
    # the caller's data is left untouched (and immutable inputs work).
    names = [str(word) for word in seg[0]]
    types = [str(tag) for tag in pos[0]]
    return names, types
提取关系时需要用到创建的节点,因此用到了nodes这个参数,它是在后面创建节点函数那里生成的。
提取关系
def relation_extraction(sdp, nodes):
    """Turn semantic dependency arcs into ``[node1, node2, relation]`` triples.

    Each arc in ``sdp[0]`` is read as ``(index1, index2, label)`` with
    1-based word indices. Index 0 is used by the ``Root`` arc and has no
    matching entry in ``nodes``, so arcs that touch it are skipped.

    Args:
        sdp: Semantic dependency result; only ``sdp[0]`` is read.
        nodes: One node per word, in word order.

    Returns:
        A list of ``[node1, node2, relation]`` lists, where ``relation``
        is the arc label converted to ``str``.
    """
    if not sdp:
        return []

    rel = []
    for arc in sdp[0]:
        # Shift the 1-based word indices to 0-based list positions.
        index1 = int(arc[0]) - 1
        index2 = int(arc[1]) - 1
        # A 0 in the arc becomes -1 here, which would silently select the
        # LAST node (the punctuation mark) instead of failing.
        if index1 < 0 or index2 < 0:
            continue
        rel.append([nodes[index1], nodes[index2], str(arc[2])])
    return rel
创建节点和关系:
这一步是创建知识图谱,需要先连接上neo4j。建立连接时,第一个参数是用cmd启动neo4j时生成的网址(http://localhost:7474),第二个参数是用户名,第三个参数是密码。
from py2neo import Node, Graph, Relationship
from ltp_data import ltp_data
# 可以先阅读下文档:https://py2neo.org/v4/index.html
class DataToNeo4j(object):
    """Store nodes and relationships in a Neo4j graph through py2neo.

    Minimal py2neo usage, for reference::

        alice = Node('Person', name='Alice')
        bob = Node('Person', name='Bob')
        graph.create(alice)
        graph.create(bob)
        graph.create(Relationship(alice, 'know', bob))
    """

    def __init__(self, uri="your localhost", username="your username",
                 password="your password", clear=True):
        """Connect to Neo4j and, by default, empty the graph.

        Args:
            uri: Address of the Neo4j server, e.g. ``http://localhost:7474``.
            username: Neo4j user name.
            password: Neo4j password.
            clear: When true (the default, matching the previous
                behaviour) ``delete_all()`` is called on the graph right
                after connecting.
        """
        # NOTE(review): py2neo releases differ in which keyword they honour
        # for the user name (`user` vs `username`) -- confirm against the
        # installed version.
        self.graph = Graph(uri, username=username, password=password)
        if clear:
            # WARNING: this wipes the existing contents of the database.
            self.graph.delete_all()

    def create_node(self, name_node, type_node):
        """Create one node per name and return the nodes in input order.

        Args:
            name_node: Node names; each is stored in the ``name`` property.
            type_node: Node labels, aligned with ``name_node`` by position.

        Returns:
            The list of created ``Node`` objects.
        """
        nodes = []
        for index, name in enumerate(name_node):
            node = Node(type_node[index], name=name)
            self.graph.create(node)
            nodes.append(node)
        print('节点创建成功')
        return nodes

    def create_relation(self, rel):
        """Create a relationship for every ``(node1, node2, relation)`` triple.

        Args:
            rel: Iterable of triples; the relationship runs from the first
                element to the second and is typed by the third.
        """
        for triple in rel:
            start, end, label = triple[0], triple[1], triple[2]
            try:
                # The relationship type must be a string.
                self.graph.create(Relationship(start, str(label), end))
            except AttributeError as e:
                # Best effort: report the failing triple's error and go on.
                print(e)
        print('关系创建成功')
测试运行
if __name__ == '__main__':
    # Analyse the sentence, then connect to Neo4j.
    sdp, pos, seg = ltp_data()
    create_data = DataToNeo4j()
    # Create the nodes.
    node_name, node_type = node_extraction(seg, pos)
    nodes = create_data.create_node(node_name, node_type)
    # Create the relationships (the closing parenthesis was missing here).
    rel = relation_extraction(sdp, nodes)
    create_data.create_relation(rel)
效果
所有代码:
ltp_data.py
from ltp import LTP
def ltp_data(sentence="他叫汤姆去拿外衣。"):
    """Run LTP on one sentence and return its semantic dependency graph.

    Only the stages whose results are returned are executed; named-entity
    recognition, semantic role labelling and dependency parsing were
    previously computed and then discarded.

    Args:
        sentence: The Chinese sentence to analyse. Defaults to the demo
            sentence used in this walkthrough.

    Returns:
        A ``(sdp, pos, seg)`` tuple: the semantic dependency arcs from
        ``ltp.sdp(..., mode='graph')``, the part-of-speech tags and the
        segmented words. Each is indexed by sentence; callers read
        element 0.
    """
    ltp = LTP()
    # Word segmentation; `hidden` is handed to the later stages.
    seg, hidden = ltp.seg([sentence])
    # Part-of-speech tagging.
    pos = ltp.pos(hidden)
    # Semantic dependency parsing (graph mode).
    sdp = ltp.sdp(hidden, mode='graph')
    return sdp, pos, seg
if __name__ == '__main__':
    # Print arcs, POS tags and words, one result per line.
    print(*ltp_data(), sep="\n")
neo4j.py
# -*- coding: utf-8 -*-
from py2neo import Node, Graph, Relationship
from ltp_data import ltp_data
# 可以先阅读下文档:https://py2neo.org/v4/index.html
class DataToNeo4j(object):
    """Store nodes and relationships in a Neo4j graph through py2neo.

    Minimal py2neo usage, for reference::

        alice = Node('Person', name='Alice')
        bob = Node('Person', name='Bob')
        graph.create(alice)
        graph.create(bob)
        graph.create(Relationship(alice, 'know', bob))
    """

    def __init__(self, uri="your localhost", username="your username",
                 password="your password", clear=True):
        """Connect to Neo4j and, by default, empty the graph.

        Args:
            uri: Address of the Neo4j server, e.g. ``http://localhost:7474``.
            username: Neo4j user name.
            password: Neo4j password.
            clear: When true (the default, matching the previous
                behaviour) ``delete_all()`` is called on the graph right
                after connecting.
        """
        # NOTE(review): py2neo releases differ in which keyword they honour
        # for the user name (`user` vs `username`) -- confirm against the
        # installed version.
        self.graph = Graph(uri, username=username, password=password)
        if clear:
            # WARNING: this wipes the existing contents of the database.
            self.graph.delete_all()

    def create_node(self, name_node, type_node):
        """Create one node per name and return the nodes in input order.

        Args:
            name_node: Node names; each is stored in the ``name`` property.
            type_node: Node labels, aligned with ``name_node`` by position.

        Returns:
            The list of created ``Node`` objects.
        """
        nodes = []
        for index, name in enumerate(name_node):
            node = Node(type_node[index], name=name)
            self.graph.create(node)
            nodes.append(node)
        print('节点创建成功')
        return nodes

    def create_relation(self, rel):
        """Create a relationship for every ``(node1, node2, relation)`` triple.

        Args:
            rel: Iterable of triples; the relationship runs from the first
                element to the second and is typed by the third.
        """
        for triple in rel:
            start, end, label = triple[0], triple[1], triple[2]
            try:
                # The relationship type must be a string.
                self.graph.create(Relationship(start, str(label), end))
            except AttributeError as e:
                # Best effort: report the failing triple's error and go on.
                print(e)
        print('关系创建成功')
def node_extraction(seg, pos):
    """Return the node names and node types of the first sentence.

    Args:
        seg: Segmentation result; ``seg[0]`` holds the words.
        pos: Part-of-speech result; ``pos[0]`` holds one tag per word.

    Returns:
        A ``(names, types)`` tuple of two new lists of ``str``.
    """
    # Build fresh lists instead of writing back into seg[0] / pos[0], so
    # the caller's data is left untouched (and immutable inputs work).
    names = [str(word) for word in seg[0]]
    types = [str(tag) for tag in pos[0]]
    return names, types
def relation_extraction(sdp, nodes):
    """Turn semantic dependency arcs into ``[node1, node2, relation]`` triples.

    Each arc in ``sdp[0]`` is read as ``(index1, index2, label)`` with
    1-based word indices. Index 0 is used by the ``Root`` arc and has no
    matching entry in ``nodes``, so arcs that touch it are skipped.

    Args:
        sdp: Semantic dependency result; only ``sdp[0]`` is read.
        nodes: One node per word, in word order.

    Returns:
        A list of ``[node1, node2, relation]`` lists, where ``relation``
        is the arc label converted to ``str``.
    """
    if not sdp:
        return []

    rel = []
    for arc in sdp[0]:
        # Shift the 1-based word indices to 0-based list positions.
        index1 = int(arc[0]) - 1
        index2 = int(arc[1]) - 1
        # A 0 in the arc becomes -1 here, which would silently select the
        # LAST node (the punctuation mark) instead of failing.
        if index1 < 0 or index2 < 0:
            continue
        rel.append([nodes[index1], nodes[index2], str(arc[2])])
    return rel
if __name__ == '__main__':
    # Analyse the sentence first, then open the Neo4j connection.
    sdp, pos, seg = ltp_data()
    create_data = DataToNeo4j()

    # One node per word, labelled with its part-of-speech tag.
    names, types = node_extraction(seg, pos)
    nodes = create_data.create_node(names, types)

    # One relationship per semantic dependency arc.
    create_data.create_relation(relation_extraction(sdp, nodes))