1.正则表达式
正则表达式在处理文本方面发挥着重要的作用
1.re.match()
从字符串开头匹配,匹配成功返回一个匹配对象(Match),对它调用.group()可查看匹配到的具体的值;匹配不成功则返回None(此时再调用.group()会抛出AttributeError)
import re
# re.match() only succeeds when the pattern matches at the very start of the string.
print(re.match(r'a', 'abc123').group())  # a
print(re.match(r'A', 'abc123', flags=re.IGNORECASE).group())  # a -- re.IGNORECASE (re.I) ignores case
print(re.match(r'Ab', 'abc123', flags=re.IGNORECASE).group())  # ab
# 'ab123' does not begin with '12', so match() returns None instead of a match object.
print(re.match('12', 'ab123'))  # None
2.re.search()
从前往后扫描整个字符串(不要求从开头匹配),返回第一个匹配成功的匹配对象;若匹配不成功,返回None
# re.search() scans forward through the string and reports the first match it finds.
print(re.search(r'a', 'abc123').group())  # a
print(re.search(r'B', 'abc123', flags=re.IGNORECASE).group())  # b
print(re.search(r'Bc', 'abc123', flags=re.IGNORECASE).group())  # bc
print(re.search('12', 'ab123').group())  # 12
# span() returns the (start, end) index range of the matched text.
print(re.search('12', 'ab123').span())  # (2, 4)
3.re.findall()
从前往后匹配,返回匹配到的字符串列表
# findall() gathers every non-overlapping match into a list of strings.
print(re.findall(pattern=r'\d+', string='12abjh46hjk698bg7ghj8'))  # ['12', '46', '698', '7', '8']
print(re.findall(pattern=r'[a-zA-Z]+', string='my beautiful girl .'))  # ['my', 'beautiful', 'girl']
4.re.finditer()
和re.findall()差不多,只不过返回的不是列表,而是迭代器;迭代器中的每个元素是匹配对象(不是字符串),需要调用.group()才能取到匹配的文本
# finditer() returns an iterator of match objects, one per run of digits.
mymatch = re.finditer(r'\d+', '12abjh46hjk698bg7ghj8')
for match in mymatch:
    # Each item is a match object, so .group() is needed to get the text.
    print(match.group())
#输出
12
46
698
7
8
5.re.split()
将字符串按照正则表达式切分为列表
# Split on runs of whitespace. The pattern is a raw string so that the
# backslash in \s reaches the regex engine instead of being treated as an
# (invalid) string escape.
words = re.split(r'\s+', 'this is a dog')
# Print the result explicitly; a bare expression shows nothing when run as a script.
print(words)
#输出
['this', 'is', 'a', 'dog']
6.re.sub()
将正则表达式匹配到的部分进行替换
# Replace every 'a' with 'b'.
print(re.sub('a', 'b', 'abcabc'))  # bbcbbc
# Replace each run of whitespace with a dot. Raw string: '\s' in a normal
# string literal is an invalid escape sequence.
print(re.sub(r'\s+', '.', 'this is a dog'))  # this.is.a.dog
#输出:
bbcbbc
this.is.a.dog
7.关于group()
line = 'Cats are smarter than dogs'
# re.I makes the literal ' are ' case-insensitive. re.M only changes how ^ and $
# behave, and this pattern uses neither, so it has no effect here.
matchObj = re.match(r'(.*) are (.*?) .*', line, re.M | re.I)
if matchObj:
    print(matchObj.group())   # the whole match
    print(matchObj.group(1))  # first capture group
    print(matchObj.group(2))  # second capture group (non-greedy)
    print(matchObj.groups())  # tuple with every capture group
else:
    print('no match')
#输出:
Cats are smarter than dogs
Cats
smarter
('Cats', 'smarter')
#可以给分组命名
s = '1102231990xxxxxxxx'
# (?P<name>...) gives a capture group a name. The pattern is a raw string so
# that \d is passed to the regex engine rather than parsed as a string escape.
res = re.search(r'(?P<province>\d{3})(?P<city>\d{3})(?P<born_year>\d{4})', s)
# groupdict() maps each group name to the text it captured.
print(res.groupdict())
#输出:
{'province': '110', 'city': '223', 'born_year': '1990'}
2.中英文分词和词性标注
1.英文分词和词性标注
import nltk
from nltk.corpus import stopwords
def english_label(text):
    """Tokenize English text, drop punctuation and stopwords, then POS-tag it.

    Args:
        text: English text to process.

    Returns:
        The result of ``nltk.pos_tag`` on the remaining lower-cased tokens.
    """
    # Lower-case everything, then split into tokens.
    tokens = nltk.word_tokenize(text.lower())

    # Discard tokens that are one of the listed punctuation marks.
    punctuation_marks = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    tokens = [tok for tok in tokens if tok not in punctuation_marks]

    # Discard English stopwords.
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in tokens if tok not in stop_words]

    # Tag what is left with part-of-speech labels.
    return nltk.pos_tag(kept)
#测试一下
# Try the function on three short sentences.
mytext = 'This is a dog. That is a cat. I love them very much.'
print(english_label(text=mytext))
#输出:
[('dog', 'NN'), ('cat', 'NN'), ('love', 'VBP'), ('much', 'RB')]
2.中文分词和词性标注
import jieba
import jieba.posseg as pseg
def chinese_label(text, stops_path):
    """Segment Chinese text, remove stopwords, and POS-tag what is left.

    Args:
        text: Chinese text to process.
        stops_path: Path to a UTF-8 stopword file, read one entry per line.

    Returns:
        The iterable produced by ``jieba.posseg.cut`` over the filtered text.
    """
    # Segment the text into words.
    words = jieba.lcut(text)

    # Load the stopwords into a set so each membership test below is a hash
    # lookup instead of a scan over the whole list. The local name also
    # avoids reusing the identifier `stopwords`.
    with open(stops_path, encoding='utf-8') as fp:
        stop_set = {line.strip() for line in fp}
    kept = [word for word in words if word not in stop_set]

    # Join the surviving words with no separator and tag the resulting string.
    # NOTE(review): re-segmenting the joined string may produce words that
    # span a removed stopword -- confirm this is the intended behavior.
    joined = ''.join(kept)
    return pseg.cut(joined)
#测试一下
mytext = '我爱北京天安门,天安门上太阳升'
path = r'D:\python\1python\stopwords\stopwords-zh-master\stopwords-zh.txt'  # path to your stopword file
result = chinese_label(mytext, path)
# Print each item yielded by the tagger on its own line.
for a in result:
    print(a)
#输出
爱/v
北京/ns
天安门/ns
天安门/ns
太阳升/nr
3.命名实体识别NER
1.英文命名实体识别
#英文命名实体识别用的还是nltk
import nltk
from nltk.corpus import stopwords
#定义一个用于命名实体识别的函数
#命名实体识别时不要把大写字母变成小写,可能也不需要去停用词
def english_ner(text):
    """Run NLTK named-entity chunking on English text.

    Letter case is left untouched and stopwords are kept; only the listed
    punctuation tokens are removed before tagging.

    Args:
        text: English text to analyse.

    Returns:
        The result of ``nltk.chunk.ne_chunk`` on the POS-tagged tokens.
    """
    punctuation_marks = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

    # Tokenize and drop punctuation in one pass (no lower-casing, no stopword filter).
    tokens = [tok for tok in nltk.word_tokenize(text) if tok not in punctuation_marks]

    # POS-tag first; the entity chunker works on tagged tokens.
    tagged = nltk.pos_tag(tokens)
    return nltk.chunk.ne_chunk(tagged)
#测试一下
texts = 'This is a dog. That is a cat. I love them very much. Beijing is a beautiful city. London is a beautiful girl. I am very excited about the next generation of Apple products. I bought these Apple products today. His name is Jack'
# Split the paragraph into sentences.
mytext = nltk.sent_tokenize(texts)
print(mytext)
# Tokenize, tag and chunk each sentence in turn.
for text in mytext:
    print(english_ner(text))
#输出:
['This is a dog.', 'That is a cat.', 'I love them very much.', 'Beijing is a beautiful city.', 'London is a beautiful girl.', 'I am very excited about the next generation of Apple products.', 'I bought these Apple products today.', 'His name is Jack']
(S This/DT is/VBZ a/DT dog/NN)
(S That/DT is/VBZ a/DT cat/NN)
(S I/PRP love/VBP them/PRP very/RB much/RB)
(S (GPE Beijing/NNP) is/VBZ a/DT beautiful/JJ city/NN)
(S (GPE London/NNP) is/VBZ a/DT beautiful/JJ girl/NN)
(S
I/PRP
am/VBP
very/RB
excited/JJ
about/IN
the/DT
next/JJ
generation/NN
of/IN
(GPE Apple/NNP)
products/NNS)
(S I/PRP bought/VBD these/DT Apple/NNP products/NNS today/NN)
(S His/PRP$ name/NN is/VBZ (PERSON Jack/NNP))
2.中文命名实体识别
#中文命名实体识别
#可以用百度的LAC开源项目实现中文的命名实体识别
from LAC import LAC
import re
import jieba
import jieba.posseg as pseg
#定义一个函数实现中文命名实体识别,我写的这个函数是针对句子列表的,上面那些函数都是针对句子的
def chinese_ner(texts):
    """Run Baidu LAC over a list of Chinese sentences.

    Unlike the other helpers in this file, which take a single text, this
    one takes a list of sentences.

    Args:
        texts: List of Chinese sentences.

    Returns:
        Whatever ``LAC.run`` returns for ``texts``.
    """
    # A LAC model is loaded on each call; mode='seg' would do word segmentation instead.
    return LAC(mode='lac').run(texts)
#测试一下
corpus = '''我爱北京天安门。天安门上太阳升。朱一龙是个著名的青年男演员!百度是一家大公司。
华北制药集团有限责任公司工资高吗'''
# Split into sentences on the listed end-of-sentence punctuation.
delimiter = r'[。?;!]'
# Strip each piece: the corpus literal contains a line break, which would
# otherwise stay attached to the front of the sentence that follows it.
texts = [sent.strip() for sent in re.split(delimiter, corpus)]
# Drop the empty pieces left by the split.
texts = [sent for sent in texts if sent]
print(texts)
# Run named-entity recognition over the sentence list.
print(chinese_ner(texts))
#输出:
['我爱北京天安门', '天安门上太阳升', '朱一龙是个著名的青年男演员', '百度是一家大公司', '\n华北制药集团有限责任公司工资高吗']
[[['我', '爱', '北京', '天安门'], ['r', 'v', 'LOC', 'LOC']], [['天安门', '上', '太阳', '升'], ['LOC', 'f', 'n', 'v']], [['朱一龙', '是', '个', '著名', '的', '青年', '男演员'], ['PER', 'v', 'q', 'a', 'u', 'n', 'n']], [['百度', '是', '一家', '大公司'], ['ORG', 'v', 'm', 'n']], [['\n华北制药集团有限责任公司', '工资', '高', '吗'], ['ORG', 'n', 'a', 'xc']]]
4.句法分析
1.中文句法分析
#句法分析用的是斯坦福的包
#首先是中文的句法分析
from stanfordcorenlp import StanfordCoreNLP
# lang='zh' selects Chinese; use lang='en' for English.
nlp = StanfordCoreNLP(r'D:\wy\斯坦福句法分析\stanford-corenlp-full-2016-10-31', lang='zh')
try:
    sentence = '清华大学位于北京。'
    # Original author's note: tokenizing and POS tagging could not be used
    # for Chinese here, so those two calls stay disabled.
    # print(nlp.word_tokenize(sentence))
    # print(nlp.pos_tag(sentence))
    print(nlp.parse(sentence))  # bracketed phrase-structure tree
    print(nlp.dependency_parse(sentence))  # dependency parse
finally:
    # Shut down the backend CoreNLP server the wrapper started; otherwise it keeps running.
    nlp.close()
#输出:
(ROOT
(IP
(NP (NR 清华) (NN 大学))
(VP (VV 位于)
(NP (NR 北京)))
(PU 。)))
[('ROOT', 0, 3), ('compound:nn', 2, 1), ('nsubj', 3, 2), ('dobj', 3, 4), ('punct', 3, 5)]
2.英文句法分析
#然后是英文的,步骤跟上面完全一样,改一改参数就行了
from stanfordcorenlp import StanfordCoreNLP
# Same steps as for Chinese, with lang='en'.
nlp = StanfordCoreNLP(r'D:\wy\斯坦福句法分析\stanford-corenlp-full-2016-10-31', lang='en')
try:
    sentence = 'The dog is eating a meat on the desk.'
    print(nlp.word_tokenize(sentence))  # tokens
    print(nlp.pos_tag(sentence))  # part-of-speech tags
    print(nlp.parse(sentence))  # bracketed phrase-structure tree
    print(nlp.dependency_parse(sentence))  # dependency parse
finally:
    # Shut down the backend CoreNLP server the wrapper started; otherwise it keeps running.
    nlp.close()
#输出:
['The', 'dog', 'is', 'eating', 'a', 'meat', 'on', 'the', 'desk', '.']
[('The', 'DT'), ('dog', 'NN'), ('is', 'VBZ'), ('eating', 'VBG'), ('a', 'DT'), ('meat', 'NN'), ('on', 'IN'), ('the', 'DT'), ('desk', 'NN'), ('.', '.')]
(ROOT
(S
(NP (DT The) (NN dog))
(VP (VBZ is)
(VP (VBG eating)
(NP
(NP (DT a) (NN meat))
(PP (IN on)
(NP (DT the) (NN desk))))))
(. .)))
[('ROOT', 0, 4), ('det', 2, 1), ('nsubj', 4, 2), ('aux', 4, 3), ('det', 6, 5), ('dobj', 4, 6), ('case', 9, 7), ('det', 9, 8), ('nmod', 4, 9), ('punct', 4, 10)]