在Python中需要通过正则表达式对字符串进行匹配的时候,可以使用一个模块,名字为re
import re
# re.match only tries to match at the START of the string; it does not test
# whether the pattern occurs somewhere inside it (re.search does that).
# The result is kept in `result` rather than `bool` so the builtin stays usable.
result = re.match("hello", "hello world")
# A Match object is printed on success, None when the pattern does not match.
print(result)
# "[hH]" accepts either a lowercase or an uppercase h as the first character.
result = re.match("[hH]", "hello world")
print(result)
result = re.match("[hH]", "Hello world")
print(result)
. 匹配任意1个字符(除了\n)
import re
# "." matches any single character except a newline.
# Each pair below is (pattern, text to match against).
for pattern, text in (
    (".", "adfasdfasfdsa"),
    ("a.o", "aaodfasfsa"),
    ("aaa.o", "aaaooo"),
):
    ret = re.match(pattern, text)
    print(ret.group())
打印结果:
a
aao
aaaoo
[ ] 匹配[ ]中列举的字符
# [ ] 匹配[ ]中列举的字符
import re
# When "hello" starts with a lowercase letter the pattern needs a lowercase h.
ret = re.match("h", "hello Python")
print(ret.group())
# When it starts with an uppercase letter the pattern needs an uppercase H.
ret = re.match("H", "Hello Python")
print(ret.group())
# A character class accepts either case.
ret = re.match("[hH]", "hello Python")
print(ret.group())
ret = re.match("[hH]", "Hello Python")
print(ret.group())
ret = re.match("[hH]ello Python", "Hello Python")
print(ret.group())
# "[0-9]" matches any single digit from 0 to 9.
ret = re.match("[0-9]Hello Python", "7Hello Python")
print(ret.group())
# "[0-35-9]" matches 0-3 or 5-9.
ret = re.match("[0-35-9]Hello Python", "7Hello Python")
print(ret.group())
# The class above does not include 4, so re.match returns None here.
# Calling .group() on None raises AttributeError, hence the check.
ret = re.match("[0-35-9]Hello Python", "4Hello Python")
print(ret.group() if ret is not None else None)
import re
# "\d" matches a single decimal digit.  The patterns are raw strings so the
# backslash reaches the regex engine unchanged instead of being treated as an
# (invalid) Python string escape.
ret = re.match(r"aaa\da", "aaa1aa")
print(ret.group())
ret = re.match(r"aaa\da", "aaa2aa")
print(ret.group())
ret = re.match(r"aaa\da", "aaa3aa")
print(ret.group())
# 打印结果:
aaa1a
aaa2a
aaa3a
import re
# "\s" matches a single whitespace character; the pattern is a raw string so
# the backslash is passed through to the regex engine.
ret = re.match(r"aaa\sa", "aaa aa")
print(ret.group())
# The subject string is deliberately NOT raw: "\t" must become a real tab.
ret = re.match(r"aaa\sa", "aaa\taa")
print(ret.group())
#打印结果
aaa a
aaa a
import re
# "\w" matches a single word character; the pattern is a raw string so the
# backslash is passed through to the regex engine.
ret = re.match(r"aaa\wa", "aaataa")
print(ret.group())
# With str patterns "\w" also accepts non-ASCII word characters such as "我".
ret = re.match(r"aaa\wa", "aaa我aa")
print(ret.group())
#打印结果:
aaata
aaa我a
import re
# "*" repeats the preceding item zero or more times.
# One uppercase letter followed by any number of lowercase letters.
upper_then_lower = re.compile("[A-Z][a-z]*")
ret = upper_then_lower.match("Aaaabbbbb")
print(ret.group())
# Any number of characters drawn from a-z, a comma, or 1-9.
letters_comma_digits = re.compile("[a-z,1-9]*")
ret = letters_comma_digits.match("111dasf,asfsa")
print(ret.group())
打印结果:
Aaaabbbbb
111dasf,asfsa
Aabcdef
import re
# Check which candidate names begin like an identifier: one or more letters or
# underscores first, then any number of word characters.
names = ["name1", "_name", "2_name", "__name__", "$name__"]
for name in names:
    # NOTE: re.match only anchors at the start of the string, so characters
    # after the matched prefix are ignored rather than rejected.
    ret = re.match(r"[a-zA-Z_]+[\w]*", name)
    if ret:
        print(f"{name}合法,{ret.group()}")
    else:
        print(f"{name}不合法")
打印结果:
name1合法,name1
_name合法,_name
2_name不合法
__name__合法,__name__
$name__不合法
? 匹配前一个字符出现1次或者0次,即要么有1次,要么没有
import re
# "?" makes the preceding item optional: it appears once or not at all.
ret = re.match("[1-9]?[a-z]", "aaa")
print(ret.group())
# Raw string so that "\d" reaches the regex engine instead of being an
# invalid Python string escape.
ret = re.match(r"[1-9]?\d", "33fff")
print(ret.group())
打印结果:
a
33
0
import re
# "{m,n}" repeats the preceding item between m and n times; "{m}" exactly m.
# Each entry pairs a pattern with the text it is matched against.
cases = (
    # one or two digits in the range 1-9
    ("[1-9]{1,2}", "12dd"),
    # exactly six characters drawn from letters, comma, digits, underscore
    ("[a-zA-Z,0-9_]{6}", "12a3,g45678"),
    # between 8 and 20 characters drawn from letters, digits, underscore
    ("[a-zA-Z0-9_]{8,20}", "1ad12f23s34455ff66"),
)
for pattern, text in cases:
    ret = re.match(pattern, text)
    print(ret.group())
打印结果:
12
12a3,g
1ad12f23s34455ff66
import re
# "^" anchors the pattern to the start of the string: it must begin with "a".
# This string starts with "c", so re.match returns None.  Calling .group() on
# None raises AttributeError, so only print when there is a match (nothing is
# printed for this case).
ret = re.match("^a[a-z]*", "cdsfasdfasfd")
if ret is not None:
    print(ret.group())
# This string starts with "a", so the whole run of lowercase letters matches.
ret = re.match("^a[a-z]*", "adsfasdfasfd")
if ret is not None:
    print(ret.group())
结果打印:
adsfasdfasfd
import re
# Validate the address 99aa9dd9@qq.com.
# Raw strings keep "\w" and "\." intact for the regex engine; "\." matches a
# literal dot.
ret = re.match(r"^\w{4,20}@qq\.com", "99aa9dd9@qq.com")
print(ret.group())
# "$" additionally requires the string to end right after ".com".
ret = re.match(r"^[a-z0-9A-Z]{4,20}@qq\.com$", "99aa9dd9@qq.com")
print(ret.group())
结果打印:
99aa9dd9@qq.com
99aa9dd9@qq.com
import re
# Report which addresses are 4-20 alphanumeric characters followed by
# "@163.com" and nothing else ("^" and "$" anchor both ends).
email_list = ["xiaoWang@163.com", "xiaoWang@163.comheihei", ".com.xiaowang@qq.com"]
for email in email_list:
    # Raw string so "\." is a literal dot for the regex engine.
    ret = re.match(r"^[a-z0-9A-Z]{4,20}@163\.com$", email)
    if ret:
        print(f"{ret.group()}符合")
    else:
        print(f"{email}不符合")
|() 分组判断邮箱,这个|就是或者的意思,()小括号就是分组的意思
import re
# Report which addresses are 4-20 alphanumeric characters followed by one of
# the accepted domains.
email_list = ["xiaoWang@163.com", "xiaoWang@163.comheihei", ".com.xiaowang@qq.com", "123456@qq.com", "zhangsanaili@126.com"]
for email in email_list:
    # "(163|126|qq)" is a group whose alternatives are separated by "|":
    # any one of 163, 126 or qq is accepted.  Raw string so "\." is a
    # literal dot for the regex engine.
    ret = re.match(r"^[a-z0-9A-Z]{4,20}@(163|126|qq)\.com$", email)
    if ret:
        print(f"{ret.group()}符合")
    else:
        print(f"{email}不符合")
import re
# Split a URL into scheme, site name, top-level domain and page using groups.
# Every literal dot is escaped ("\."); an unescaped "." would match any
# character, so e.g. "wwwXbaidu" or "indexXhtml" would wrongly be accepted.
url_pattern = r"(http|https)://www\.([a-z0-9]+)\.(com|cn|net|org|vip)/(index\.html|index\.jsp)"
ret = re.match(url_pattern, "http://www.baidu.com/index.html")
# group(N) returns the text captured by the N-th parenthesised group.
print(ret.group(1))
print(ret.group(2))
print(ret.group(3))
print(ret.group(4))
注意:书写正则表达式时建议使用原始字符串 r"...",这样反斜杠不会被Python当作字符串转义处理,而是原样交给正则引擎,例如 r"\d"
# \num 引用分组num匹配到的字符串
import re
# "\N" refers back to the text captured by group N (counted left to right),
# so the closing tag must repeat exactly what the opening tag captured.
str1 = "<body><h1>asdfasfsa</h1></body>"
outer_tag = re.compile(r"<(\w*)>.*</\1>")
ret = outer_tag.match(str1)
print(ret.group())
# Two groups: "\2" must repeat the inner tag name and "\1" the outer one.
nested_tags = re.compile(r"<(\w*)><(\w*)>\w*</\2></\1>")
ret = nested_tags.match(str1)
print(ret.group())
import re
# "(?P<name>...)" gives a group a name and "(?P=name)" refers back to the
# text that the named group captured.
str1 = "<body><h1>asdfasfsa</h1></body>"
outer_tag = re.compile(r"<(?P<p1>\w*)>.*</(?P=p1)>")
ret = outer_tag.match(str1)
print(ret.group())
# Same idea with two named groups, p1 for the outer tag and p2 for the inner.
nested_tags = re.compile(r"<(?P<p1>\w*)><(?P<p2>\w*)>\w*</(?P=p2)></(?P=p1)>")
ret = nested_tags.match(str1)
print(ret.group())
import re
text = "python = 9999, c = 7890, c++ = 12345"
# re.search scans the string and returns the first match it finds.
ret = re.search(r"(\d+)", text)
print(ret.group())
# re.findall returns every match as a list of strings.
ret = re.findall(r"\d+", text)
print(ret)
sub 将匹配到的数据进行替换,例如:将匹配到的阅读次数加1
# First approach: replace every run of digits with a fixed string.
digits = re.compile(r"\d+")
ret = digits.sub('998', "python = 997")
print(ret)
# Second approach: pass a function instead of a replacement string.
def add(temp):
    """Return the matched number incremented by one, as a string.

    Args:
        temp: The match object handed over by re.sub for each match; its
            matched text must be parseable by int().

    Returns:
        The decimal string of the matched integer plus one.
    """
    return str(int(temp.group()) + 1)


# re.sub calls add() with each match and substitutes the string it returns.
ret = re.sub(r"\d+", add, "python = 997")
print(ret)
# Split "info:xiaoZhang 33 shandong" wherever a colon or a space occurs.
separator = re.compile(r":| ")
ret = separator.split("info:xiaoZhang 33 shandong")
print(ret)
完