import datetime
import re
import threading
from collections import defaultdict
from pathlib import Path
from queue import Empty, Queue
from urllib.parse import urlparse

from user_agents import parse
# Field converters: maps a regex capture-group name to a callable that turns
# the raw matched string into a typed value.  Fields with no entry stay str.
# (Lambdas defer the third-party `parse` lookup until a line is converted.)
expdict = {
    "datetime": lambda s: datetime.datetime.strptime(s, "%d/%b/%Y:%H:%M:%S %z"),
    "length": int,
    "status": int,
    "useragent": lambda s: parse(s),
}
# Parse a log file, one typed dict per line.
def loganalysis(file, encoding=None):
    """Parse a web-server access log line by line.

    Args:
        file: path (str or Path) of the log file to read.
        encoding: text encoding for open(); None uses the locale default.

    Yields:
        One dict per log line keyed by capture-group name (address, datetime,
        method, url, protocol, status, length, useragent).  Fields listed in
        ``expdict`` are converted to typed values; all others stay str.

    Raises:
        Exception: on the first line that does not match the log format —
            fail loudly rather than silently dropping data.
    """
    # Raw string: \d / \[ are regex escapes, not Python string escapes.
    # BUG FIX: original had (?P<length>\d)+ which captured only the LAST
    # digit of the response size; (?P<length>\d+) captures the whole number.
    restr = r'''^(?P<address>[\d.]{7,}) - . \[(?P<datetime>[^\]]*)\] "(?P<method>[^ ]+) (?P<url>[^ ]+) (?P<protocol>[^ ]+)" (?P<status>\d{3}) (?P<length>\d+) "[^ ]*" "(?P<useragent>[^"]+)"'''
    req = re.compile(restr)

    def _identity(x):
        # Default converter for fields without an expdict entry (hoisted out
        # of the per-line loop; the original built a fresh lambda per field).
        return x

    with Path(file).open(encoding=encoding) as f:
        for line in f:
            regex = req.match(line)
            if regex:  # line parsed successfully
                yield {k: expdict.get(k, _identity)(v) for k, v in regex.groupdict().items()}
            else:  # parse failure
                raise Exception("No match. {}".format(line))
# Load and parse log files from files and/or directories.
def load(*paths, ext="*.log", recursive: bool = False, encoding: str = "utf8"):
    """Yield parsed log records from every matching file under *paths*.

    Args:
        *paths: files or directories to read.
        ext: a glob pattern, or an iterable of patterns, matched inside
            directory paths (ignored for paths that are plain files).
        recursive: when True, search directories recursively.
        encoding: text encoding passed through to ``loganalysis``.

    Yields:
        Parsed log-record dicts (see ``loganalysis``).
    """
    # Normalize once, outside the loop (the original re-normalized per path).
    patterns = [ext] if isinstance(ext, str) else list(ext)
    for filepath in paths:
        path = Path(filepath)
        if path.is_dir():  # directory: glob for matching files
            # BUG FIX: original used rglob for BOTH branches, so
            # recursive=False still recursed; non-recursive must use glob.
            finder = path.rglob if recursive else path.glob
            for pattern in patterns:
                for f in finder(pattern):
                    yield from loganalysis(f.absolute(), encoding)
        else:  # plain file: read it directly
            yield from loganalysis(path.absolute(), encoding)
###### message-queue fan-out dispatcher
def dispatchar(src):
    """Broadcast every item of *src* to a set of handler threads.

    Args:
        src: any iterable of items to dispatch.

    Returns:
        (reg, run) pair:
          * reg(fun) — register *fun* as a handler; it will run in its own
            thread and receive items via a private Queue passed as its only
            argument.  Usable as a decorator.
          * run() — start all registered handler threads, then feed each
            item of *src* into every handler's queue.
    """
    handler = []   # one Thread per registered handler (not started yet)
    queueler = []  # the matching private Queue for each handler

    # handler registration
    def reg(fun):
        q = Queue()
        handler.append(threading.Thread(target=fun, args=(q,)))
        queueler.append(q)
        # BUG FIX: return the callable so @reg keeps the decorated name
        # bound to the function instead of rebinding it to None.
        return fun

    # queue dispatch
    def run():
        # start all handler threads
        for hd in handler:
            hd.start()
        # broadcast: every item goes to every handler's queue
        for item in src:
            for qu in queueler:
                qu.put(item)

    return reg, run
## Create the dispatcher, fed by all *.log files found (recursively) under
## the current directory.  reg registers handlers; run() starts the feed.
reg,run = dispatchar(load(".",ext="*.log",recursive=True))
# Register the per-IP hit-count analysis
@reg
def ip_handle(qu: Queue):
    """Count hits per client address; print the top 5 once the feed goes idle."""
    ipdict = defaultdict(int)  # address -> hit count
    while True:
        try:
            data = qu.get(timeout=3)  # blocking read; 3s of silence ends the run
        except Empty:
            # Queue idle: dispatching is finished — report and exit.
            # (Narrowed from a bare except, which also swallowed real errors
            # such as KeyboardInterrupt.)
            print("ip分析完成(查看前5名ip):")
            print(sorted(ipdict.items(), key=lambda x: x[1], reverse=True)[:5])
            break
        ip = data["address"]
        if ip:
            ipdict[ip] += 1
# Register the page-view (PV) analysis
@reg
def pv_handle(qu: Queue):
    """Count visits per URL path (and per IP within each path); print the top 5."""
    # path -> [total hits, {ip: hits}]
    pvdict = defaultdict(lambda: [0, defaultdict(int)])
    pvnoset = {"/"}  # paths excluded from the ranking
    while True:
        try:
            data = qu.get(timeout=3)  # 3s of silence ends the run
        except Empty:
            # Queue idle: report and exit (narrowed from a bare except).
            arr = sorted(pvdict.items(), key=lambda kv: kv[1][0], reverse=True)
            print("pv前5名:")
            for pathname, (hits, ips) in arr[:5]:
                print("url:{}\t访问次数:{}\t访问ip前5名:{}".format(
                    pathname, hits,
                    sorted(ips.items(), key=lambda x: x[1], reverse=True)[:5]))
            break
        url = data["url"]
        ip = data["address"]
        if url and url not in pvnoset:
            # Strip query/fragment: rank on the bare path only.
            pathname = urlparse(url).path
            entry = pvdict[pathname]
            entry[0] += 1
            entry[1][ip] += 1
# Client (browser) analysis
## Requires `parse` from the third-party user_agents package.
## Install: pip install pyyaml ua-parser user-agents
@reg  # register client analysis
def cilent_handle(qu: Queue):
    """Count hits per (browser family, version); print the top 5 and bottom 5."""
    # (family, version_string) -> [total hits, {ip: hits}]
    cildict = defaultdict(lambda: [0, defaultdict(int)])
    while True:
        try:
            data = qu.get(timeout=3)  # 3s of silence ends the run
        except Empty:
            # Queue idle: report and exit (narrowed from a bare except).
            print("客户端分析完成(查看前5名):")
            arr = sorted(cildict.items(), key=lambda x: x[1][0], reverse=True)
            print("前5名:")
            for k, v in arr[:5]:
                print("浏览器:{}\t访问次数:{}\tip详细次数:{}".format(
                    k, v[0], sorted(v[1].items(), key=lambda x: x[1], reverse=True)[:5]))
            print("后5名:")
            for k, v in arr[-5:]:
                print("浏览器:{}\t访问次数:{}\tip详细次数:{}".format(
                    k, v[0], sorted(v[1].items(), key=lambda x: x[1], reverse=True)[:5]))
            break
        # "useragent" was already parsed by expdict; .browser exposes the
        # family / version fields.  (Removed dead locals family/ver and the
        # bogus `uab:parse` annotation from the original.)
        uab = data["useragent"].browser
        if uab:
            ucname = uab.family, uab.version_string
            ip = data["address"]
            temp = cildict[ucname]
            temp[0] += 1
            temp[1][ip] += 1
## Start the analysis: launches every registered handler thread, then feeds
## them the parsed log stream; each handler reports after 3s of queue idle.
run()