#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from lib.re_util import ReUtil
# Page that the script downloads.
base_url = 'http://ns.meituan.com/meishi/b25710/'
# Cookie header text in "k1=v1; k2=v2" form; empty means "send no cookies".
cookies_str = ''
# Parsed name -> value mapping, passed as the cookies= argument of the request.
cookies_dict = {}
for cookie in cookies_str.split(";"):
    # An empty cookies_str (or a trailing ";") produces a blank segment, and a
    # segment without "=" cannot be unpacked into a key/value pair, so skip it
    # instead of raising ValueError.
    if "=" not in cookie:
        continue
    k, v = cookie.split("=", 1)
    cookies_dict[k.strip()] = v.strip()
# Request headers: a desktop-browser User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36',
}
# Download base_url. requests applies no timeout by default, so an
# unresponsive server would block the script forever; give up after 10 s.
page = requests.get(
    url=base_url,
    cookies=cookies_dict,
    headers=headers,
    timeout=10,
)
def get_element_from_html(raw_html):
    """Extract the text enclosed by '"poiInfos":' and '},"comHeader"'.

    Prints the first captured span of the first match (if there is one) and
    returns every match.

    Args:
        raw_html: Text to search, e.g. the body of the downloaded page.

    Returns:
        The list produced by ``findall`` on the regex built by
        ``ReUtil.get_regex``; empty when the markers are not present.
    """
    regex = ReUtil.get_regex(begin_with=['"poiInfos":'], end_with=['},"comHeader"'])
    result = regex.findall(raw_html)
    # Without this guard a page lacking the markers raised IndexError on
    # result[0]. Index 1 is presumably the text captured right after the
    # begin marker (depends on ReUtil.get_regex's group layout).
    if result:
        print(result[0][1])
    return result


get_element_from_html(page.text)
ReUtil 这个工具其实也够用了,但还是建议用 XPath 这种正规的方法来处理 HTML。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
class ReUtil:
    """Build and cache regexes of the form "begin ... must-contain ... end".

    All members are class-level; the class is used as a namespace and is
    never instantiated by the code in this module.
    """

    # Regex metacharacters that need a backslash to be matched literally.
    need_escape = {
        '\\': True,
        '^': True,
        '$': True,
        '.': True,
        '*': True,
        '+': True,
        '?': True,
        '{': True,
        '}': True,
        '(': True,
        ')': True,
        '[': True,
        ']': True,
        '|': True,
    }
    # Cache of compiled patterns, keyed by the pattern string.
    exits = {}

    @classmethod
    def get_regex(cls, begin_with=None, must_contain=None, end_with=None) -> 're.Pattern':
        """Return a compiled, cached regex for the given literal markers.

        Each argument may be ``None``/empty, a single string, or a list of
        alternative strings. The markers are matched literally and
        case-insensitively, and ``.`` also matches newlines (``re.DOTALL``).

        The pattern has five capturing groups, in this order: the begin
        marker, the text after it, the must-contain marker, the text after
        that, and the end marker.

        Args:
            begin_with: Literal(s) the match starts with.
            must_contain: Literal(s) that must appear inside the match.
            end_with: Literal(s) the match ends with.

        Returns:
            The compiled pattern; repeated calls with the same markers return
            the same object.
        """
        begin_with = cls.conver_to_list(begin_with)
        must_contain = cls.conver_to_list(must_contain)
        end_with = cls.conver_to_list(end_with)

        # NOTE: '(.*)?' is an optional *greedy* capture, not the lazy '(.*?)'.
        # It is kept unchanged because callers index into the resulting groups.
        pattern = ''.join((
            cls.list_to_restring(begin_with),
            '(.*)?',
            cls.list_to_restring(must_contain),
            '(.*)?',
            cls.list_to_restring(end_with),
        ))

        regex_obj = cls.exits.get(pattern)
        if regex_obj is None:
            regex_obj = re.compile(pattern, re.DOTALL)
            cls.exits[pattern] = regex_obj
        return regex_obj

    @classmethod
    def list_to_restring(cls, args: list) -> str:
        """Turn a list of literal strings into one capturing alternation.

        Every character listed in ``need_escape`` is backslash-escaped, the
        alternatives are joined with ``|``, and the result is wrapped in a
        single capturing group that matches case-insensitively. An empty
        list yields a group that matches the empty string.

        Args:
            args: Literal alternatives.

        Returns:
            The regex source for the group.
        """
        escaped = (
            ''.join('\\' + ch if ch in cls.need_escape else ch for ch in arg)
            for arg in args
        )
        # Use the scoped form (?i:...) rather than a bare (?i): a global flag
        # that is not at the very start of the pattern is rejected by
        # Python 3.11+. The scoped group is non-capturing, so group numbering
        # is unchanged.
        return '((?i:' + '|'.join(escaped) + '))'

    @classmethod
    def conver_to_list(cls, value) -> list:
        """Normalise *value* to a list.

        Falsy values become ``[]``, a list is returned unchanged, and anything
        else is wrapped in a one-element list.
        """
        if not value:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @classmethod
    def get_all_number_to_list(cls, string):
        """Return every number found in *string*, as strings.

        A number is one or more digits, optionally followed by a dot and
        further digits (so a trailing dot, as in "30.", is included).
        """
        # Raw string: backslash-d in a normal literal is an invalid escape.
        return re.findall(r'\d+\.?\d*', string)