Requests
Scrape the movie titles and lead actors from the Maoyan TOP100 board.
import time
import requests
from model import *            # model.py (shown below) provides the Infos ORM model and SQLsession
from bs4 import BeautifulSoup

headers = {"Content-Type": "text/html; charset=utf-8",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}

# First request: read the pager to find out how many pages the board has
url = "https://www.maoyan.com/board/4?timeStamp=1637291300330&channelId=40011&index=2&signKey=c286a92c2bb667036254185fde905f09&sVersion=1"
r = requests.get(url, timeout=3, headers=headers)
print(url)
soup = BeautifulSoup(r.text, "html5lib")
bb = soup.find('ul', class_="list-pager").text.replace("\n", "").split(" ")
page = [i for i in bb if i != '']   # drop the empty strings left over from the split
print(page[-2])                     # the second-to-last pager entry is the last page number

for i in range(0, int(page[-2])):
    p = i * 10                      # the board serves 10 movies per page via the offset parameter
    print(i)
    url = f"https://www.maoyan.com/board/4?timeStamp=1637053092611&sVersion=1&index=7&signKey=52df1051b8c1478e914905882e09e10a&channelId=40011&requestCode=c30fbaba9d9f7b73a53f83fe71ac0ec13ztmp&offset={p}"
    r = requests.get(url, timeout=3, headers=headers)
    print(url)
    time.sleep(3)                   # pause between requests to avoid hammering the site
    print(r.status_code)
    soup = BeautifulSoup(r.text, "html5lib")
    # Parse every entry on this page and pull out the title, cast, and release date
    aa = soup.find_all('div', class_="board-item-content")
    for a in aa:
        bb = []
        dd = a.find('p', class_="name")
        ff = a.find('p', class_="star")
        ss = a.find('p', class_="releasetime")

        movie_name = dd.getText().replace("\n", "")
        art_name = ff.getText().replace("\n", "").replace(" ", "").replace("主演:", "")
        movie_time = ss.getText().replace("\n", "").replace(" ", "").replace("上映时间:", "")
        print(movie_name)
        print(art_name)
        print(movie_time)
        bb.append(dict(movie_name=movie_name, art_name=art_name, movie_time=movie_time, remark='猫眼'))
        # Insert the record only if this movie is not already stored
        for l in bb:
            find_data = SQLsession.query(Infos).filter_by(movie_name=l['movie_name'], remark='猫眼').first()
            if not find_data:
                SQLsession.add(Infos(**l))
                SQLsession.commit()
# model.py — SQLAlchemy ORM model and database session used by the scraper above
from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime
import pymysql  # MySQL driver referenced by the mysql+pymysql connection URL

database = 'mysql+pymysql://root:1234@localhost/test_ten?charset=utf8mb4'

Base = declarative_base()
# Create the database engine and session
engine = create_engine(database)
DBSession = sessionmaker(bind=engine)
SQLsession = DBSession()


# ORM mapping for the movie table
class Infos(Base):
    __tablename__ = 'test_table1'
    id = Column(Integer(), primary_key=True)
    code = Column(String(255))
    movie_name = Column(String(255))
    art_name = Column(String(255))
    movie_time = Column(String(255))
    status = Column(Integer(), default=1)
    remark = Column(Text)
    # Pass the callable (not a fixed call result) so the timestamp is taken per row
    created = Column(DateTime, default=datetime.now)
    updated = Column(DateTime, default=datetime.now, onupdate=datetime.now)


Base.metadata.create_all(engine)
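Rows saved by the scraper can be read back through the same session; a minimal sketch using the Infos model and SQLsession defined above:

from model import Infos, SQLsession

# Minimal sketch: list the Maoyan rows stored by the scraper above
for row in SQLsession.query(Infos).filter_by(remark='猫眼').all():
    print(row.movie_name, row.art_name, row.movie_time)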
If Maoyan responds with its slider-verification page instead of the board, open the URL in a browser, complete the slider verification, and re-run the script; the data will then load normally.
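If you want the script to notice this case by itself, one option is to check the response for the verification page's marker text before parsing; the sketch below assumes the block page contains the phrase "验证中心", which you should adjust to whatever your block page actually shows.

import requests

# Hedged sketch: detect Maoyan's slider-verification page before trying to parse the board.
# The marker string "验证中心" is an assumption about the block page's content; adjust it as needed.
url = "https://www.maoyan.com/board/4"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
r = requests.get(url, timeout=3, headers=headers)
if "验证中心" in r.text:
    print("Maoyan returned a verification page; solve the slider in a browser and re-run.")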
Douban Movies
import requests
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/chart'
headers = {"Content-Type": "text/html; charset=utf-8",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
print(url)
r = requests.get(url, timeout=3, headers=headers)
soup = BeautifulSoup(r.text, "html5lib")
# Each movie on the chart sits inside a <div class="pl2"> block
name = soup.find_all('div', class_='pl2')
for i in name:
    aa = i.getText().replace("\n", "").replace("/", "")
    print(aa)
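getText() above flattens the title, alternate titles, and rating into one string. If only the main title and its link are wanted, here is a sketch that reads the <a> tag inside each pl2 block, assuming the chart page keeps its current markup:

import requests
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/chart'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
r = requests.get(url, timeout=3, headers=headers)
soup = BeautifulSoup(r.text, "html5lib")
for block in soup.find_all('div', class_='pl2'):
    link = block.find('a')                              # the first <a> holds the title and the detail-page URL
    if link is not None:
        title = link.getText().replace("\n", "").replace(" ", "")
        print(title, link.get('href'))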
Requests-HTML
# Use Requests-HTML to render the page (including Ajax-loaded content) before scraping it
from requests_html import HTMLSession

url = 'https://movie.douban.com/chart'
headers = {"Content-Type": "text/html; charset=utf-8",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
print(url)
# Requests-HTML only works through its own Session object (HTMLSession), built on top of Requests
session = HTMLSession()
r = session.get(url, headers=headers)
# render() loads the page in a headless Chromium browser and executes its JavaScript/Ajax
r.html.render()
name = r.html.find('div.pl2')
for g in name:
    print(g.text)
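Note that the first call to render() downloads a Chromium build, which can take a while. render() also accepts a few tuning parameters; the sketch below waits after loading and scrolls the page to trigger lazily loaded content (the values are illustrative, not tuned):

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://movie.douban.com/chart')
# sleep pauses after the page loads so Ajax responses have time to arrive;
# scrolldown scrolls the page to trigger lazily loaded content.
r.html.render(sleep=2, scrolldown=2)
print(len(r.html.find('div.pl2')))     # number of movie blocks found after rendering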
JSON parsing
Reference: "python的requests爬取Json数据,从Json数据中提取标题和图片(腾讯视频的某综艺节目)" by Mr.Pan_学狂 on the CSDN blog.
# Use Requests-HTML to render the tag page, then read the paginated JSON API behind it
from requests_html import HTMLSession
import json

url = 'https://movie.douban.com/tag/#/'
headers = {"Content-Type": "text/html; charset=utf-8",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
print(url)
# Requests-HTML only works through its own Session object (HTMLSession), built on top of Requests
session = HTMLSession()
r = session.get(url, headers=headers)
# render() loads the page in a headless Chromium browser and executes its Ajax requests
r.html.render()
# name = r.html.find('span.title')
# for g in name:
#     print(g.text)
for d in range(0, 5):
    p = d * 20                       # the API returns 20 movies per page via the start parameter
    url1 = f"https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start={p}"
    r2 = session.get(url1)
    print(r2.text)
    json_data = json.loads(r2.text)  # parse the JSON response body
    Data_ls = json_data['data']      # the 'data' key holds a list of movie dicts
    title_dt = {}                    # map each title to its cast list
    for D in Data_ls:
        try:                         # skip malformed entries instead of crashing
            if type(D) is dict:
                title_dt[D['title']] = D['casts']
        except Exception:
            continue
    print(title_dt)                  # print the title-to-cast mapping
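Since new_search_subjects already returns JSON, rendering the tag page is only needed if the rendered HTML itself is wanted; the API alone can be read with plain Requests and Response.json(). A sketch under the assumption that the endpoint accepts a request carrying only the usual User-Agent header:

import requests

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"}
title_dt = {}
for d in range(0, 5):
    p = d * 20
    url1 = f"https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start={p}"
    r2 = requests.get(url1, timeout=3, headers=headers)
    for D in r2.json().get('data', []):   # .json() parses the response body directly
        if isinstance(D, dict):
            title_dt[D['title']] = D['casts']
print(title_dt)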