Background
A while ago my advisor set a small goal: compile the papers our lab has published, see who has cited them, and find out which of the citing authors are big names, so the lab could bask in a little reflected glory. That led to the following requirements:
Requirements
- Find the lab's published papers and scrape their records
- For each paper, scrape the records of all of its citing articles
- For each citing article, scrape the h-index of every author
Approach
Four functions are needed, each implementing one of the following:
- extract_frame(): takes a results-page URL and returns the information of every article on that page (index number, title, title hyperlink, times cited, and the hyperlink to its citing articles); if the current page does not cover all the search results, it grabs the next-page link and calls itself recursively
- find_author_url(): takes a citing article's link and returns every author together with the hyperlink to that author's homepage
- find_author_hindex(): takes an author's homepage hyperlink and returns the h-index. One key point: a plain GET on the homepage URL does not return the needed information; the homepage link has to be parsed with parse and a new request URL and request headers constructed, which requires inspecting the packets the site returns (see the sketch after this list)
- log(): writes information to a txt file as a log, which both records the results and shows the current progress; with hundreds of papers, 0~150 citing articles each, and 0~7 authors per citing article, screen output alone is not enough
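To make that reconstruction step concrete: the author-homepage link carries the session ID (SID) and the author's daisIds in its query string, and these get repackaged into a request against the author API. A minimal sketch follows; the example link is made up, and only the SID/daisIds parameter names, the x-1p-wos-sid header, and the API endpoint match the full code further down.

from urllib import parse

# Hypothetical author-homepage link; real links carry many more query parameters.
homepage_url = 'https://apps.webofknowledge.com/some_author_page?SID=ABC123&daisIds=456789'

params = parse.parse_qs(parse.urlparse(homepage_url).query)   # {'SID': ['ABC123'], 'daisIds': ['456789']}

# Rebuild the request against the author API; the session ID moves into a request header.
api_url = 'https://app.webofknowledge.com/api/wosnx/rrc/author/' + params['daisIds'][0] + '?coAuthor=true'
headers = {'x-1p-wos-sid': params['SID'][0]}
# requests.get(api_url, headers=headers).json() then contains the 'hIndex' field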
Overall workflow
- Pass the search-results URL to extract_frame to get the full information on the lab's published papers; the hyperlink to the citing articles is the most important field, because that is where the citing articles are listed
- Iterate over the lab's papers, passing each paper's citing-articles hyperlink to extract_frame to get the full information on its citing articles; here the title hyperlink is the most important field, because that page is where the authors appear
- Iterate over all citing articles, passing each title hyperlink to find_author_url to get every author's name and homepage URL
- Iterate over all author homepage links and fetch each author's current h-index (the whole traversal is condensed in the sketch below)
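Condensed into code, the traversal looks like this, using the four functions defined in the Code section below (logging details and the resume logic of the full script are omitted; search_page_url stands in for the redacted search-results URL):

search_page_url = '...'  # the Web of Science search-results URL (redacted in the full code)
papers = extract_frame(search_page_url, [])
for paper in papers:
    if paper['cited_number'] == 0:
        continue  # nothing cites this paper, so there is no citing-articles page to visit
    for citing in extract_frame(paper['cited_url'], []):
        authors = find_author_url(citing['title_url'])  # {author name: homepage URL}
        for name, homepage in authors.items():
            log(f"Hindex:\t{find_author_hindex(homepage)}\tauthor:\t{name}")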
Results
P.S. More than 13,000 rows of data have been retrieved so far; the run was interrupted once along the way.
Hidden pitfalls
- When the citation count is 0 there is no citing-articles link, and sending a request for it raises an error, so this case must be checked
- Some authors have no homepage; they have no daisIds, and sending a request for them raises an error, so this case must be checked
- Getting the h-index requires constructing the URL and request headers yourself, which sets this function apart from the other two
- Interruptions still happen: across 186 papers the run breaks now and then for unknown reasons, possibly network issues, but simply rerunning from where it stopped works fine (a simple retry wrapper is sketched below)
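The script's own answer to these interruptions is to rerun and skip ahead via the num_index check in the main block below. As an alternative sketch, not part of the original code, the flaky network calls could be wrapped in a small retry helper:

import time
import requests

def with_retry(func, *args, retries=3, wait=5, **kwargs):
    # Call func and retry a few times on network errors before giving up.
    # Note that the whole call is repeated, so any pages it already fetched are fetched again.
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except requests.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(wait)

# e.g. cited_records_list = with_retry(extract_frame, search_record['cited_url'], [])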
Code
import requests, time, random
from lxml import etree
from urllib import parse

# Global definitions
search_records_list = []
author_homepage = 'https://app.webofknowledge.com/api/wosnx/rrc/author/'
url_base = 'https://apps.webofknowledge.com'
url_head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57'
}
log_file_name = f"log{time.localtime().tm_hour}_{time.localtime().tm_min}_{time.localtime().tm_sec}.txt"

def log(message):
    # Append a line to the log file and echo it to the screen.
    logfile = open(log_file_name, mode='a+', encoding='utf-8')
    logfile.write(message)
    print(message, end='')
    if set(message) != {'\t'}:  # indentation-only strings do not end the line
        logfile.write('\n')
        print('\n', end='')
    logfile.close()

def extract_frame(url, records_list):
    # Scrape every record on one results page, then recurse into the next page if there is one.
    time.sleep(0.5 + random.random())
    response = session.get(url, headers=url_head)
    response_html = etree.HTML(response.text)
    records = response_html.xpath('//div[contains(@id,"RECORD_")]')
    record_dict = {}
    for record in records:
        record_dict['num_index'] = record.xpath('.//div[@class="search-results-number-align"]/text()')[0]
        record_dict['title_paper'] = record.xpath('.//div[@class="search-results-content"]/div[1]/div[1]/descendant::value/text()')[0]
        record_dict['title_url'] = url_base + record.xpath('.//div[@class="search-results-content"]/div[1]/div[1]/descendant::a[@class="smallV110 snowplow-full-record"]/@href')[0]
        timescited = record.xpath('.//div[@class="search-results-data-cite"]/text()')[0]
        if timescited == '被引频次: 0' or timescited == 'Times Cited: 0':
            # Never cited, so there is no citing-articles link to record
            # (the results have to be sorted by times cited for this to hold).
            record_dict['cited_number'] = 0
            record_dict['cited_url'] = None
        else:
            record_dict['cited_number'] = record.xpath('.//a[@class="snowplow-times-cited-link"]/text()')[0]
            record_dict['cited_url'] = url_base + record.xpath('.//a[@class="snowplow-times-cited-link"]/@href')[0]
        print(f"{record_dict['num_index']} 《{record_dict['title_paper']}》 Has Been Built Successfully")
        records_list.append(record_dict.copy())
    nextpage_url = response_html.xpath('//a[contains(@class,"paginationNext")]/@href')[0]
    if not nextpage_url == 'javascript: void(\'paginationNext\')':
        # There is a next page of results, so recurse into it.
        extract_frame(nextpage_url, records_list)
    return records_list

def find_author_url(url):
    # Open a citing article's full-record page and collect its authors.
    time.sleep(0.5 + random.random())
    response = session.get(url, headers=url_head)
    response_html = etree.HTML(response.text)
    name_list = response_html.xpath('//a[@title="Find more records by this author"]/text()')
    name_url = response_html.xpath('//a[@title="Find more records by this author"]/@href')
    name_dict = dict(zip(name_list, name_url))  # pair each author name with that author's homepage URL
    print(f"\tAuthors: {name_list} Have Been Found")
    return name_dict

def find_author_hindex(url):
    # Rebuild the author-API request from the query parameters of the homepage URL.
    url_parts = parse.urlparse(url)
    url_params = parse.parse_qs(url_parts.query)
    author_head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'x-1p-wos-sid': url_params['SID'][0]
    }
    if 'daisIds' in url_params:  # some authors have no homepage and therefore no daisIds
        author_detail_url = author_homepage + url_params['daisIds'][0] + '?coAuthor=true'
        time.sleep(0.5 + random.random())
        author_detail_response = session.get(author_detail_url, headers=author_head)
        author_detail_dict = author_detail_response.json()  # parse the JSON body returned by the author API
        return author_detail_dict['hIndex']
    else:
        return 0

if __name__ == "__main__":
    url_test = '***手动打码***'  # search-results page URL (manually redacted)
    session = requests.session()
    search_records_list = extract_frame(url_test, search_records_list)
    for search_record in search_records_list:
        log(search_record['num_index'] + search_record['title_paper'])
        num_index_float = float(search_record['num_index'])
        if num_index_float < 115:  # after an unexpected crash, skip ahead and resume from a given paper
            continue
        if search_record['cited_number'] == 0:
            log('\tNo Cited Record Found')
            continue
        cited_records_list = []
        cited_records_list = extract_frame(search_record['cited_url'], cited_records_list)
        for cited_record in cited_records_list:
            log('\t')
            log(cited_record['num_index'] + cited_record['title_paper'])
            log('\t')
            log(cited_record['title_url'])
            authors_dict = find_author_url(cited_record['title_url'])
            for key in authors_dict.keys():
                author_hindex = find_author_hindex(authors_dict[key])
                log('\t\t')
                log('Hindex:\t' + str(author_hindex) + '\tauthor:\t' + key)
    print('end')