python简单的论文查重 python查找论文

转载

mob64ca13ff9303 2023-10-22 20:58:12

文章标签 python简单的论文查重 python 爬虫论文下载 html 文章分类 Python 后端开发

这里写自定义目录标题

Python 批量下载SIGMOD,VLDB的论文 Mac OS

实现

0、要爬取的网站
1、下载单篇论文
2、获得所有论文的链接

完整代码

Python 批量下载SIGMOD,VLDB的论文 Mac OS

因为一个个找带某一关键词的论文太累了，因此写了一个python脚本来下载论文，可以支持关键字寻找，批量下载。
目前只适合SIGMOD和VLDB。

需要下载bs4, requests模块。

实现

推荐一个网站https://dblp.org，收录了比较多的论文。
本python脚本就是通过爬取这个网站，获得某一年的SIGMOD/VLDB全部论文，然后对论文标题逐一进行关键词筛选来实现的。

0、要爬取的网站

假设我们要下载SIGMOD2019中带有privacy关键词的论文，我们从这个网站https://dblp.org/db/conf/sigmod/sigmod2019.html找到所有论文。任意点一篇进去，如Answering Multi-Dimensional Analytical Queries under Local Differential Privacy这篇文章，跳转到https://dl.acm.org/doi/10.1145/3299869.3319891里。

1、下载单篇论文

我们先来看如何用python脚本下载单篇论文。假设我们要下载SIGMOD的这篇论文：https://dl.acm.org/doi/10.1145/3299869.3319891

如图，右键网站上的“PDF”–“检查”，可以看到右边红色框内的pdf链接。点击链接会跳转到下面的网站https://dl.acm.org/doi/pdf/10.1145/3299869.3319891，就是我们要下载的论文地址。

python简单的论文查重 python查找论文_html

如果我们知道了链接地址，可以直接用下面的代码下载（完整代码中封装在download_one_paper.py的get_paper(url, folder, filename)函数中）：

import requests

# Direct PDF link of the paper to fetch.
url = 'https://dl.acm.org/doi/pdf/10.1145/3299869.3319891'
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as '1.pdf'.
r.raise_for_status()
# The with-statement closes the file on exit, so no explicit close() is needed.
with open('1.pdf', 'wb') as f:
    f.write(r.content)

2、获得所有论文的链接

可以通过链接下载论文以后，我们需要考虑如何获得所有论文的链接。

网页https://dblp.org/db/conf/sigmod/sigmod2019.html正好提供了这些链接，SIGMOD论文的下载地址都是如下图所示的格式。

我们可以用BeautifulSoup模块获得html的解析格式，再遍历如下格式的链接，获得所有论文地址。

python简单的论文查重 python查找论文_论文下载_02

如下是获取所有论文链接的代码（完整代码封装在download_papers.py的get_paper_url_list(html)函数中）

from bs4 import BeautifulSoup
import requests

def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    The request uses a 30-second timeout, and the body is decoded with the
    encoding that requests detects from the content (``apparent_encoding``).

    :param url: address of the page to fetch
    :return: the page text, or the literal string ``"getHTMLText error!"``
        when the request fails or the server answers with an error status
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures become the sentinel string; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return "getHTMLText error!"

# Fetch and parse the dblp table of contents for SIGMOD 2019.
html = getHTMLText('https://dblp.org/db/conf/sigmod/sigmod2019.html')
soup = BeautifulSoup(html, 'html.parser')

# Collect the href of every <a> tag that points at a DOI.
paper_url_list = []
for content in soup.find_all('a'):
    url = content.get('href')  # None when the tag has no href attribute
    if url is not None and url.startswith('https://doi.org/'):
        paper_url_list.append(url)
# dict.fromkeys drops duplicates while keeping page order; set() would
# shuffle the links on every run.
paper_url_list = list(dict.fromkeys(paper_url_list))
for url in paper_url_list:
    print(url)

完整代码

包括两个文件download_one_paper.py和download_papers.py，第一个文件用来下载一篇论文，第二个文件获得所有论文链接后调用第一个文件下载论文。
用的时候直接:

$ python download_papers.py

在github上有相应代码：https://github.com/VFVrPQ/Python_Real_Learning/tree/master/download_paper 。另外，第一个文件中还增加了获取论文标题并用它生成下载后文件名的功能。

# download_one_paper.py
import requests
import os
from bs4 import BeautifulSoup

# Keywords used to filter papers: get_paper_name() keeps a paper when its
# lower-cased title contains at least one of these substrings.
TITLE_SHAI= ['privacy', 'private', 'differential', 'local', 'location', 'crowd', 'spatial']

def get_paper(url, folder, filename):
    """Download a single paper and save it as ``folder/filename``.

    Nothing is downloaded when the target file already exists. Download
    problems are reported on stdout instead of being raised, so one bad
    paper does not abort a batch run.

    :param url: URL of the PDF to download; ``None`` is reported as a failure
    :param folder: local directory to save into (created if missing,
        including intermediate directories)
    :param filename: name of the file inside ``folder``
    """
    # makedirs handles nested folders, which os.mkdir cannot create, and
    # exist_ok avoids the check-then-create race.
    os.makedirs(folder, exist_ok=True)

    path = os.path.join(folder, filename)
    if os.path.exists(path):
        print("%s文件已存在" % (filename))
        return

    if url is None:
        # No PDF link was found for this paper, so there is nothing to request.
        print("%s:爬取失败" % (url))
        return

    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        print("%s:爬取失败" % (url))
        return

    # Write only after a successful response so that an HTTP error page is
    # never stored under a .pdf name.
    with open(path, 'wb') as f:
        f.write(r.content)
    print("%s文件保存成功" % (filename))

def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    The request uses a 30-second timeout, and the body is decoded with the
    encoding that requests detects from the content (``apparent_encoding``).

    :param url: address of the page to fetch
    :return: the page text, or the literal string ``"getHTMLText error!"``
        when the request fails or the server answers with an error status
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures become the sentinel string; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        return "getHTMLText error!"

def get_paper_name(html):
    """Extract the paper title from a page and test it against TITLE_SHAI.

    :param html: HTML text of the paper's landing page
    :return: ``(matched, title)`` where ``title`` is the text of the first
        ``<h1>`` with ':' replaced by '-' (empty string when the page has
        no ``<h1>``), and ``matched`` is True when the lower-cased title
        contains at least one keyword from TITLE_SHAI
    """
    soup = BeautifulSoup(html, 'html.parser')
    heading = soup.find('h1')
    if heading is None:
        # find() returns None when there is no <h1>, e.g. for the error
        # text produced by a failed fetch; iterating it would raise.
        return False, ''

    # get_text() joins all nested text, so inline markup inside the heading
    # does not truncate the title to its last child node.
    title = heading.get_text().replace(':', '-')

    lowered = title.lower()
    return any(shai in lowered for shai in TITLE_SHAI), title

def get_pdf_url(html):
    """Return the PDF download URL found in a paper's landing page.

    Anchors are scanned in document order and the first match wins:
    an href starting with '/doi/pdf/' is prefixed with 'https://dl.acm.org',
    and an href starting with 'https://link.springer.com/content/pdf/' is
    returned as is.

    :param html: HTML text of the paper's landing page
    :return: the PDF URL, or None when no anchor matches
    """
    acm_prefix = '/doi/pdf/'
    springer_prefix = 'https://link.springer.com/content/pdf/'

    page = BeautifulSoup(html, 'html.parser')
    for anchor in page.find_all('a'):
        # Example anchor on dl.acm.org:
        # <a class="btn big stretched red" href="/doi/pdf/10.1145/3299869.3319891" title="View or Download as a PDF file">...PDF</a>
        href = anchor.get('href')
        if href is None:
            continue
        if href.startswith(acm_prefix):  # SIGMOD
            return 'https://dl.acm.org' + href
        if href.startswith(springer_prefix):  # VLDB
            return href
    return None


def download_one_paper(url, year, typ, conf):
    """Download one paper if its title matches the keyword filter.

    Fetches the landing page, extracts the title, skips the paper when the
    title contains no keyword, then looks up the PDF link and saves the
    file under ``./paper`` as ``<year>-<typ>-<conf>-<title>.pdf``.

    :param url: landing-page URL of the paper
    :param year: publication year (string, used in the file name)
    :param typ: CCF rank category (string, used in the file name)
    :param conf: venue name (string, used in the file name)
    """
    print(url)
    html = getHTMLText(url)
    like, papername = get_paper_name(html)
    if not like:
        print('没有关键字： %s' % (papername))
        return

    pdf_url = get_pdf_url(html)
    if pdf_url is None:
        # get_pdf_url found no known PDF link on this page; report it here
        # instead of handing None to the downloader.
        print("%s:爬取失败" % (url))
        return

    # A '/' in the title would be read as a directory separator and make the
    # save path point into a folder that does not exist.
    safe_name = papername.replace('/', '-')
    get_paper(url=pdf_url, folder='./paper', filename=year+'-'+typ+'-'+conf+'-'+safe_name+'.pdf')

if __name__ == "__main__":
    # Example run: a single SIGMOD 2019 paper hosted on dl.acm.org.
    download_one_paper(
        url='https://dl.acm.org/doi/10.1145/3299869.3319891',
        year='2019',
        typ='A',
        conf='SIGMOD',
    )
    # Alternative example for a VLDB paper hosted on link.springer.com:
    # download_one_paper('https://link.springer.com/article/10.1007/s00778-019-00568-7', '2020', 'A', 'VLDB')
    # IEEE Xplore example, marked by the original author as failing:
    # download_one_paper('https://ieeexplore.ieee.org/document/9155359', '2020', 'A', 'INFOCOM')

# download_papers.py
import requests
import os
from bs4 import BeautifulSoup
# 本地导入
from download_one_paper import getHTMLText, download_one_paper

def get_paper_url_list(html):
    """Collect the DOI links of all papers listed on a dblp page.

    :param html: HTML text of the dblp table-of-contents page
    :return: list of unique hrefs that start with 'https://doi.org/',
        in the order they first appear on the page
    """
    soup = BeautifulSoup(html, 'html.parser')
    hrefs = (content.get('href') for content in soup.find_all('a'))
    doi_urls = [
        url for url in hrefs
        if url is not None and url.startswith('https://doi.org/')
    ]
    # dict.fromkeys drops duplicates while keeping page order; set() would
    # make the download order differ from run to run.
    return list(dict.fromkeys(doi_urls))

if __name__ == "__main__":
    # Venues to crawl. Disabled entries must be '#' comments: a triple-quoted
    # string placed inside the list is a real str element, and conf['url']
    # in the loop below would raise TypeError on it.
    conf_list = [
        {
            'url': 'https://dblp.org/db/journals/vldb/vldb29.html',
            'year': '2020',
            'typ': 'A',
            'conf': 'VLDB'
        },
        {
            'url': 'https://dblp.org/db/journals/vldb/vldb28.html',
            'year': '2019',
            'typ': 'A',
            'conf': 'VLDB'
        },
        # {
        #     'url': 'https://dblp.org/db/conf/sigmod/sigmod2019.html',
        #     'year': '2019',
        #     'typ': 'A',
        #     'conf': 'SIGMOD'
        # },
    ]
    for conf in conf_list:
        html = getHTMLText(conf['url'])  # page listing the venue's papers
        paper_url_list = get_paper_url_list(html)  # DOI links of all papers

        totnum_list = len(paper_url_list)
        for i, paper_url in enumerate(paper_url_list, start=1):
            # Progress indicator; paper_url looks like
            # 'https://doi.org/10.1145/3299869.3314037'.
            print('\ndealing with %d/%d=%f%%' % (i, totnum_list, 100.0*i/totnum_list))
            download_one_paper(paper_url, conf['year'], conf['typ'], conf['conf'])

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。