Paper Downloads
https://publications.cispa.saarland/view/year/2023.type.html
S&P
https://www.ieee-security.org/TC/SP2023/program-papers.html
https://www.ieee-security.org/TC/SP2022/program-papers.html
https://www.ieee-security.org/TC/SP2021/program-papers.html
CCS https://dl.acm.org/doi/proceedings/10.1145/3460120
Top Security Conferences
CCS
The ACM DL bans your IP for about two hours after roughly 30 downloads, so the script sleeps for two hours after each ban and then resumes crawling.
Environment dependencies:
- Python libraries: selenium, bs4, requests
- A chromedriver matching your Chrome version; point chrome_driver_path in init_driver() at its location
Usage:
python downloadccs.py --year 2021
The script first crawls each paper's DOI-based PDF URL and stores them in data.json, then downloads from data.json, sleeping two hours after each IP ban before resuming.
Crawling the URLs uses selenium; the subsequent downloads use requests and bs4.
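For reference, data.json ends up with roughly this shape (field names match the script below; the title and URL are made-up placeholders). Note the script expects data.json to exist on first run, so an empty skeleton such as {"conferences": {}} is a valid starting point:

{
    "conferences": {
        "2021": {
            "Example Paper Title": {
                "download_url": "https://dl.acm.org/doi/pdf/10.1145/XXXXXXX.XXXXXXX",
                "isdownloaded": false
            }
        }
    }
}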
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests
import re
import os
import argparse
import configparser
import datetime
from bs4 import BeautifulSoup
import json
# Unused helper: collect the PDF URLs for a list of topic pages
def generate_url_config(urls, driver):
    url_list = []
    for url in urls:
        url_list.append(gather_pdf_url_from_topics(url, driver))
    return url_list
# Send an HTTP GET request with requests to download the file at url as pdf_name
def download_pdf(url, pdf_name, year):
    # Strip characters that are not valid in file names, replacing them with spaces
    pdf_name = re.sub(r'[\\/:*?"<>|]', ' ', pdf_name)
    # Fetch the paper
    response = requests.get(url)
    # Create the download directory
    download_dir = os.path.join(os.getcwd(), str(year))
    os.makedirs(download_dir, exist_ok=True)
    if response.status_code == 200:
        # Build the download path; adjust the location as needed
        download_path = os.path.join(os.getcwd(), str(year), f'{pdf_name}.pdf')
        time.sleep(5)
        if os.path.exists(download_path):
            print(f'{download_path} already exists')
            return 1
        else:
            print(f'{download_path} does not exist, downloading')
            # Save the paper as a PDF file
            with open(download_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            print(f'Paper saved to: {download_path}')
            return 1
    else:
        print(f'Failed to download the paper, HTTP status code: {response.status_code}')
        return 0
# Collect the PDF URLs under one topic page; returns a dict of pdf_name: pdf_url pairs
def gather_pdf_url_from_topics(url, driver):
    print("Visiting topic {}".format(url))
    pdf_urls = {}
    driver.get(url)
    # Extract each paper's container; we only want research-article entries
    items = driver.find_elements(By.XPATH, './/div[@class="issue-item-container"]')
    for item in items:
        paperType = item.find_element(By.XPATH, './/div[@class="issue-heading"]').text.split("\n")[0]
        print(paperType)
        if paperType == "RESEARCH-ARTICLE":
            # Paper title
            pdf_name = item.find_element(By.XPATH, './/h5[@class="issue-item__title"]/a').text
            # PDF link
            pdf_url = item.find_element(By.XPATH, './/a[@class="btn--icon simple-tooltip__block--b red btn"]').get_attribute("href")
            # Record it (the actual download happens later, from data.json)
            #download_pdf(pdf_url, pdf_name)
            pdf_urls[pdf_name] = pdf_url
    print(pdf_urls)
    return pdf_urls
# Initialize the Chrome WebDriver
def init_driver():
    # Path to chromedriver
    chrome_driver_path = 'chromedriver.exe'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--ignore-certificate-errors-spki-list')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # This option would set the download directory
    #chrome_options.add_experimental_option("prefs", {"download.default_directory": 'E:\\Temp'})
    # Create the Chrome WebDriver instance
    chrome_service = ChromeService(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    return driver
# Visit the list of past CCS proceedings and extract the URL for the given year
def get_conference_urls(year):
    # Fetch the proceedings index
    url = 'https://dl.acm.org/conference/ccs/proceedings'
    response = requests.get(url)
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all divs holding a conference title
    divs = soup.find_all('div', class_='conference__title left-bordered-title')
    # Walk the divs and extract the <a> tags inside
    for div in divs:
        a_tags = div.find_all('a')
        for a_tag in a_tags:
            conf_year = a_tag.text.split("the")[1].split("ACM")[0].strip()
            if year == conf_year:
                return "https://dl.acm.org" + a_tag['href']
# Read the year to download from the command line
def parser_year():
    current_year = datetime.datetime.now().year
    # Build the argument parser
    parser = argparse.ArgumentParser(description='Download the PDFs for the given year')
    # --year defaults to the previous year
    parser.add_argument('--year', type=str, default=str(current_year - 1), help='Year to download (default: previous year)')
    # Parse the arguments
    args = parser.parse_args()
    # Return the year as a string
    return str(args.year)
# Collect the URLs of all topics (sessions) in the proceedings
def select_topic_urls(ccs_url, driver):
    # Open the proceedings page
    driver.get(ccs_url)
    # Wait for the page to load
    time.sleep(10)
    # Grab the URL of every topic
    downSessions = driver.find_elements(By.XPATH, '//a[@class="section__title accordion-tabbed__control left-bordered-title"]')
    topic_urls = []
    for session in downSessions:
        topic_urls.append(session.get_attribute("href"))
    return topic_urls
if __name__ == '__main__':
    download_year = parser_year()  # set via --year, e.g. 2021
    # Load the config file
    f = open('data.json', 'r')
    data = json.load(f)
    f.close()
    print("Downloading CCS proceedings for year:", download_year)
    if str(download_year) in data['conferences']:
        print("Config already present, downloading directly!")
        for pdfs in data['conferences'][download_year].items():
            pdf_url = pdfs[1]['download_url']
            pdf_name = pdfs[0]
            if data['conferences'][download_year][pdf_name]['isdownloaded'] != True:  # only download papers not yet marked as downloaded
                print(pdfs[1]['download_url'])
                if download_pdf(pdf_url, pdf_name, download_year):  # on success, record it in the config
                    data['conferences'][download_year][pdf_name]['isdownloaded'] = True
                    with open('data.json', 'w') as updated_json_file:  # update the config file
                        json.dump(data, updated_json_file, indent=4)
                else:
                    print("IP has been banned, sleeping for 2 hours")
                    time.sleep(60 * 60 * 2)
    else:
        driver = init_driver()
        data['conferences'][download_year] = {}
        # Build the config
        pdfname_url = {}
        # Get the URL of the target year's CCS proceedings
        ccs_url = get_conference_urls(download_year)
        # Get the URLs of all topics in the proceedings
        topic_urls = select_topic_urls(ccs_url, driver=driver)
        # Collect {pdf_name: pdf_url} pairs
        for topic_url in topic_urls:
            pdfname_url.update(gather_pdf_url_from_topics(topic_url, driver=driver))
        target_year_data = data['conferences'][download_year]
        # Record {pdf_name: pdf_url} in the config
        for pdf_name, download_url in pdfname_url.items():
            target_year_data[pdf_name] = {
                "download_url": download_url,
                "isdownloaded": False
            }
        # Write the config file
        with open('data.json', 'w') as updated_json_file:
            json.dump(data, updated_json_file, indent=4)
        # ------------------------------------------------------------------
        print("Config file written! Starting downloads")
        f = open('data.json', 'r')
        data = json.load(f)
        f.close()
        for pdfs in data['conferences'][download_year].items():
            pdf_url = pdfs[1]['download_url']
            pdf_name = pdfs[0]
            if data['conferences'][download_year][pdf_name]['isdownloaded'] != True:  # only download papers not yet marked as downloaded
                print(pdfs[1]['download_url'])
                if download_pdf(pdf_url, pdf_name, download_year):
                    data['conferences'][download_year][pdf_name]['isdownloaded'] = True
                    with open('data.json', 'w') as updated_json_file:  # update the config file
                        json.dump(data, updated_json_file, indent=4)
                else:
                    print("IP has been banned, quitting downloads")
                    break
S&P
https://ieeexplore.ieee.org/xpl/conhome/1000646/all-proceedings lists the proceedings by year; the URLs for the last three years are below.
Download with selenium:
1. Update Chrome; enter chrome://version/ in the browser to check your Chrome version
https://www.itbenet.com/wenz/223586.html
2. Download the matching chromedriver and unzip the exe (a quick version check is sketched below this list)
https://googlechromelabs.github.io/chrome-for-testing/#stable
3. Point the script at your own exe location: chrome_driver_path = 'F:\\develop\\chromedriver.exe'
4. pip install selenium
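To confirm the driver matches the browser, here is a minimal sketch (assuming chromedriver.exe sits at the path from step 3):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

# Start a throwaway session and print both version strings
driver = webdriver.Chrome(service=ChromeService(executable_path='F:\\develop\\chromedriver.exe'))
print(driver.capabilities['browserVersion'])                 # Chrome version
print(driver.capabilities['chrome']['chromedriverVersion'])  # chromedriver version
driver.quit()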
sp2023="https://ieeexplore.ieee.org/xpl/conhome/10179215/proceeding?isnumber=10179280&sortType=vol-only-seq&rowsPerPage=10&pageNumber=1"
sp2022="https://ieeexplore.ieee.org/xpl/conhome/9833550/proceeding?isnumber=9833558&sortType=vol-only-seq&rowsPerPage=100&pageNumber=1"
sp2021="https://ieeexplore.ieee.org/xpl/conhome/9519381/proceeding?isnumber=9519382&sortType=vol-only-seq&rowsPerPage=10&pageNumber=1"
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Download one page of results; calling this downloads every PDF on the given page
def download_page(url, driver):
    driver.get(url)
    print("waiting for page loading")
    time.sleep(10)
    selectinput = driver.find_elements(By.XPATH, '//div[@class="hide-mobile"]//input[@class="ng-untouched ng-pristine ng-valid"]')
    # Tick every result's checkbox
    for input in selectinput:
        driver.execute_script("arguments[0].scrollIntoView();", input)  # scroll the checkbox into view; not sure why, but without this the click misses
        time.sleep(1)  # sleep 1 second here, or the click may fail
        print("clicking")
        input.click()
    #print(len(selectinput))
    # Click the download button
    downloadbtn = driver.find_elements(By.XPATH, '//button[@class="xpl-toggle-btn" and normalize-space(text())="Download PDFs"]')
    downloadbtn[0].click()
    # Confirm the download
    downloadbtn_accept = driver.find_element(By.XPATH, '//button[contains(@class, "downloadpdf-predl-proceed-button stats-SearchResults_BulkPDFDownload xpl-btn-primary")]')
    downloadbtn_accept.click()
    # Grab the loading spinner
    loadingbtn = driver.find_element(By.XPATH, '//i[contains(@class, "fas") and contains(@class, "fa-spinner") and contains(@class, "fa-spin")]')
    # Wait until the spinner disappears (wait is the WebDriverWait created in main)
    wait.until(EC.staleness_of(loadingbtn))
if __name__ == '__main__':
    # Path to the Chrome WebDriver
    chrome_driver_path = 'F:\\develop\\chromedriver.exe'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--ignore-certificate-errors-spki-list')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # Create the Chrome WebDriver instance
    chrome_service = ChromeService(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    wait = WebDriverWait(driver, 300)
    # First parse out the total number of PDFs, then download page by page
    url = "https://ieeexplore.ieee.org/xpl/conhome/9833550/proceeding?isnumber=9833558&sortType=vol-only-seq&rowsPerPage=10&pageNumber=1"
    driver.get(url)
    # This div holds the total PDF count
    print("waiting for page loading")
    time.sleep(10)
    div_elements = driver.find_elements(By.CSS_SELECTOR, 'div.col-12.Dashboard-header.text-base-md-lh')
    total_papers = int(div_elements[0].text.split(" ")[-1])  # e.g. 204
    for page in range(1, int(total_papers / 10) + 2):  # 10 results per page
        url = f"https://ieeexplore.ieee.org/xpl/conhome/9833550/proceeding?isnumber=9833558&sortType=vol-only-seq&rowsPerPage=10&pageNumber={page}"
        print(url)
        download_page(url, driver=driver)
NDSS
Usage: python NDSS.py --year 2022
import requests
from bs4 import BeautifulSoup
import argparse
import datetime
def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")
def NDSS_Download(year):
    # Base URL of the accepted-papers page
    base_url = f"https://www.ndss-symposium.org/ndss{year}/accepted-papers/"
    # Fetch the page
    response = requests.get(base_url)
    html_content = response.text
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all links to paper pages
    links = soup.find_all('a', class_='paper-link-abs', href=True)
    # Visit each paper page
    for link in links:
        paper_pdf_name = link['href'].rstrip('/').split('/')[-1]
        print("Paper name:", paper_pdf_name)
        response = requests.get(link['href'])
        html_content = response.text
        # Parse the paper page
        soup = BeautifulSoup(html_content, 'html.parser')
        # Take the first <a> tag with class "pdf-button"
        pdf_links = soup.find_all('a', class_='pdf-button', href=True)[0]['href']
        print("Paper URL:", pdf_links)
        # Download the paper
        download_pdf(pdf_links, str(year) + "NDSS-" + paper_pdf_name + ".pdf")

# Leftover stub; not called anywhere in this script
def USENIX_download(year):
    url_fall = "https://www.usenix.org/conference/usenixsecurity21/fall-accepted-papers"
    url_summer = "https://www.usenix.org/conference/usenixsecurity21/summer-accepted-papers"
if __name__ == "__main__":
current_year = datetime.datetime.now().year
parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
args = parser.parse_args()
NDSS_Download(args.year)
Multithreaded version
import requests
from bs4 import BeautifulSoup
import argparse
import datetime
from concurrent.futures import ThreadPoolExecutor

def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

def download_thread(link, year):
    paper_pdf_name = link['href'].rstrip('/').split('/')[-1]
    print("Paper name:", paper_pdf_name)
    response = requests.get(link['href'])
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    pdf_links = soup.find_all('a', class_='pdf-button', href=True)[0]['href']
    print("Paper URL:", pdf_links)
    download_pdf(pdf_links, str(year) + "NDSS-" + paper_pdf_name + ".pdf")

def NDSS_Download(year, num_threads):
    base_url = f"https://www.ndss-symposium.org/ndss{year}/accepted-papers/"
    response = requests.get(base_url)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    links = soup.find_all('a', class_='paper-link-abs', href=True)
    # Use a thread pool with the requested number of workers
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for link in links:
            executor.submit(download_thread, link, year)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    parser.add_argument("--threads", type=int, default=10, help="Number of threads (default: 10)")
    args = parser.parse_args()
    NDSS_Download(args.year, args.threads)
USENIX
import re
import requests
from bs4 import BeautifulSoup
import argparse
import datetime

def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def USENIX_download_all(year):
    url_fall = f"https://www.usenix.org/conference/usenixsecurity{year}/fall-accepted-papers"
    USENIX_download(year, url_fall)
    url_summer = f"https://www.usenix.org/conference/usenixsecurity{year}/summer-accepted-papers"
    USENIX_download(year, url_summer)

def USENIX_download(year, url_target):
    response = requests.get(url_target)
    html_content = response.text
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all links to presentation pages
    pattern = re.compile(r'/conference.*/presentation.*')
    links = soup.find_all('a', href=pattern)
    # Put the links into a set to drop duplicates
    unique_links_set = set()
    for link in links:
        unique_links_set.add("https://www.usenix.org" + link['href'])
    # Back to a list
    links = list(unique_links_set)
    # Visit each presentation page
    for link in links:
        print(link)  # e.g. https://www.usenix.org/conference/usenixsecurity23/presentation/cernera
        response = requests.get(link)
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # Paper title
        pdf_name = str(year) + "USENIX-" + make_safe_filename(soup.find('h1', id='page-title').get_text().replace(":", "")) + ".pdf"
        # Link to the paper's PDF
        pdf_link = soup.find('a', attrs={'type': lambda value: value and 'application/pdf' in value})['href']
        download_pdf(pdf_link, pdf_name)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    args = parser.parse_args()
    USENIX_download_all(args.year % 100)  # only the last two digits of the year are used
Multithreaded version
import re
import requests
from bs4 import BeautifulSoup
import argparse
import datetime
from concurrent.futures import ThreadPoolExecutor

def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def USENIX_download_all(year, num_threads):
    url_fall = f"https://www.usenix.org/conference/usenixsecurity{year}/fall-accepted-papers"
    USENIX_download(year, url_fall, num_threads)
    url_summer = f"https://www.usenix.org/conference/usenixsecurity{year}/summer-accepted-papers"
    USENIX_download(year, url_summer, num_threads)

def USENIX_download(year, url_target, num_threads):
    response = requests.get(url_target)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    pattern = re.compile(r'/conference.*/presentation.*')
    links = soup.find_all('a', href=pattern)
    unique_links_set = set()
    for link in links:
        unique_links_set.add("https://www.usenix.org" + link['href'])
    links = list(unique_links_set)
    # Use a thread pool with the requested number of workers
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for link in links:
            executor.submit(download_thread, link, year)

def download_thread(link, year):
    response = requests.get(link)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    pdf_name = str(year) + "USENIX-" + make_safe_filename(soup.find('h1', id='page-title').get_text().replace(":", "")) + ".pdf"
    pdf_link = soup.find('a', attrs={'type': lambda value: value and 'application/pdf' in value})['href']
    download_pdf(pdf_link, pdf_name)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    parser.add_argument("--threads", type=int, default=10, help="Number of threads (default: 10)")
    args = parser.parse_args()
    num_threads = args.threads
    USENIX_download_all(args.year % 100, num_threads)  # only the last two digits of the year are used
Top AI Conferences
Download papers and supplemental materials from open-access paper websites such as AAAI, ACCV, AISTATS, COLT, CVPR, ECCV, ICCV, ICLR, ICML, IJCAI, JMLR, NIPS.
https://github.com/SilenceEagle/paper_downloader
The GitHub project also provides Aliyun Drive links (2020-2022) that can be saved and downloaded directly.
Downloading IJCAI 2023
https://www.ijcai.org/proceedings/2023/
The script takes the year via --year; if omitted, it defaults to the current year.
import re
import requests
from bs4 import BeautifulSoup
import argparse
import datetime

# Download the file at url and save it as pdf_name
def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def IJCAI_download(year):
    # Base URL of the proceedings page
    base_url = f"https://www.ijcai.org/proceedings/{year}/"
    # Fetch the page
    response = requests.get(base_url)
    html_content = response.text
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find every paper wrapper
    links = soup.find_all('div', class_='paper_wrapper')
    for link in links:
        title = make_safe_filename(str(year) + "-IJCAI-" + link.find('div', class_='title').get_text() + ".pdf")
        url = f"https://www.ijcai.org/proceedings/{year}/" + link.find('a', href=re.compile(r'.*\.pdf$'))['href']
        print(title, url)
        download_pdf(url, title)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    args = parser.parse_args()
    IJCAI_download(args.year)
Downloading ICML 2023
ICML 2023: http://proceedings.mlr.press/v202/
This script only handles 2023; for 2024 and later, change v202 in the URL to the matching volume number (see the sketch after the script).
import re
import requests
from bs4 import BeautifulSoup

# Download the file at url and save it as pdf_name
def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def ICML_download():
    # Base URL of the ICML 2023 proceedings (PMLR volume 202)
    base_url = "http://proceedings.mlr.press/v202/"
    # Fetch the page
    response = requests.get(base_url)
    html_content = response.text
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find every paper entry
    links = soup.find_all('div', class_='paper')
    for link in links:
        title = make_safe_filename("2023-ICML-" + link.find('p', class_='title').get_text() + ".pdf")
        url = link.find('a', href=re.compile(r'.*\.pdf$'))['href']
        print(title, url)
        download_pdf(url, title)

if __name__ == "__main__":
    ICML_download()
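For later years, the only change is the volume number in base_url. A hedged sketch (pmlr_base_url is a hypothetical helper; each ICML year's PMLR volume number must be looked up on mlr.press, and only v202 = ICML 2023 is confirmed here):

# Hypothetical helper: build the PMLR proceedings URL from a volume number
def pmlr_base_url(volume):
    return f"http://proceedings.mlr.press/v{volume}/"

print(pmlr_base_url(202))  # http://proceedings.mlr.press/v202/ (ICML 2023)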
Downloading AAAI 2023
The Zotero plugin can import these directly:
https://dblp.uni-trier.de/db/conf/aaai/aaai2023.html
NIPS does not have the 2023 proceedings up yet:
https://proceedings.neurips.cc/paper_files/paper/2022
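A minimal sketch for listing the 2022 papers from that index (assuming, as the page currently does, that each paper's abstract page is linked under /paper_files/paper/2022/hash/):

import requests
from bs4 import BeautifulSoup

# Print the title and abstract-page URL of every NeurIPS 2022 paper on the index page
base = "https://proceedings.neurips.cc"
soup = BeautifulSoup(requests.get(base + "/paper_files/paper/2022").text, "html.parser")
for a in soup.find_all('a', href=True):
    if "/paper_files/paper/2022/hash/" in a['href']:
        print(a.get_text(), base + a['href'])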