Paper Downloads
https://publications.cispa.saarland/view/year/2023.type.html
S&P
https://www.ieee-security.org/TC/SP2023/program-papers.html
https://www.ieee-security.org/TC/SP2022/program-papers.html
https://www.ieee-security.org/TC/SP2021/program-papers.html
CCS https://dl.acm.org/doi/proceedings/10.1145/3460120
Top Security Conferences
CCS
The ACM DL bans your IP for about two hours after roughly 30 downloads, so the script sleeps for two hours after each ban and then resumes crawling.
Environment dependencies:
- Python libraries: selenium, bs4, requests
- A chromedriver matching your Chrome version; point chrome_driver_path in init_driver() at its location
Usage:
python downloadccs.py --year 2021
The script first crawls each paper's DOI-based PDF URL and stores them in data.json, then downloads from data.json, sleeping two hours after each IP ban before resuming.
Crawling the URLs uses selenium; the subsequent downloads use requests and bs4.
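For reference, data.json ends up with roughly this shape (field names match the script below; the title and URL are made-up placeholders). Note the script expects data.json to exist on first run, so an empty skeleton such as {"conferences": {}} is a valid starting point:

{
    "conferences": {
        "2021": {
            "Example Paper Title": {
                "download_url": "https://dl.acm.org/doi/pdf/10.1145/XXXXXXX.XXXXXXX",
                "isdownloaded": false
            }
        }
    }
}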
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests
import re
import os
import argparse
import configparser
import datetime
from bs4 import BeautifulSoup
import json
# Unused helper: collect the PDF URLs for a list of topic pages
def generate_url_config(urls, driver):
    url_list = []
    for url in urls:
        url_list.append(gather_pdf_url_from_topics(url, driver))
    return url_list
# Send an HTTP GET request with requests to download the file at url as pdf_name
def download_pdf(url, pdf_name, year):
    # Strip characters that are not valid in file names, replacing them with spaces
    pdf_name = re.sub(r'[\\/:*?"<>|]', ' ', pdf_name)
    # Fetch the paper
    response = requests.get(url)
    # Create the download directory
    download_dir = os.path.join(os.getcwd(), str(year))
    os.makedirs(download_dir, exist_ok=True)
    if response.status_code == 200:
        # Build the download path; adjust the location as needed
        download_path = os.path.join(os.getcwd(), str(year), f'{pdf_name}.pdf')
        time.sleep(5)
        if os.path.exists(download_path):
            print(f'{download_path} already exists')
            return 1
        else:
            print(f'{download_path} does not exist, downloading')
            # Save the paper as a PDF file
            with open(download_path, 'wb') as pdf_file:
                pdf_file.write(response.content)
            print(f'Paper saved to: {download_path}')
            return 1
    else:
        print(f'Failed to download the paper, HTTP status code: {response.status_code}')
        return 0
# Collect the PDF URLs under one topic page; returns a dict of pdf_name: pdf_url pairs
def gather_pdf_url_from_topics(url, driver):
    print("Visiting topic {}".format(url))
    pdf_urls = {}
    driver.get(url)
    # Extract each paper's container; we only want research-article entries
    items = driver.find_elements(By.XPATH, './/div[@class="issue-item-container"]')
    for item in items:
        paperType = item.find_element(By.XPATH, './/div[@class="issue-heading"]').text.split("\n")[0]
        print(paperType)
        if paperType == "RESEARCH-ARTICLE":
            # Paper title
            pdf_name = item.find_element(By.XPATH, './/h5[@class="issue-item__title"]/a').text
            # PDF link
            pdf_url = item.find_element(By.XPATH, './/a[@class="btn--icon simple-tooltip__block--b red btn"]').get_attribute("href")
            # Record it (the actual download happens later, from data.json)
            #download_pdf(pdf_url, pdf_name)
            pdf_urls[pdf_name] = pdf_url
    print(pdf_urls)
    return pdf_urls
# Initialize the Chrome WebDriver
def init_driver():
    # Path to chromedriver
    chrome_driver_path = 'chromedriver.exe'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--ignore-certificate-errors-spki-list')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # This option would set the download directory
    #chrome_options.add_experimental_option("prefs", {"download.default_directory": 'E:\\Temp'})
    # Create the Chrome WebDriver instance
    chrome_service = ChromeService(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    return driver
# Visit the list of past CCS proceedings and extract the URL for the given year
def get_conference_urls(year):
    # Fetch the proceedings index
    url = 'https://dl.acm.org/conference/ccs/proceedings'
    response = requests.get(url)
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all divs holding a conference title
    divs = soup.find_all('div', class_='conference__title left-bordered-title')
    # Walk the divs and extract the <a> tags inside
    for div in divs:
        a_tags = div.find_all('a')
        for a_tag in a_tags:
            conf_year = a_tag.text.split("the")[1].split("ACM")[0].strip()
            if year == conf_year:
                return "https://dl.acm.org" + a_tag['href']
# Read the year to download from the command line
def parser_year():
    current_year = datetime.datetime.now().year
    # Build the argument parser
    parser = argparse.ArgumentParser(description='Download the PDFs for the given year')
    # --year defaults to the previous year
    parser.add_argument('--year', type=str, default=str(current_year - 1), help='Year to download (default: previous year)')
    # Parse the arguments
    args = parser.parse_args()
    # Return the year as a string
    return str(args.year)
# Collect the URLs of all topics (sessions) in the proceedings
def select_topic_urls(ccs_url, driver):
    # Open the proceedings page
    driver.get(ccs_url)
    # Wait for the page to load
    time.sleep(10)
    # Grab the URL of every topic
    downSessions = driver.find_elements(By.XPATH, '//a[@class="section__title accordion-tabbed__control left-bordered-title"]')
    topic_urls = []
    for session in downSessions:
        topic_urls.append(session.get_attribute("href"))
    return topic_urls
if __name__ == '__main__':
    download_year = parser_year()  # set via --year, e.g. 2021
    # Load the config file
    f = open('data.json', 'r')
    data = json.load(f)
    f.close()
    print("Downloading CCS proceedings for year:", download_year)
    if str(download_year) in data['conferences']:
        print("Config already present, downloading directly!")
        for pdfs in data['conferences'][download_year].items():
            pdf_url = pdfs[1]['download_url']
            pdf_name = pdfs[0]
            if data['conferences'][download_year][pdf_name]['isdownloaded'] != True:  # only download papers not yet marked as downloaded
                print(pdfs[1]['download_url'])
                if download_pdf(pdf_url, pdf_name, download_year):  # on success, record it in the config
                    data['conferences'][download_year][pdf_name]['isdownloaded'] = True
                    with open('data.json', 'w') as updated_json_file:  # update the config file
                        json.dump(data, updated_json_file, indent=4)
                else:
                    print("IP has been banned, sleeping for 2 hours")
                    time.sleep(60 * 60 * 2)
    else:
        driver = init_driver()
        data['conferences'][download_year] = {}
        # Build the config
        pdfname_url = {}
        # Get the URL of the target year's CCS proceedings
        ccs_url = get_conference_urls(download_year)
        # Get the URLs of all topics in the proceedings
        topic_urls = select_topic_urls(ccs_url, driver=driver)
        # Collect {pdf_name: pdf_url} pairs
        for topic_url in topic_urls:
            pdfname_url.update(gather_pdf_url_from_topics(topic_url, driver=driver))
        target_year_data = data['conferences'][download_year]
        # Record {pdf_name: pdf_url} in the config
        for pdf_name, download_url in pdfname_url.items():
            target_year_data[pdf_name] = {
                "download_url": download_url,
                "isdownloaded": False
            }
        # Write the config file
        with open('data.json', 'w') as updated_json_file:
            json.dump(data, updated_json_file, indent=4)
        # ------------------------------------------------------------------
        print("Config file written! Starting downloads")
        f = open('data.json', 'r')
        data = json.load(f)
        f.close()
        for pdfs in data['conferences'][download_year].items():
            pdf_url = pdfs[1]['download_url']
            pdf_name = pdfs[0]
            if data['conferences'][download_year][pdf_name]['isdownloaded'] != True:  # only download papers not yet marked as downloaded
                print(pdfs[1]['download_url'])
                if download_pdf(pdf_url, pdf_name, download_year):
                    data['conferences'][download_year][pdf_name]['isdownloaded'] = True
                    with open('data.json', 'w') as updated_json_file:  # update the config file
                        json.dump(data, updated_json_file, indent=4)
                else:
                    print("IP has been banned, quitting downloads")
                    break
S&P
https://ieeexplore.ieee.org/xpl/conhome/1000646/all-proceedings lists the proceedings by year; the URLs for the last three years are below.
Download with selenium:
1. Update Chrome; enter chrome://version/ in the browser to check your Chrome version
https://www.itbenet.com/wenz/223586.html
2. Download the matching chromedriver and unzip the exe (a quick version check is sketched below this list)
https://googlechromelabs.github.io/chrome-for-testing/#stable
3. Point the script at your own exe location: chrome_driver_path = 'F:\\develop\\chromedriver.exe'
4. pip install selenium
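To confirm the driver matches the browser, here is a minimal sketch (assuming chromedriver.exe sits at the path from step 3):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService

# Start a throwaway session and print both version strings
driver = webdriver.Chrome(service=ChromeService(executable_path='F:\\develop\\chromedriver.exe'))
print(driver.capabilities['browserVersion'])                 # Chrome version
print(driver.capabilities['chrome']['chromedriverVersion'])  # chromedriver version
driver.quit()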
sp2023="https://ieeexplore.ieee.org/xpl/conhome/10179215/proceeding?isnumber=10179280&sortType=vol-only-seq&rowsPerPage=10&pageNumber=1"
sp2022="https://ieeexplore.ieee.org/xpl/conhome/9833550/proceeding?isnumber=9833558&sortType=vol-only-seq&rowsPerPage=100&pageNumber=1"
sp2021="https://ieeexplore.ieee.org/xpl/conhome/9519381/proceeding?isnumber=9519382&sortType=vol-only-seq&rowsPerPage=10&pageNumber=1"
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Download one page of results; calling this downloads every PDF on the given page
def download_page(url, driver):
    driver.get(url)
    print("waiting for page loading")
    time.sleep(10)
    selectinput = driver.find_elements(By.XPATH, '//div[@class="hide-mobile"]//input[@class="ng-untouched ng-pristine ng-valid"]')
    # Tick every result's checkbox
    for input in selectinput:
        driver.execute_script("arguments[0].scrollIntoView();", input)  # scroll the checkbox into view; not sure why, but without this the click misses
        time.sleep(1)  # sleep 1 second here, or the click may fail
        print("clicking")
        input.click()
    #print(len(selectinput))
    # Click the download button
    downloadbtn = driver.find_elements(By.XPATH, '//button[@class="xpl-toggle-btn" and normalize-space(text())="Download PDFs"]')
    downloadbtn[0].click()
    # Confirm the download
    downloadbtn_accept = driver.find_element(By.XPATH, '//button[contains(@class, "downloadpdf-predl-proceed-button stats-SearchResults_BulkPDFDownload xpl-btn-primary")]')
    downloadbtn_accept.click()
    # Grab the loading spinner
    loadingbtn = driver.find_element(By.XPATH, '//i[contains(@class, "fas") and contains(@class, "fa-spinner") and contains(@class, "fa-spin")]')
    # Wait until the spinner disappears (wait is the WebDriverWait created in main)
    wait.until(EC.staleness_of(loadingbtn))
if __name__ == '__main__':
    # Path to the Chrome WebDriver
    chrome_driver_path = 'F:\\develop\\chromedriver.exe'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--ignore-certificate-errors-spki-list')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # Create the Chrome WebDriver instance
    chrome_service = ChromeService(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    wait = WebDriverWait(driver, 300)
    # First parse out the total number of PDFs, then download page by page
    url = "https://ieeexplore.ieee.org/xpl/conhome/9833550/proceeding?isnumber=9833558&sortType=vol-only-seq&rowsPerPage=10&pageNumber=1"
    driver.get(url)
    # This div holds the total PDF count
    print("waiting for page loading")
    time.sleep(10)
    div_elements = driver.find_elements(By.CSS_SELECTOR, 'div.col-12.Dashboard-header.text-base-md-lh')
    total_papers = int(div_elements[0].text.split(" ")[-1])  # e.g. 204
    for page in range(1, int(total_papers / 10) + 2):  # 10 results per page
        url = f"https://ieeexplore.ieee.org/xpl/conhome/9833550/proceeding?isnumber=9833558&sortType=vol-only-seq&rowsPerPage=10&pageNumber={page}"
        print(url)
        download_page(url, driver=driver)
NDSS
Usage: python NDSS.py --year 2022
import requests
from bs4 import BeautifulSoup
import argparse
import datetime
def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")
def NDSS_Download(year):
    # Base URL of the accepted-papers page
    base_url = f"https://www.ndss-symposium.org/ndss{year}/accepted-papers/"
    # Fetch the page
    response = requests.get(base_url)
    html_content = response.text
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all links to paper pages
    links = soup.find_all('a', class_='paper-link-abs', href=True)
    # Visit each paper page
    for link in links:
        paper_pdf_name = link['href'].rstrip('/').split('/')[-1]
        print("Paper name:", paper_pdf_name)
        response = requests.get(link['href'])
        html_content = response.text
        # Parse the paper page
        soup = BeautifulSoup(html_content, 'html.parser')
        # Take the first <a> tag with class "pdf-button"
        pdf_links = soup.find_all('a', class_='pdf-button', href=True)[0]['href']
        print("Paper URL:", pdf_links)
        # Download the paper
        download_pdf(pdf_links, str(year) + "NDSS-" + paper_pdf_name + ".pdf")

# Leftover stub; not called anywhere in this script
def USENIX_download(year):
    url_fall = "https://www.usenix.org/conference/usenixsecurity21/fall-accepted-papers"
    url_summer = "https://www.usenix.org/conference/usenixsecurity21/summer-accepted-papers"
if __name__ == "__main__":
current_year = datetime.datetime.now().year
parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
args = parser.parse_args()
NDSS_Download(args.year)
Multithreaded version
import requests
from bs4 import BeautifulSoup
import argparse
import datetime
from concurrent.futures import ThreadPoolExecutor

def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

def download_thread(link, year):
    paper_pdf_name = link['href'].rstrip('/').split('/')[-1]
    print("Paper name:", paper_pdf_name)
    response = requests.get(link['href'])
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    pdf_links = soup.find_all('a', class_='pdf-button', href=True)[0]['href']
    print("Paper URL:", pdf_links)
    download_pdf(pdf_links, str(year) + "NDSS-" + paper_pdf_name + ".pdf")

def NDSS_Download(year, num_threads):
    base_url = f"https://www.ndss-symposium.org/ndss{year}/accepted-papers/"
    response = requests.get(base_url)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    links = soup.find_all('a', class_='paper-link-abs', href=True)
    # Use a thread pool with the requested number of workers
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for link in links:
            executor.submit(download_thread, link, year)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    parser.add_argument("--threads", type=int, default=10, help="Number of threads (default: 10)")
    args = parser.parse_args()
    NDSS_Download(args.year, args.threads)
USENIX
import re
import requests
from bs4 import BeautifulSoup
import argparse
import datetime

def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def USENIX_download_all(year):
    url_fall = f"https://www.usenix.org/conference/usenixsecurity{year}/fall-accepted-papers"
    USENIX_download(year, url_fall)
    url_summer = f"https://www.usenix.org/conference/usenixsecurity{year}/summer-accepted-papers"
    USENIX_download(year, url_summer)

def USENIX_download(year, url_target):
    response = requests.get(url_target)
    html_content = response.text
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all links to presentation pages
    pattern = re.compile(r'/conference.*/presentation.*')
    links = soup.find_all('a', href=pattern)
    # Put the links into a set to drop duplicates
    unique_links_set = set()
    for link in links:
        unique_links_set.add("https://www.usenix.org" + link['href'])
    # Back to a list
    links = list(unique_links_set)
    # Visit each presentation page
    for link in links:
        print(link)  # e.g. https://www.usenix.org/conference/usenixsecurity23/presentation/cernera
        response = requests.get(link)
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')
        # Paper title
        pdf_name = str(year) + "USENIX-" + make_safe_filename(soup.find('h1', id='page-title').get_text().replace(":", "")) + ".pdf"
        # Link to the paper's PDF
        pdf_link = soup.find('a', attrs={'type': lambda value: value and 'application/pdf' in value})['href']
        download_pdf(pdf_link, pdf_name)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    args = parser.parse_args()
    USENIX_download_all(args.year % 100)  # only the last two digits of the year are used
Multithreaded version
import re
import requests
from bs4 import BeautifulSoup
import argparse
import datetime
from concurrent.futures import ThreadPoolExecutor

def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def USENIX_download_all(year, num_threads):
    url_fall = f"https://www.usenix.org/conference/usenixsecurity{year}/fall-accepted-papers"
    USENIX_download(year, url_fall, num_threads)
    url_summer = f"https://www.usenix.org/conference/usenixsecurity{year}/summer-accepted-papers"
    USENIX_download(year, url_summer, num_threads)

def USENIX_download(year, url_target, num_threads):
    response = requests.get(url_target)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    pattern = re.compile(r'/conference.*/presentation.*')
    links = soup.find_all('a', href=pattern)
    unique_links_set = set()
    for link in links:
        unique_links_set.add("https://www.usenix.org" + link['href'])
    links = list(unique_links_set)
    # Use a thread pool with the requested number of workers
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for link in links:
            executor.submit(download_thread, link, year)

def download_thread(link, year):
    response = requests.get(link)
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')
    pdf_name = str(year) + "USENIX-" + make_safe_filename(soup.find('h1', id='page-title').get_text().replace(":", "")) + ".pdf"
    pdf_link = soup.find('a', attrs={'type': lambda value: value and 'application/pdf' in value})['href']
    download_pdf(pdf_link, pdf_name)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    parser.add_argument("--threads", type=int, default=10, help="Number of threads (default: 10)")
    args = parser.parse_args()
    num_threads = args.threads
    USENIX_download_all(args.year % 100, num_threads)  # only the last two digits of the year are used
Top AI Conferences
Download papers and supplemental materials from open-access paper websites such as AAAI, ACCV, AISTATS, COLT, CVPR, ECCV, ICCV, ICLR, ICML, IJCAI, JMLR, NIPS.
https://github.com/SilenceEagle/paper_downloader
The GitHub project also provides Aliyun Drive links (2020-2022) that can be saved and downloaded directly.
Downloading IJCAI 2023
https://www.ijcai.org/proceedings/2023/
The script takes the year via --year; if omitted, it defaults to the current year.
import re
import requests
from bs4 import BeautifulSoup
import argparse
import datetime

# Download the file at url and save it as pdf_name
def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def IJCAI_download(year):
    # Base URL of the proceedings page
    base_url = f"https://www.ijcai.org/proceedings/{year}/"
    # Fetch the page
    response = requests.get(base_url)
    html_content = response.text
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find every paper wrapper
    links = soup.find_all('div', class_='paper_wrapper')
    for link in links:
        title = make_safe_filename(str(year) + "-IJCAI-" + link.find('div', class_='title').get_text() + ".pdf")
        url = f"https://www.ijcai.org/proceedings/{year}/" + link.find('a', href=re.compile(r'.*\.pdf$'))['href']
        print(title, url)
        download_pdf(url, title)

if __name__ == "__main__":
    current_year = datetime.datetime.now().year
    parser = argparse.ArgumentParser(description="Extract PDF links from a webpage based on the year.")
    parser.add_argument("--year", type=int, default=current_year, help="Year parameter (default: current year)")
    args = parser.parse_args()
    IJCAI_download(args.year)
Downloading ICML 2023
ICML 2023: http://proceedings.mlr.press/v202/
This script only handles 2023; for 2024 and later, change v202 in the URL to the matching volume number (see the sketch after the script).
import re
import requests
from bs4 import BeautifulSoup

# Download the file at url and save it as pdf_name
def download_pdf(url, pdf_name):
    response = requests.get(url)
    if response.status_code == 200:
        with open(pdf_name, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded and saved {pdf_name}")
    else:
        print("Failed to download the PDF")

# Some titles contain characters that are not valid in Windows file names
def make_safe_filename(filename):
    # Pattern of special characters to replace
    special_chars_pattern = r'[<>:"/\\|?*]'
    # Replace them with underscores
    safe_filename = re.sub(special_chars_pattern, '_', filename)
    return safe_filename

def ICML_download():
    # Base URL of the ICML 2023 proceedings (PMLR volume 202)
    base_url = "http://proceedings.mlr.press/v202/"
    # Fetch the page
    response = requests.get(base_url)
    html_content = response.text
    # Parse it with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find every paper entry
    links = soup.find_all('div', class_='paper')
    for link in links:
        title = make_safe_filename("2023-ICML-" + link.find('p', class_='title').get_text() + ".pdf")
        url = link.find('a', href=re.compile(r'.*\.pdf$'))['href']
        print(title, url)
        download_pdf(url, title)

if __name__ == "__main__":
    ICML_download()
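For later years, the only change is the volume number in base_url. A hedged sketch (pmlr_base_url is a hypothetical helper; each ICML year's PMLR volume number must be looked up on mlr.press, and only v202 = ICML 2023 is confirmed here):

# Hypothetical helper: build the PMLR proceedings URL from a volume number
def pmlr_base_url(volume):
    return f"http://proceedings.mlr.press/v{volume}/"

print(pmlr_base_url(202))  # http://proceedings.mlr.press/v202/ (ICML 2023)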
Downloading AAAI 2023
The Zotero plugin can import these directly:
https://dblp.uni-trier.de/db/conf/aaai/aaai2023.html
NIPS does not have the 2023 proceedings up yet:
https://proceedings.neurips.cc/paper_files/paper/2022
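A minimal sketch for listing the 2022 papers from that index (assuming, as the page currently does, that each paper's abstract page is linked under /paper_files/paper/2022/hash/):

import requests
from bs4 import BeautifulSoup

# Print the title and abstract-page URL of every NeurIPS 2022 paper on the index page
base = "https://proceedings.neurips.cc"
soup = BeautifulSoup(requests.get(base + "/paper_files/paper/2022").text, "html.parser")
for a in soup.find_all('a', href=True):
    if "/paper_files/paper/2022/hash/" in a['href']:
        print(a.get_text(), base + a['href'])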