Python Crawler10:知网、AGU、JJR

Python Crawler10:知网、AGU、JJR

说在最前面

首先确保浏览器能够正常访问以下站点:知网(CNKI)、Water Resources Research(WRR)、JGR Solid Earth(JGR SE)、Journal of Hydrology(JH)。

UI设计

  • 在QT Designer中设计好UI界面
notion image

UI设计生成文件

  • Hydrology.ui
<?xml version="1.0" encoding="UTF-8"?> <ui version="4.0"> <class>Dialog</class> <widget class="QDialog" name="Dialog"> <property name="geometry"> <rect> <x>0</x> <y>0</y> <width>480</width> <height>433</height> </rect> </property> <property name="windowTitle"> <string>Dialog</string> </property> <widget class="QDialogButtonBox" name="buttonBox"> <property name="geometry"> <rect> <x>100</x> <y>360</y> <width>341</width> <height>32</height> </rect> </property> <property name="orientation"> <enum>Qt::Horizontal</enum> </property> <property name="standardButtons"> <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set> </property> </widget> <widget class="QWidget" name="gridLayoutWidget"> <property name="geometry"> <rect> <x>70</x> <y>70</y> <width>364</width> <height>251</height> </rect> </property> <layout class="QGridLayout" name="gridLayout"> <item row="1" column="0"> <widget class="QLabel" name="label_1"> <property name="font"> <font> <family>华文行楷</family> <pointsize>10</pointsize> </font> </property> <property name="text"> <string>请输入关键词:</string> </property> </widget> </item> <item row="0" column="0"> <widget class="QLabel" name="label_0"> <property name="font"> <font> <family>华文行楷</family> <pointsize>10</pointsize> </font> </property> <property name="text"> <string>请输入关键词:</string> </property> </widget> </item> <item row="3" column="1"> <widget class="QLineEdit" name="lineEdit_3"> <property name="text"> <string/> </property> </widget> </item> <item row="4" column="1"> <layout class="QHBoxLayout" name="horizontalLayout"> <item> <widget class="QPushButton" name="pushButton"> <property name="text"> <string>开始</string> </property> </widget> </item> <item> <widget class="QPushButton" name="pushButton_2"> <property name="text"> <string>结束</string> </property> </widget> </item> </layout> </item> <item row="6" column="0"> <widget class="QLabel" name="label_4"> <property name="font"> <font> <family>华文行楷</family> <pointsize>10</pointsize> </font> </property> <property 
name="text"> <string>输出结果:</string> </property> </widget> </item> <item row="2" column="0"> <widget class="QLabel" name="label_2"> <property name="font"> <font> <family>华文行楷</family> <pointsize>10</pointsize> </font> </property> <property name="text"> <string>请输入数量:</string> </property> </widget> </item> <item row="3" column="0"> <widget class="QLabel" name="label_3"> <property name="font"> <font> <family>华文行楷</family> <pointsize>10</pointsize> </font> </property> <property name="text"> <string>路径选择:</string> </property> </widget> </item> <item row="3" column="2"> <widget class="QToolButton" name="toolButton"> <property name="text"> <string>...</string> </property> </widget> </item> <item row="2" column="1"> <widget class="QLineEdit" name="lineEdit_2"> <property name="text"> <string/> </property> </widget> </item> <item row="1" column="1"> <widget class="QLineEdit" name="lineEdit_1"> <property name="text"> <string/> </property> </widget> </item> <item row="6" column="1"> <widget class="QTextBrowser" name="textBrowser"/> </item> <item row="0" column="1"> <widget class="QComboBox" name="comboBox"> <item> <property name="text"> <string>请选择文献期刊</string> </property> </item> <item> <property name="text"> <string>CNKI</string> </property> </item> <item> <property name="text"> <string>Journal of Hydrology</string> </property> </item> <item> <property name="text"> <string>Water Resources Research</string> </property> </item> <item> <property name="text"> <string>JGR Solid Earth</string> </property> </item> </widget> </item> </layout> </widget> <widget class="QLabel" name="label_5"> <property name="geometry"> <rect> <x>180</x> <y>30</y> <width>131</width> <height>31</height> </rect> </property> <property name="font"> <font> <family>Comic Sans MS</family> <pointsize>15</pointsize> </font> </property> <property name="text"> <string>GroundWater</string> </property> </widget> </widget> <resources/> <connections> <connection> <sender>buttonBox</sender> <signal>accepted()</signal> 
<receiver>Dialog</receiver> <slot>accept()</slot> <hints> <hint type="sourcelabel"> <x>248</x> <y>254</y> </hint> <hint type="destinationlabel"> <x>157</x> <y>274</y> </hint> </hints> </connection> <connection> <sender>buttonBox</sender> <signal>rejected()</signal> <receiver>Dialog</receiver> <slot>reject()</slot> <hints> <hint type="sourcelabel"> <x>316</x> <y>260</y> </hint> <hint type="destinationlabel"> <x>286</x> <y>274</y> </hint> </hints> </connection> </connections> </ui>

生成UI代码

  • Ui_Hydrology.py
# Form implementation generated from reading ui file 'd:\Cumtb_Code\PyQT\Hydrology.ui'
#
# Created by: PyQt6 UI code generator 6.1.0
#
# WARNING: Any manual changes made to this file will be lost when pyuic6 is
# run again.  Do not edit this file unless you know what you are doing.

from PyQt6 import QtCore, QtGui, QtWidgets


class Ui_Dialog(object):
    """Widget tree of the crawler dialog, mirroring Hydrology.ui."""

    def setupUi(self, Dialog):
        """Create every child widget on *Dialog* and wire the button box.

        Widgets are created in the same order as the generated original so
        that the implicit tab order is unchanged.
        """

        def grid_label(object_name, row):
            # Caption in column 0 of the grid, using the shared caption font.
            label = QtWidgets.QLabel(self.gridLayoutWidget)
            caption_font = QtGui.QFont()
            caption_font.setFamily("华文行楷")
            caption_font.setPointSize(10)
            label.setFont(caption_font)
            label.setObjectName(object_name)
            self.gridLayout.addWidget(label, row, 0, 1, 1)
            return label

        def grid_line_edit(object_name, row):
            # Empty single-line input in column 1 of the grid.
            edit = QtWidgets.QLineEdit(self.gridLayoutWidget)
            edit.setText("")
            edit.setObjectName(object_name)
            self.gridLayout.addWidget(edit, row, 1, 1, 1)
            return edit

        Dialog.setObjectName("Dialog")
        Dialog.resize(480, 433)

        # OK / Cancel buttons along the bottom edge.
        self.buttonBox = QtWidgets.QDialogButtonBox(Dialog)
        self.buttonBox.setGeometry(QtCore.QRect(100, 360, 341, 32))
        self.buttonBox.setOrientation(QtCore.Qt.Orientation.Horizontal)
        self.buttonBox.setStandardButtons(
            QtWidgets.QDialogButtonBox.StandardButton.Cancel
            | QtWidgets.QDialogButtonBox.StandardButton.Ok
        )
        self.buttonBox.setObjectName("buttonBox")

        # Container widget that hosts the form grid.
        self.gridLayoutWidget = QtWidgets.QWidget(Dialog)
        self.gridLayoutWidget.setGeometry(QtCore.QRect(70, 70, 364, 251))
        self.gridLayoutWidget.setObjectName("gridLayoutWidget")
        self.gridLayout = QtWidgets.QGridLayout(self.gridLayoutWidget)
        self.gridLayout.setContentsMargins(0, 0, 0, 0)
        self.gridLayout.setObjectName("gridLayout")

        self.label_1 = grid_label("label_1", 1)
        self.label_0 = grid_label("label_0", 0)
        self.lineEdit_3 = grid_line_edit("lineEdit_3", 3)

        # Start / stop buttons share one horizontal row.
        self.horizontalLayout = QtWidgets.QHBoxLayout()
        self.horizontalLayout.setObjectName("horizontalLayout")
        self.pushButton = QtWidgets.QPushButton(self.gridLayoutWidget)
        self.pushButton.setObjectName("pushButton")
        self.horizontalLayout.addWidget(self.pushButton)
        self.pushButton_2 = QtWidgets.QPushButton(self.gridLayoutWidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.horizontalLayout.addWidget(self.pushButton_2)
        self.gridLayout.addLayout(self.horizontalLayout, 4, 1, 1, 1)

        self.label_4 = grid_label("label_4", 6)
        self.label_2 = grid_label("label_2", 2)
        self.label_3 = grid_label("label_3", 3)

        # "..." button that opens the save-path chooser.
        self.toolButton = QtWidgets.QToolButton(self.gridLayoutWidget)
        self.toolButton.setObjectName("toolButton")
        self.gridLayout.addWidget(self.toolButton, 3, 2, 1, 1)

        self.lineEdit_2 = grid_line_edit("lineEdit_2", 2)
        self.lineEdit_1 = grid_line_edit("lineEdit_1", 1)

        # Read-only log area for progress messages.
        self.textBrowser = QtWidgets.QTextBrowser(self.gridLayoutWidget)
        self.textBrowser.setObjectName("textBrowser")
        self.gridLayout.addWidget(self.textBrowser, 6, 1, 1, 1)

        # Journal selector; the five placeholder entries get their text in
        # retranslateUi().
        self.comboBox = QtWidgets.QComboBox(self.gridLayoutWidget)
        self.comboBox.setObjectName("comboBox")
        for _ in range(5):
            self.comboBox.addItem("")
        self.gridLayout.addWidget(self.comboBox, 0, 1, 1, 1)

        # Heading above the form.
        self.label_5 = QtWidgets.QLabel(Dialog)
        self.label_5.setGeometry(QtCore.QRect(180, 30, 131, 31))
        heading_font = QtGui.QFont()
        heading_font.setFamily("Comic Sans MS")
        heading_font.setPointSize(15)
        self.label_5.setFont(heading_font)
        self.label_5.setObjectName("label_5")

        self.retranslateUi(Dialog)
        self.buttonBox.accepted.connect(Dialog.accept)
        self.buttonBox.rejected.connect(Dialog.reject)
        QtCore.QMetaObject.connectSlotsByName(Dialog)

    def retranslateUi(self, Dialog):
        """Assign every user-visible string through Qt's translation hook."""
        _translate = QtCore.QCoreApplication.translate
        Dialog.setWindowTitle(_translate("Dialog", "Dialog"))

        widget_texts = (
            (self.label_1, "请输入关键词:"),
            (self.label_0, "请输入关键词:"),
            (self.pushButton, "开始"),
            (self.pushButton_2, "结束"),
            (self.label_4, "输出结果:"),
            (self.label_2, "请输入数量:"),
            (self.label_3, "路径选择:"),
            (self.toolButton, "..."),
        )
        for widget, text in widget_texts:
            widget.setText(_translate("Dialog", text))

        journal_names = (
            "请选择文献期刊",
            "CNKI",
            "Journal of Hydrology",
            "Water Resources Research",
            "JGR Solid Earth",
        )
        for index, journal in enumerate(journal_names):
            self.comboBox.setItemText(index, _translate("Dialog", journal))

        self.label_5.setText(_translate("Dialog", "GroundWater"))
 

核心代码

  • Hydrology.py
"""Qt dialog that scrapes paper metadata from CNKI, ScienceDirect and AGU.

Description: henggao_note
version: v1.0.0
Date: 2022-04-12 19:57:31
LastEditors: henggao
LastEditTime: 2022-04-12 19:57:32

The dialog collects a journal, a keyword, a record count and an output path;
a worker thread then drives Chrome through Selenium and appends one
tab-separated line per paper to the output file.
"""
import sys
import time

from openpyxl import Workbook
from PyQt6 import QtWidgets
from PyQt6.QtCore import QThread, pyqtSignal
from PyQt6.QtWidgets import QApplication, QDialog
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from Ui_Hydrology import Ui_Dialog

# Seconds every explicit wait is allowed before Selenium raises a timeout.
WAIT_SECONDS = 10
# Results per CNKI page; only used for the "N pages" progress message.
CNKI_PAGE_SIZE = 20

CNKI_URL = "https://www.cnki.net"
SCIENCEDIRECT_URL = "https://www.sciencedirect.com/journal/journal-of-hydrology"
# Both AGU journals are served by the same Wiley front end, so one crawl
# routine handles them; only the landing URL differs.
AGU_JOURNAL_URLS = {
    "Water Resources Research":
        "https://agupubs.onlinelibrary.wiley.com/journal/19447973",
    "JGR Solid Earth":
        "https://agupubs.onlinelibrary.wiley.com/journal/21699356",
}

# Journal most recently picked in the combo box.  Kept as a module global
# because earlier versions of this script exposed it that way; the crawl
# itself reads the combo box directly when it starts.
select_value = ""


def _build_chrome_driver():
    """Return a Chrome driver configured the way the crawler expects."""
    options = webdriver.ChromeOptions()
    # Skip image downloads to speed up page loads.
    options.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2})
    # Hide the "controlled by automated test software" banner.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Hide the navigator.webdriver flag.
    options.add_argument("--disable-blink-features=AutomationControlled")
    # Incognito window.
    options.add_argument('--incognito')
    # Return from driver.get() once the DOM is ready instead of waiting for
    # every sub-resource.
    options.page_load_strategy = "eager"
    return webdriver.Chrome(options=options)


class Cnkiprogrammer(QDialog, Ui_Dialog):
    """Main dialog: validates the form and runs the crawl on a worker thread."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self.setupUi(self)
        self.work = WorkThread()
        # Connect exactly once.  Connecting on every click (as before)
        # stacked duplicate connections, so each message was appended once
        # per previous run.
        self.work.trigger.connect(self.display)
        self.comboBox.currentIndexChanged.connect(self.getURL)
        self.pushButton.clicked.connect(self.startBtn)
        self.pushButton_2.clicked.connect(self.stopBtn)
        self.toolButton.clicked.connect(self.savePath)

    def getURL(self):
        """Remember the journal currently selected in the combo box."""
        global select_value
        select_value = self.comboBox.currentText()

    def savePath(self):
        """Ask for the output file and copy the chosen path into the form."""
        chosen_path, _selected_filter = QtWidgets.QFileDialog.getSaveFileName(
            self, "设置路径", "./", "All Files (*);;Text Files (*.tsv)")
        # A cancelled dialog yields an empty path; keep the previous value.
        if chosen_path:
            self.lineEdit_3.setText(chosen_path)

    def startBtn(self):
        """Validate the form and start the crawl.

        Nothing is started when a crawl is already running or when the
        keyword, the count or the output path is missing or invalid; the
        reason is shown in the output pane instead.
        """
        if self.work.isRunning():
            self.display('抓取任务正在运行,请等待其结束' + '\n')
            return

        keyword = self.lineEdit_1.text().strip()
        if not keyword:
            print("请输入关键词")
            # Prompt through the placeholder: writing the prompt into the
            # field itself turned it into the keyword of the next search.
            self.lineEdit_1.clear()
            self.lineEdit_1.setPlaceholderText('请输入关键词')
            return

        try:
            wanted = int(self.lineEdit_2.text())
        except ValueError:
            wanted = 0
        if wanted < 1:
            self.display('请输入有效的数量(正整数)' + '\n')
            return

        target = self.lineEdit_3.text().strip()
        if not target:
            self.display('请先选择保存路径' + '\n')
            return

        self.getURL()
        self.work.configure(self.comboBox.currentText(), keyword, wanted, target)
        self.work.start()

    def stopBtn(self):
        """Ask a running crawl to stop before it starts the next entry."""
        if self.work.isRunning():
            self.work.requestInterruption()
            self.display('已请求停止,当前条目处理完后结束' + '\n')

    def execute(self):
        """Start the crawl; kept as an alias of :meth:`startBtn`."""
        self.startBtn()

    def display(self, str):
        """Append one progress message to the output pane.

        The parameter keeps its historical name (which shadows the builtin
        inside this method only) so keyword callers continue to work.
        """
        self.textBrowser.append(str)


class WorkThread(QThread):
    """Background thread that drives Chrome and writes the scraped records."""

    # Carries one human-readable progress line to the GUI thread.
    trigger = pyqtSignal(str)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.journal = ""
        self.theme = ""
        self.papers_need = 0
        self.save_path = ""
        self.driver = None
        # 1-based number of the entry being processed; written as the first
        # column of every record.
        self.count = 1
        # Number of records actually written to disk.
        self.saved = 0

    def configure(self, journal, theme, papers_need, save_path):
        """Store the parameters of the next run; call before ``start()``.

        Args:
            journal: Combo-box text of the selected source.  Anything other
                than the three known journal names falls back to CNKI.
            theme: Search keyword.
            papers_need: Number of result entries to process.
            save_path: File that records are appended to.
        """
        self.journal = journal
        self.theme = theme
        self.papers_need = papers_need
        self.save_path = save_path

    def run(self):
        """Crawl the configured source, then always shut the browser down."""
        self.count = 1
        self.saved = 0
        try:
            self.driver = _build_chrome_driver()
            if self.journal == "Journal of Hydrology":
                self._crawl_sciencedirect()
            elif self.journal in AGU_JOURNAL_URLS:
                self._crawl_agu(AGU_JOURNAL_URLS[self.journal])
            else:
                self._crawl_cnki()
        except Exception as exc:
            # Thread boundary: an escaping exception would end the thread
            # silently, so surface it in the dialog instead.
            self.trigger.emit('抓取中断:' + repr(exc) + '\n')
        finally:
            self._shutdown_driver()

    # ------------------------------------------------------------------
    # Small Selenium helpers
    # ------------------------------------------------------------------
    def _wait(self, locator):
        """Return the first element matching *locator* once it is present."""
        return WebDriverWait(self.driver, WAIT_SECONDS).until(
            EC.presence_of_element_located(locator))

    def _wait_all(self, locator):
        """Return all elements matching *locator* once at least one exists."""
        return WebDriverWait(self.driver, WAIT_SECONDS).until(
            EC.presence_of_all_elements_located(locator))

    def _text(self, locator):
        """Return the visible text of the first element matching *locator*."""
        return self._wait(locator).text

    def _open_in_new_tab(self, link):
        """Open the target of *link* in a new tab and switch to that tab."""
        href = link.get_attribute('href')
        # Pass the URL as a script argument rather than splicing it into
        # the JavaScript source, so quotes in the URL cannot break it.
        self.driver.execute_script('window.open(arguments[0], "_blank");', href)
        self.driver.switch_to.window(self.driver.window_handles[-1])

    def _return_to_results(self):
        """Close the detail tab, if one is open, and go back to the results."""
        handles = self.driver.window_handles
        if len(handles) > 1:
            self.driver.close()
            self.driver.switch_to.window(handles[0])

    def _shutdown_driver(self):
        """End the browser session, ignoring errors from a dead browser."""
        if self.driver is None:
            return
        try:
            # quit() ends the whole session; close() only closed the current
            # window.
            self.driver.quit()
        except WebDriverException:
            pass
        finally:
            self.driver = None

    # ------------------------------------------------------------------
    # Shared crawl loop
    # ------------------------------------------------------------------
    def _append_record(self, fields, encoding):
        """Append one tab-separated record, prefixed with the entry number."""
        line = "\t".join([str(self.count), *fields]).replace("\n", "") + "\n"
        # errors="replace" only matters for GBK output: a character outside
        # GBK used to raise and the whole record was silently dropped.
        with open(self.save_path, 'a', encoding=encoding, errors="replace") as f:
            f.write(line)

    def _report_finished(self):
        """Emit the closing message with the number of records written."""
        success_text = '抓取数据结束,共抓取' + str(self.saved) + '条数据' + '\n'
        print(success_text)
        self.trigger.emit(success_text)

    def _crawl_pages(self, list_locator, next_locator, scrape_entry, encoding):
        """Walk the result pages until ``papers_need`` entries were processed.

        Args:
            list_locator: Locator of the title links on a result page.
            next_locator: Locator of the "next page" control.
            scrape_entry: Callable ``(position, link) -> sequence of str``
                returning the record fields of one entry; ``position`` is the
                1-based index of the entry on the current page.
            encoding: Text encoding of the output file.

        A failing entry is reported and skipped.  It still consumes one
        entry number, as in earlier versions, but is not counted in the
        final total.
        """
        while self.count <= self.papers_need:
            links = self._wait_all(list_locator)
            for position, link in enumerate(links, start=1):
                if self.isInterruptionRequested():
                    self._report_finished()
                    return
                try:
                    fields = scrape_entry(position, link)
                    self._append_record(fields, encoding)
                except (WebDriverException, OSError) as exc:
                    self.trigger.emit(
                        '第' + str(self.count) + '条数据抓取失败('
                        + type(exc).__name__ + ')' + '\n')
                else:
                    self.saved += 1
                    self.trigger.emit(
                        '第' + str(self.count) + '条数据抓取成功' + '\n')
                finally:
                    self._return_to_results()

                self.count += 1
                if self.count > self.papers_need:
                    self._report_finished()
                    return

            try:
                self._wait(next_locator).click()
            except WebDriverException:
                # The next-page control could not be found or clicked; treat
                # this as the end of the results and stop.
                self._report_finished()
                return

    # ------------------------------------------------------------------
    # Journal of Hydrology (ScienceDirect)
    # ------------------------------------------------------------------
    def _crawl_sciencedirect(self):
        """Search Journal of Hydrology and scrape the result entries."""
        self.driver.get(SCIENCEDIRECT_URL)
        search_box = self.driver.find_element(
            by=By.CSS_SELECTOR, value=".search-input")
        search_box.send_keys(self.theme)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)

        res_unm = self._text(
            (By.XPATH, "//span[@class='search-body-results-text']"))
        self.trigger.emit('共查询到' + str(res_unm) + '条数据,' + '\n')

        self._crawl_pages(
            (By.CLASS_NAME, "result-list-title-link"),
            (By.XPATH, "//li[@class='next-link']"),
            self._scrape_sciencedirect_entry,
            'utf-8',
        )

    def _scrape_sciencedirect_entry(self, position, link):
        """Return (title, authors, keywords, highlights, abstract, doi, url)."""
        self._open_in_new_tab(link)
        title = self._text((By.CLASS_NAME, "title-text"))
        # NOTE(review): the fixed 32-character offset presumably drops a
        # leading banner line of the author block - confirm against the page.
        authors = self._text((By.CLASS_NAME, "author-group"))[32:]
        highlights = self._text((
            By.XPATH,
            "/html/body/div[3]/div/div/section/div/div/div/div[2]/article/div[5]/div[1]/div/p/dl",
        )).replace('\n', '')
        abstract = self._text((
            By.XPATH,
            "/html/body/div[3]/div/div/section/div/div/div/div[2]/article/div[5]/div[2]/div/p",
        ))
        # NOTE(review): the 9-character offset presumably drops a
        # "Keywords" heading plus its line break - confirm.
        keywords = self._text((By.CLASS_NAME, "keywords-section"))[9:]
        doi = self._text((By.CLASS_NAME, "doi"))
        return (title, authors, keywords, highlights, abstract, doi,
                self.driver.current_url)

    # ------------------------------------------------------------------
    # AGU journals (Water Resources Research, JGR Solid Earth)
    # ------------------------------------------------------------------
    def _crawl_agu(self, journal_url):
        """Search an AGU journal on Wiley and scrape the result entries."""
        self.driver.get(journal_url)
        self.driver.maximize_window()
        time.sleep(1)

        # The visible search box is a placeholder; clicking it reveals the
        # real input field.
        self.driver.find_element(by=By.CSS_SELECTOR, value=".fakeQSInput").click()
        search_box = self.driver.find_element(
            by=By.XPATH, value='//*[@id="searchField0"]')
        search_box.send_keys(self.theme)
        time.sleep(6)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)

        res_unm = self._text((By.XPATH, "//span[@class='result__count']"))
        self.trigger.emit('共查询到' + str(res_unm) + '条数据,' + '\n')

        self._crawl_pages(
            (By.CLASS_NAME, "publication_title"),
            (By.CLASS_NAME, "pagination__btn--next"),
            self._scrape_agu_entry,
            'utf-8',
        )

    def _scrape_agu_entry(self, position, link):
        """Return (title, authors, abstract, doi, url) for one AGU paper."""
        self._open_in_new_tab(link)
        title = self._text((By.CLASS_NAME, "citation__title"))
        authors = self._text((By.CLASS_NAME, "accordion-tabbed")).replace("\n", '')
        # NOTE(review): the 9-character offset presumably drops an
        # "Abstract" heading plus its line break - confirm.
        abstract = self._text((By.ID, "section-1-en"))[9:]
        doi = self._text((By.CLASS_NAME, "epub-doi"))
        return (title, authors, abstract, doi, self.driver.current_url)

    # ------------------------------------------------------------------
    # CNKI
    # ------------------------------------------------------------------
    def _crawl_cnki(self):
        """Search CNKI (Chinese literature) and scrape the result entries."""
        self.driver.get(CNKI_URL)
        search_box = self.driver.find_element(
            by=By.CSS_SELECTOR, value=".search-input")
        search_box.send_keys(self.theme)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)

        # Switch the result list to Chinese-language literature.
        self._wait(
            (By.XPATH, "//div[@class='switch-ChEn']/a[@class='ch']")).click()
        time.sleep(1)

        # The counter uses thousands separators, e.g. "1,234".
        res_unm = int(self._text(
            (By.XPATH, "//span[@class='pagerTitleCell']/em")).replace(",", ''))
        # Ceiling division; int(n / 20) + 1 reported one page too many for
        # exact multiples of the page size.
        page_unm = -(-res_unm // CNKI_PAGE_SIZE)
        self.trigger.emit(
            '共查询到' + str(res_unm) + '条数据,' + str(page_unm) + '页' + '\n')

        self._crawl_pages(
            (By.CLASS_NAME, "fz14"),
            (By.XPATH, "//a[@id='PageNext']"),
            self._scrape_cnki_entry,
            'gbk',
        )

    def _scrape_cnki_entry(self, position, link):
        """Return the record fields of the CNKI entry at row *position*.

        Field order: title, authors, institute, date, source, database,
        keywords, abstract, url.
        """
        # Address the table row by the entry's position on the page rather
        # than deriving it from the global counter.
        row = ("/html/body/div[3]/div[2]/div[2]/div[2]/form/div/table/tbody"
               f"/tr[{position}]")
        title = self._text((By.XPATH, f"{row}/td[2]"))
        authors = self._text((By.XPATH, f"{row}/td[3]"))
        source = self._text((By.XPATH, f"{row}/td[4]"))
        date = self._text((By.XPATH, f"{row}/td[5]"))
        database = self._text((By.XPATH, f"{row}/td[6]"))

        # Clicking a title opens the detail page in a new tab.
        link.click()
        self.driver.switch_to.window(self.driver.window_handles[-1])

        abstract = self._text((By.CLASS_NAME, "abstract-text"))
        try:
            institute = self._text((
                By.XPATH,
                "/html/body/div[2]/div[1]/div[3]/div/div/div[3]/div/h3[2]/span/a",
            ))
        except WebDriverException:
            institute = '无'
        try:
            # The last character of the keyword text is dropped, as before.
            keywords = self._text((By.CLASS_NAME, "keywords"))[:-1]
        except WebDriverException:
            keywords = '无'

        return (title, authors, institute, date, source, database, keywords,
                abstract, self.driver.current_url)


def main():
    """Show the dialog, run the Qt event loop and return its exit code."""
    app = QApplication(sys.argv)
    pr = Cnkiprogrammer()
    pr.show()
    return app.exec()


if __name__ == '__main__':
    sys.exit(main())

运行效果图

notion image