Python Crawler6:抓取知网文献信息

Python Crawler6:抓取知网文献信息

程序代码

"""Scrape CNKI search results for one keyword into ``CNKI_res.tsv``.

Description: henggao_note
version: v1.0.0
Date: 2022-04-01 10:41:56
LastEditors: henggao
LastEditTime: 2022-04-07 15:17:05

Flow: open cnki.net in Chrome, search for ``theme``, switch to the Chinese
result list, then for every result row read the list columns, open the
detail page to collect abstract / institute / keywords, and append one
tab-separated line to the output file.
"""
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from urllib.parse import urljoin
from selenium.webdriver.common.keys import Keys

# Number of rows the result table shows per page.
RESULTS_PER_PAGE = 20
# Seconds every explicit wait is allowed to take.
WAIT_SECONDS = 10
OUTPUT_FILE = 'CNKI_res.tsv'
# Absolute XPath of one cell of the result table (row and column are 1-based).
RESULT_CELL_XPATH = (
    "/html/body/div[3]/div[2]/div[2]/div[2]/form/div/table/tbody"
    "/tr[{row}]/td[{col}]"
)
INSTITUTE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div/div[3]/div/h3[2]/span/a"


def wait_for(driver, locator):
    """Return the element matching *locator* once it is present in the DOM.

    Raises selenium's TimeoutException when it does not appear in time.
    """
    return WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located(locator))


def read_cell(driver, row, col):
    """Return the text of the result-table cell at (*row*, *col*)."""
    xpath = RESULT_CELL_XPATH.format(row=row, col=col)
    return wait_for(driver, (By.XPATH, xpath)).text


def close_extra_windows(driver, main_window):
    """Close every window except *main_window* and switch back to it."""
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)


# Intended to make get() return without waiting for the page to finish loading.
# NOTE(review): this only mutates selenium's shared default-capabilities dict;
# whether ChromeOptions picks it up depends on the Selenium version - confirm.
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"

# Chrome configuration: do not load images, which speeds up page loads.
# Add options.add_argument('--headless') here to run without a window.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    "prefs", {"profile.managed_default_content_settings.images": 2})

driver = webdriver.Chrome(options=options)

# Search keyword.
theme = "Hadoop"
# Number of papers wanted.
papers_need = 30

try:
    driver.get("https://www.cnki.net")
    search_box = driver.find_element(by=By.CSS_SELECTOR, value=".search-input")
    search_box.send_keys(theme)
    search_box.send_keys(Keys.ENTER)
    time.sleep(3)

    # Switch the result list to Chinese-language literature.
    wait_for(driver, (By.XPATH,
                      "//div[@class='switch-ChEn']/a[@class='ch']")).click()
    time.sleep(1)

    # Total number of hits; the page shows it with thousands separators.
    res_unm = wait_for(
        driver, (By.XPATH, "//span[@class='pagerTitleCell']/em")).text
    res_unm = int(res_unm.replace(",", ''))
    # Ceiling division, so an exact multiple of 20 is not reported as one
    # page too many.
    page_unm = -(-res_unm // RESULTS_PER_PAGE)
    print(f"共找到 {res_unm} 条结果, {page_unm} 页。")

    # Never try to fetch more records than the search returned; otherwise the
    # loop would wait for a "next page" link that does not exist.
    papers_target = min(papers_need, res_unm)

    # 1-based sequence number of the record being processed.
    count = 1
    while count <= papers_target:
        # Give the result table time to (re)load.
        time.sleep(3)
        title_list = WebDriverWait(driver, WAIT_SECONDS).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "fz14")))
        main_window = driver.current_window_handle

        # The row number comes from the position on the page, so the row that
        # is read and the link that is clicked always belong together.
        for row, title_link in enumerate(title_list, start=1):
            try:
                title = read_cell(driver, row, 2)
                authors = read_cell(driver, row, 3)
                source = read_cell(driver, row, 4)
                date = read_cell(driver, row, 5)
                database = read_cell(driver, row, 6)

                # Clicking the title opens the detail page in a new window.
                title_link.click()
                driver.switch_to.window(driver.window_handles[-1])

                abstract = wait_for(
                    driver, (By.CLASS_NAME, "abstract-text")).text
                # Institute and keywords are optional on the detail page.
                try:
                    institute = wait_for(
                        driver, (By.XPATH, INSTITUTE_XPATH)).text
                except WebDriverException:
                    institute = '无'
                try:
                    # Drop the trailing separator character.
                    keywords = wait_for(
                        driver, (By.CLASS_NAME, "keywords")).text[:-1]
                except WebDriverException:
                    keywords = '无'
                url = driver.current_url

                res = f"{count}\t{title}\t{authors}\t{institute}\t{date}\t{source}\t{database}\t{keywords}\t{abstract}\t{url}".replace(
                    "\n", "")+"\n"
                # errors='replace' keeps the record when a character has no
                # GBK representation instead of losing the whole line.
                with open(OUTPUT_FILE, 'a', encoding='gbk',
                          errors='replace') as f:
                    f.write(res)
                # Report success only after the line has been written.
                print(f" 第 {count} 条写入成功\n")
            except (WebDriverException, OSError):
                # Skip this record and carry on with the next one.
                print(f" 第 {count} 条爬取失败\n")
            finally:
                # Close the detail window (if any) and return to the list.
                close_extra_windows(driver, main_window)
                count += 1

            if count > papers_target:
                break

        if count > papers_target:
            break

        # Move on to the next result page.
        wait_for(driver, (By.XPATH, "//a[@id='PageNext']")).click()
finally:
    # quit() ends the whole session including the chromedriver process;
    # close() would only close the current window.
    driver.quit()

效果

  • 使用 Excel 打开 CNKI_res.tsv,并选择制表符(Tab)作为分隔符。
notion image
 
Python爬虫,批量获取知网文献信息
最近临近毕业,写毕业论文需要从知网查找大量的文献。但去知网一条一条进去看摘要又略显麻烦和浪费时间。于是,反手写一个爬虫,批量获取基本信息,岂不美哉? 在开始这个项目之前,我抱着不重复造轮子的心态,寻思着去Github先找找。结果发现基本上都是几年前的项目,现在早已不能使用。最后证实了, 靠别人不如靠自己 ,撸起袖子就开干! 网络爬虫就是模拟浏览器发送网络请求,接收请求响应,一种按照一定的规则,自动地抓取互联网信息的程序。 目前爬虫主要分为以 requests 库为代表的模拟请求类爬虫和以 selenium 为代表的模拟浏览器用户行为的爬虫两类。 Requests 是用Python语言编写,基于 urllib,采用 Apache2 Licensed 开源协议的 HTTP 库。它比 urllib 更加方便,可以节约我们大量的工作,完全满足 HTTP 测试需求。Requests 的哲学是以 PEP 20 的习语为中心开发的,所以它比 urllib 更加 Pythonic。 Selenium 是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE,Mozilla Firefox,Safari,Google Chrome,Opera等。 中国知网作为国内最知名的文献数据库之一,有着复杂的反爬虫机制,包括: 动态JS、iframe、验证码等等。直接模拟请求难度较大,且容易被封IP地址,所以本文主要介绍如何使用 Selenium 来爬取知网。 Selenium支持非常多的浏览器,如Chrome、Firefox、Edge等,我们只要首先下载好相应浏览器的 webdriver 到python主目录中,或者加入环境变量即可。 不同浏览器的初始化: from selenium import webdriver browser = webdriver.Chrome() browser =
Python爬虫,批量获取知网文献信息
 

改进版

  • cnki.ui
<?xml version="1.0" encoding="UTF-8"?>
<!-- Qt Designer form for the CNKI scraper dialog (class "Dialog", 490x369). A grid layout holds the keyword input (lineEdit_1), the count input (lineEdit_2), the save-path input (lineEdit_3) with its "..." tool button, the start/stop push buttons (pushButton / pushButton_2) and the output QTextBrowser. The OK/Cancel button box is connected to the dialog's accept() and reject() slots in the connections section. -->
<ui version="4.0"> <class>Dialog</class> <widget class="QDialog" name="Dialog"> <property name="geometry"> <rect> <x>0</x> <y>0</y> <width>490</width> <height>369</height> </rect> </property> <property name="windowTitle"> <string>Dialog</string> </property> <widget class="QDialogButtonBox" name="buttonBox"> <property name="geometry"> <rect> <x>330</x> <y>330</y> <width>156</width> <height>24</height> </rect> </property> <property name="orientation"> <enum>Qt::Horizontal</enum> </property> <property name="standardButtons"> <set>QDialogButtonBox::Cancel|QDialogButtonBox::Ok</set> </property> </widget> <widget class="QLabel" name="label_5"> <property name="geometry"> <rect> <x>180</x> <y>10</y> <width>131</width> <height>31</height> </rect> </property> <property name="font"> <font> <family>华文楷体</family> <pointsize>15</pointsize> </font> </property> <property name="text"> <string>知网信息获取</string> </property> </widget> <widget class="QWidget" name="gridLayoutWidget"> <property name="geometry"> <rect> <x>70</x> <y>50</y> <width>361</width> <height>251</height> </rect> </property> <layout class="QGridLayout" name="gridLayout"> <item row="1" column="0"> <widget class="QLabel" name="label_2"> <property name="font"> <font> <family>宋体</family> <pointsize>10</pointsize> </font> </property> <property name="text"> <string>请输入数量:</string> </property> </widget> </item> <item row="1" column="1"> <widget class="QLineEdit" name="lineEdit_2"> <property name="text"> <string/> </property> </widget> </item> <item row="5" column="1"> <widget class="QTextBrowser" name="textBrowser"/> </item> <item row="2" column="0"> <widget class="QLabel" name="label_3"> <property name="font"> <font> <family>宋体</family> <pointsize>10</pointsize> </font> </property> <property name="text"> <string>路径选择:</string> </property> </widget> </item> <item row="2" column="2"> <widget class="QToolButton" name="toolButton"> <property name="text"> <string>...</string> </property> 
</widget> </item> <item row="0" column="0"> <widget class="QLabel" name="label_1"> <property name="font"> <font> <family>宋体</family> <pointsize>10</pointsize> </font> </property> <property name="text"> <string>请输入关键词:</string> </property> </widget> </item> <item row="0" column="1"> <widget class="QLineEdit" name="lineEdit_1"> <property name="text"> <string/> </property> </widget> </item> <item row="5" column="0"> <widget class="QLabel" name="label_4"> <property name="font"> <font> <pointsize>9</pointsize> </font> </property> <property name="text"> <string>输出结果:</string> </property> </widget> </item> <item row="2" column="1"> <widget class="QLineEdit" name="lineEdit_3"> <property name="text"> <string/> </property> </widget> </item> <item row="3" column="1"> <layout class="QHBoxLayout" name="horizontalLayout"> <item> <widget class="QPushButton" name="pushButton"> <property name="text"> <string>开始</string> </property> </widget> </item> <item> <widget class="QPushButton" name="pushButton_2"> <property name="text"> <string>结束</string> </property> </widget> </item> </layout> </item> </layout> </widget> </widget> <resources/> <connections> <connection> <sender>buttonBox</sender> <signal>accepted()</signal> <receiver>Dialog</receiver> <slot>accept()</slot> <hints> <hint type="sourcelabel"> <x>248</x> <y>254</y> </hint> <hint type="destinationlabel"> <x>157</x> <y>274</y> </hint> </hints> </connection> <connection> <sender>buttonBox</sender> <signal>rejected()</signal> <receiver>Dialog</receiver> <slot>reject()</slot> <hints> <hint type="sourcelabel"> <x>316</x> <y>260</y> </hint> <hint type="destinationlabel"> <x>286</x> <y>274</y> </hint> </hints> </connection> </connections> </ui>
  • Ui_cnki.py
# Form implementation generated from reading ui file 'd:\Cumtb_Code\PyQT\cnki.ui'
#
# Created by: PyQt6 UI code generator 6.1.0
#
# WARNING: Any manual changes made to this file will be lost when pyuic6 is
# run again. Do not edit this file unless you know what you are doing.


from PyQt6 import QtCore, QtGui, QtWidgets


class Ui_Dialog(object):
    """Widget tree of the CNKI scraper dialog, as generated by pyuic6.

    After ``setupUi`` the host dialog exposes these attributes:
    ``lineEdit_1`` (keyword), ``lineEdit_2`` (count), ``lineEdit_3`` (save
    path), ``toolButton`` (next to the path field), ``pushButton`` /
    ``pushButton_2`` (start / stop), ``textBrowser`` (output) and
    ``buttonBox`` (OK / Cancel).
    """

    def setupUi(self, Dialog):
        """Create all child widgets on *Dialog* and wire the button box."""
        Dialog.setObjectName("Dialog")
        Dialog.resize(490, 369)
        # OK / Cancel button box with a fixed geometry.
        self.buttonBox = QtWidgets.QDialogButtonBox(Dialog)
        self.buttonBox.setGeometry(QtCore.QRect(330, 330, 156, 24))
        self.buttonBox.setOrientation(QtCore.Qt.Orientation.Horizontal)
        self.buttonBox.setStandardButtons(QtWidgets.QDialogButtonBox.StandardButton.Cancel|QtWidgets.QDialogButtonBox.StandardButton.Ok)
        self.buttonBox.setObjectName("buttonBox")
        # Heading label.
        self.label_5 = QtWidgets.QLabel(Dialog)
        self.label_5.setGeometry(QtCore.QRect(180, 10, 131, 31))
        font = QtGui.QFont()
        font.setFamily("华文楷体")
        font.setPointSize(15)
        self.label_5.setFont(font)
        self.label_5.setObjectName("label_5")
        # Container widget that carries the grid layout of the form.
        self.gridLayoutWidget = QtWidgets.QWidget(Dialog)
        self.gridLayoutWidget.setGeometry(QtCore.QRect(70, 50, 361, 251))
        self.gridLayoutWidget.setObjectName("gridLayoutWidget")
        self.gridLayout = QtWidgets.QGridLayout(self.gridLayoutWidget)
        self.gridLayout.setContentsMargins(0, 0, 0, 0)
        self.gridLayout.setObjectName("gridLayout")
        # Grid row 1: count label and input.
        self.label_2 = QtWidgets.QLabel(self.gridLayoutWidget)
        font = QtGui.QFont()
        font.setFamily("宋体")
        font.setPointSize(10)
        self.label_2.setFont(font)
        self.label_2.setObjectName("label_2")
        self.gridLayout.addWidget(self.label_2, 1, 0, 1, 1)
        self.lineEdit_2 = QtWidgets.QLineEdit(self.gridLayoutWidget)
        self.lineEdit_2.setText("")
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.gridLayout.addWidget(self.lineEdit_2, 1, 1, 1, 1)
        # Grid row 5: output browser.
        self.textBrowser = QtWidgets.QTextBrowser(self.gridLayoutWidget)
        self.textBrowser.setObjectName("textBrowser")
        self.gridLayout.addWidget(self.textBrowser, 5, 1, 1, 1)
        # Grid row 2: path label and the tool button in column 2.
        self.label_3 = QtWidgets.QLabel(self.gridLayoutWidget)
        font = QtGui.QFont()
        font.setFamily("宋体")
        font.setPointSize(10)
        self.label_3.setFont(font)
        self.label_3.setObjectName("label_3")
        self.gridLayout.addWidget(self.label_3, 2, 0, 1, 1)
        self.toolButton = QtWidgets.QToolButton(self.gridLayoutWidget)
        self.toolButton.setObjectName("toolButton")
        self.gridLayout.addWidget(self.toolButton, 2, 2, 1, 1)
        # Grid row 0: keyword label and input.
        self.label_1 = QtWidgets.QLabel(self.gridLayoutWidget)
        font = QtGui.QFont()
        font.setFamily("宋体")
        font.setPointSize(10)
        self.label_1.setFont(font)
        self.label_1.setObjectName("label_1")
        self.gridLayout.addWidget(self.label_1, 0, 0, 1, 1)
        self.lineEdit_1 = QtWidgets.QLineEdit(self.gridLayoutWidget)
        self.lineEdit_1.setText("")
        self.lineEdit_1.setObjectName("lineEdit_1")
        self.gridLayout.addWidget(self.lineEdit_1, 0, 1, 1, 1)
        # Grid row 5: label for the output browser.
        self.label_4 = QtWidgets.QLabel(self.gridLayoutWidget)
        font = QtGui.QFont()
        font.setPointSize(9)
        self.label_4.setFont(font)
        self.label_4.setObjectName("label_4")
        self.gridLayout.addWidget(self.label_4, 5, 0, 1, 1)
        # Grid row 2: path input.
        self.lineEdit_3 = QtWidgets.QLineEdit(self.gridLayoutWidget)
        self.lineEdit_3.setText("")
        self.lineEdit_3.setObjectName("lineEdit_3")
        self.gridLayout.addWidget(self.lineEdit_3, 2, 1, 1, 1)
        # Grid row 3: the two push buttons side by side.
        self.horizontalLayout = QtWidgets.QHBoxLayout()
        self.horizontalLayout.setObjectName("horizontalLayout")
        self.pushButton = QtWidgets.QPushButton(self.gridLayoutWidget)
        self.pushButton.setObjectName("pushButton")
        self.horizontalLayout.addWidget(self.pushButton)
        self.pushButton_2 = QtWidgets.QPushButton(self.gridLayoutWidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.horizontalLayout.addWidget(self.pushButton_2)
        self.gridLayout.addLayout(self.horizontalLayout, 3, 1, 1, 1)

        self.retranslateUi(Dialog)
        # OK / Cancel map to the dialog's accept / reject.
        self.buttonBox.accepted.connect(Dialog.accept)
        self.buttonBox.rejected.connect(Dialog.reject)
        QtCore.QMetaObject.connectSlotsByName(Dialog)

    def retranslateUi(self, Dialog):
        """Set the window title and the visible text of labels and buttons."""
        _translate = QtCore.QCoreApplication.translate
        Dialog.setWindowTitle(_translate("Dialog", "Dialog"))
        self.label_5.setText(_translate("Dialog", "知网信息获取"))
        self.label_2.setText(_translate("Dialog", "请输入数量:"))
        self.label_3.setText(_translate("Dialog", "路径选择:"))
        self.toolButton.setText(_translate("Dialog", "..."))
        self.label_1.setText(_translate("Dialog", "请输入关键词:"))
        self.label_4.setText(_translate("Dialog", "输出结果:"))
        self.pushButton.setText(_translate("Dialog", "开始"))
        self.pushButton_2.setText(_translate("Dialog", "结束"))
 
  • demo.py
    • 这个版本可以正常抓取,但抓取逻辑运行在界面线程中,抓取期间UI界面会出现卡顿现象。可尝试使用多线程(把抓取放到工作线程中)解决。
"""PyQt6 front end for the CNKI Selenium scraper.

Description: henggao_note
version: v1.0.0
Date: 2022-04-08 14:45:26
LastEditors: henggao
LastEditTime: 2022-04-08 16:26:26

The dialog collects a keyword, the number of papers wanted and an output
path, then drives a headless Chrome through cnki.net and appends one
tab-separated line per paper to the chosen file.
"""
from selenium.webdriver.chrome.service import Service
from PyQt6.QtWidgets import QApplication, QDialog
import time
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from Ui_cnki import Ui_Dialog
from PyQt6 import QtWidgets

# Number of rows the result table shows per page.
_RESULTS_PER_PAGE = 20
# Seconds every explicit wait is allowed to take.
_WAIT_SECONDS = 10
# Absolute XPath of one cell of the result table (row and column are 1-based).
_RESULT_CELL_XPATH = (
    "/html/body/div[3]/div[2]/div[2]/div[2]/form/div/table/tbody"
    "/tr[{row}]/td[{col}]"
)
_INSTITUTE_XPATH = "/html/body/div[2]/div[1]/div[3]/div/div/div[3]/div/h3[2]/span/a"


def _wait_for(driver, locator):
    """Return the element matching *locator* once it is present in the DOM."""
    return WebDriverWait(driver, _WAIT_SECONDS).until(
        EC.presence_of_element_located(locator))


def _read_cell(driver, row, col):
    """Return the text of the result-table cell at (*row*, *col*)."""
    xpath = _RESULT_CELL_XPATH.format(row=row, col=col)
    return _wait_for(driver, (By.XPATH, xpath)).text


def _close_detail_windows(driver, main_window):
    """Close every window except *main_window* and switch back to it."""
    for handle in driver.window_handles:
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)


def _read_record(driver, row, title_link):
    """Collect all fields of the paper shown in result row *row*.

    Reads the list columns, clicks *title_link* to open the detail page in
    a new window and reads abstract, institute and keywords there.  The
    detail window is left open; the caller closes it.

    Returns the tuple (title, authors, institute, date, source, database,
    keywords, abstract, url).
    """
    title = _read_cell(driver, row, 2)
    authors = _read_cell(driver, row, 3)
    source = _read_cell(driver, row, 4)
    date = _read_cell(driver, row, 5)
    database = _read_cell(driver, row, 6)

    title_link.click()
    driver.switch_to.window(driver.window_handles[-1])

    abstract = _wait_for(driver, (By.CLASS_NAME, "abstract-text")).text
    # Institute and keywords are optional on the detail page.
    try:
        institute = _wait_for(driver, (By.XPATH, _INSTITUTE_XPATH)).text
    except WebDriverException:
        institute = '无'
    try:
        # Drop the trailing separator character.
        keywords = _wait_for(driver, (By.CLASS_NAME, "keywords")).text[:-1]
    except WebDriverException:
        keywords = '无'
    url = driver.current_url
    return (title, authors, institute, date, source, database,
            keywords, abstract, url)


class Cnkiprogrammer(QDialog, Ui_Dialog):
    """Dialog that runs a CNKI keyword search and saves the hits as TSV."""

    def __init__(self, parent=None):
        super(Cnkiprogrammer, self).__init__(parent)
        self.setupUi(self)
        # Set by the stop button; checked between records while scraping.
        self._stop_requested = False
        # Start button.
        self.pushButton.clicked.connect(self.startBtn)
        # Stop button.
        self.pushButton_2.clicked.connect(self.stopBtn)
        # Save-path chooser.
        self.toolButton.clicked.connect(self.savePath)

    def savePath(self):
        """Ask for the output file and put its path into the path field."""
        directory = QtWidgets.QFileDialog.getSaveFileName(
            self, "设置路径", "./", "All Files (*);;Text Files (*.tsv)")
        # A cancelled dialog returns an empty path; keep the current one.
        if directory[0]:
            self.lineEdit_3.setText(directory[0])

    def startBtn(self):
        """Validate the three inputs and start the search."""
        if not self.lineEdit_1.text().strip():
            print("请输入关键词")
            # Show the hint as placeholder so that it cannot be mistaken
            # for a keyword on the next click.
            self.lineEdit_1.clear()
            self.lineEdit_1.setPlaceholderText('请输入关键词')
            return
        try:
            papers_need = int(self.lineEdit_2.text())
        except ValueError:
            papers_need = 0
        if papers_need < 1:
            self.textBrowser.append('请输入有效的数量(正整数)' + '\n')
            return
        if not self.lineEdit_3.text().strip():
            self.textBrowser.append('请选择保存路径' + '\n')
            return

        self._stop_requested = False
        # processEvents() runs during the search, so block a second start.
        self.pushButton.setEnabled(False)
        try:
            self.linkSearch()
        except WebDriverException as err:
            self.textBrowser.append('检索失败: ' + str(err) + '\n')
        finally:
            self.pushButton.setEnabled(True)

    def stopBtn(self):
        """Request the running search to stop after the current record."""
        print("结束检索")
        self._stop_requested = True

    def linkSearch(self):
        """Open Chrome, search CNKI for the keyword and save the results.

        Reads keyword, count and output path from the dialog fields.
        Raises ValueError when the count field is not an integer and
        selenium's WebDriverException when the browser cannot be started
        or the search page cannot be driven.
        """
        theme = self.lineEdit_1.text()
        papers_need = int(self.lineEdit_2.text())
        save_path = self.lineEdit_3.text()
        print(save_path)

        # Intended to make get() return without waiting for the page load.
        # NOTE(review): this only mutates selenium's shared default
        # capabilities; whether ChromeOptions picks it up depends on the
        # Selenium version - confirm.
        desired_capabilities = DesiredCapabilities.CHROME
        desired_capabilities["pageLoadStrategy"] = "none"

        options = webdriver.ChromeOptions()
        # Do not load images, which speeds up page loads.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2})
        # Run without a visible browser window.
        options.add_argument('--headless')

        # The driver location goes through Service; the executable_path
        # keyword of webdriver.Chrome is gone in current Selenium 4.
        driver = webdriver.Chrome(
            options=options, service=Service('chromedriver.exe'))
        try:
            self._crawl(driver, theme, papers_need, save_path)
        finally:
            # quit() also ends the chromedriver process; close() would
            # only close the current window.
            driver.quit()

    def _crawl(self, driver, theme, papers_need, save_path):
        """Run the search in *driver* and append the records to *save_path*."""
        driver.get("https://www.cnki.net")
        search_box = driver.find_element(
            by=By.CSS_SELECTOR, value=".search-input")
        search_box.send_keys(theme)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)

        # Switch the result list to Chinese-language literature.
        _wait_for(driver, (By.XPATH,
                           "//div[@class='switch-ChEn']/a[@class='ch']")).click()
        time.sleep(1)

        # Total number of hits; the page shows it with thousands separators.
        res_unm = _wait_for(
            driver, (By.XPATH, "//span[@class='pagerTitleCell']/em")).text
        res_unm = int(res_unm.replace(",", ''))
        # Ceiling division, so an exact multiple of 20 is not one page too many.
        page_unm = -(-res_unm // _RESULTS_PER_PAGE)
        success_text = '共查询到' + str(res_unm) + \
            '条数据,' + str(page_unm)+'页' + '\n'
        self.textBrowser.append(success_text)

        # Never wait for more records than the search returned.
        papers_target = min(papers_need, res_unm)
        # 1-based sequence number of the record being processed.
        count = 1
        # Number of records actually written to the file.
        saved = 0

        while count <= papers_target and not self._stop_requested:
            title_list = WebDriverWait(driver, _WAIT_SECONDS).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "fz14")))
            main_window = driver.current_window_handle

            # The row number comes from the position on the page, so the
            # row that is read and the link that is clicked belong together.
            for row, title_link in enumerate(title_list, start=1):
                try:
                    (title, authors, institute, date, source, database,
                     keywords, abstract, url) = _read_record(
                        driver, row, title_link)
                    res = f"{count}\t{title}\t{authors}\t{institute}\t{date}\t{source}\t{database}\t{keywords}\t{abstract}\t{url}".replace(
                        "\n", "")+"\n"
                    # errors='replace' keeps the record when a character
                    # has no GBK representation.
                    with open(save_path, 'a', encoding='gbk',
                              errors='replace') as f:
                        f.write(res)
                    saved += 1
                    success_text = '第' + str(count) + '条数据抓取成功' + '\n'
                    self.textBrowser.append(success_text)
                except (WebDriverException, OSError):
                    # Report the failure and go on with the next record.
                    self.textBrowser.append(
                        '第' + str(count) + '条数据抓取失败' + '\n')
                finally:
                    # Close the detail window (if any), back to the list.
                    _close_detail_windows(driver, main_window)
                    count += 1

                # Refresh the dialog and deliver pending clicks (e.g. stop).
                # The waits above still block the UI thread meanwhile.
                QApplication.processEvents()
                if count > papers_target or self._stop_requested:
                    break

            if count > papers_target or self._stop_requested:
                break

            # Move on to the next result page.
            _wait_for(driver, (By.XPATH, "//a[@id='PageNext']")).click()

        success_text = '抓取数据结束,共抓取' + \
            str(saved) + '条数据' + '\n'
        self.textBrowser.append(success_text)


def main():
    """Create the Qt application and show the scraper dialog."""
    import sys
    app = QApplication(sys.argv)
    pr = Cnkiprogrammer()
    pr.show()
    sys.exit(app.exec())


if __name__ == '__main__':
    main()