# -*- coding: utf-8 -*-
# 多线程抓取知网文献信息 (multithreaded scraper for CNKI literature metadata)
import sys
import time

from PyQt6 import QtWidgets
from PyQt6.QtCore import QThread, pyqtSignal
from PyQt6.QtWidgets import QApplication, QDialog
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from Ui_cnki import Ui_Dialog
class Cnkiprogrammer(QDialog, Ui_Dialog):
    """Main dialog: collects the search settings and launches the scraper.

    The dialog owns one ``WorkThread``.  Settings are handed to the thread
    through the module-level globals ``driver``, ``theme``, ``papers_need``
    and ``save_path`` assigned in ``startBtn``.  Text emitted on the thread's
    ``trigger`` signal is appended to ``textBrowser`` by ``display``.
    """

    def __init__(self, parent=None):
        """Build the UI, create the worker thread and wire up the signals."""
        super(Cnkiprogrammer, self).__init__(parent)
        self.setupUi(self)
        # Worker thread that performs the scraping.
        self.work = WorkThread()
        # Connect the progress signal exactly once.  Connecting on every
        # click (as before) adds another connection each time, so every
        # message is appended once per previous click.
        self.work.trigger.connect(self.display)
        # "Start" button.
        self.pushButton.clicked.connect(self.startBtn)
        # "Choose save path" button.
        self.toolButton.clicked.connect(self.savePath)

    def savePath(self):
        """Ask the user for the output file and show it in ``lineEdit_3``."""
        directory = QtWidgets.QFileDialog.getSaveFileName(
            self, "设置路径", "./", "All Files (*);;Text Files (*.tsv)")
        chosen = directory[0]
        # An empty string means the dialog was cancelled; keep whatever path
        # was already entered instead of wiping it.
        if chosen:
            self.lineEdit_3.setText(chosen)

    def startBtn(self):
        """Validate the inputs, create the browser and start the worker.

        All inputs are validated *before* the Chrome driver is created, so an
        invalid entry no longer leaves an orphaned browser behind.
        """
        global driver, theme, papers_need, save_path

        # A run is already in progress: starting again would launch a second
        # browser and overwrite the globals the running thread is reading.
        if self.work.isRunning():
            self.textBrowser.append('正在抓取中,请稍候')
            return

        keyword = self.lineEdit_1.text()
        if len(keyword) == 0:
            print("请输入关键词")
            self.lineEdit_1.setText('请输入关键词')
            return

        try:
            wanted = int(self.lineEdit_2.text())
        except ValueError:
            # Empty or non-numeric paper count.
            self.textBrowser.append('请输入有效的文献篇数')
            return

        target_path = self.lineEdit_3.text()
        if not target_path:
            # The worker appends to this file; it cannot run without a path.
            self.textBrowser.append('请先选择保存路径')
            return

        # NOTE(review): this mutates the shared DesiredCapabilities.CHROME
        # dict and the result is never passed to the driver explicitly.
        # Whether it takes effect depends on the Selenium version, so it is
        # kept exactly as it was, ahead of ChromeOptions().
        desired_capabilities = DesiredCapabilities.CHROME
        desired_capabilities["pageLoadStrategy"] = "none"
        # Chrome options.
        options = webdriver.ChromeOptions()
        # Do not load images, to speed up page loads.
        options.add_experimental_option(
            "prefs", {"profile.managed_default_content_settings.images": 2})
        # Run without a visible browser window.
        options.add_argument('--headless')

        # Publish the settings for WorkThread.run via module globals.
        driver = webdriver.Chrome(options=options)
        theme = keyword
        papers_need = wanted
        save_path = target_path

        self.work.start()

    def execute(self):
        """Start the worker thread (its signal is connected in ``__init__``)."""
        self.work.start()

    def display(self, str):
        """Slot for ``WorkThread.trigger``: append the text to the log view.

        The parameter keeps its original name ``str`` (which shadows the
        builtin inside this method) so the slot's signature is unchanged.
        """
        self.textBrowser.append(str)
class WorkThread(QThread):
    """Worker thread that scrapes CNKI search results and appends them to a file.

    The search settings are not constructor arguments.  ``run`` reads four
    module-level globals that ``Cnkiprogrammer.startBtn`` assigns before it
    calls ``start()``:

    * ``driver``      -- the Selenium Chrome driver to use (quit when done),
    * ``theme``       -- the search keyword,
    * ``papers_need`` -- how many result entries to visit,
    * ``save_path``   -- file that receives one tab-separated line per entry.

    Progress and status text is delivered to the GUI through ``trigger``.
    """

    # Custom signal carrying one status string for the GUI.
    trigger = pyqtSignal(str)

    # Result rows per page, as assumed by the page-count calculation.
    PAGE_SIZE = 20
    # Timeout, in seconds, for every explicit wait.
    WAIT_SECONDS = 10
    # Absolute XPath of one cell in the result table.
    CELL_XPATH = ("/html/body/div[3]/div[2]/div[2]/div[2]/form/div/table"
                  "/tbody/tr[{row}]/td[{col}]")

    def __init__(self, parent=None):
        """Initialise the thread (was misspelled ``__int__`` and never ran)."""
        super(WorkThread, self).__init__(parent)

    def run(self):
        """Thread entry point: scrape, report errors, always quit the browser."""
        print("开始")
        print(theme)
        print(papers_need)
        print(save_path)
        try:
            self._crawl()
        except Exception as err:
            # Top-level boundary of the worker: report the failure to the GUI
            # instead of letting it escape run() with no feedback.
            failure_text = '抓取出错:' + str(err) + '\n'
            print(failure_text)
            self.trigger.emit(failure_text)
        finally:
            # quit() ends the whole browser session; close() only closed the
            # current window and was skipped entirely when an error occurred.
            driver.quit()

    def _wait(self, condition):
        """Block until ``condition`` holds (or time out) and return its value."""
        return WebDriverWait(driver, self.WAIT_SECONDS).until(condition)

    def _cell_text(self, row, col):
        """Return the text of column ``col`` in result-table row ``row``."""
        xpath = self.CELL_XPATH.format(row=row, col=col)
        return self._wait(
            EC.presence_of_element_located((By.XPATH, xpath))).text

    def _optional_text(self, locator):
        """Return the located element's text, or ``None`` if it is unavailable."""
        try:
            return self._wait(EC.presence_of_element_located(locator)).text
        except WebDriverException:
            return None

    def _open_results(self):
        """Open CNKI, search for ``theme`` and switch to Chinese literature."""
        driver.get("https://www.cnki.net")
        # Wait for the search box rather than looking it up immediately, so
        # this also works when get() returns before the page has loaded.
        search_box = self._wait(EC.presence_of_element_located(
            (By.CSS_SELECTOR, ".search-input")))
        search_box.send_keys(theme)
        search_box.send_keys(Keys.ENTER)
        time.sleep(3)
        # Switch to the Chinese-literature tab.
        self._wait(EC.presence_of_element_located(
            (By.XPATH, "//div[@class='switch-ChEn']/a[@class='ch']"))).click()
        time.sleep(1)

    def _report_totals(self):
        """Read the total hit count and emit the result/page summary."""
        raw_total = self._wait(EC.presence_of_element_located(
            (By.XPATH, "//span[@class='pagerTitleCell']/em"))).text
        # Strip thousands separators before converting.
        total = int(raw_total.replace(",", ''))
        # Ceiling division: int(total / 20) + 1 reported one page too many
        # whenever the total was an exact multiple of the page size.
        pages = (total + self.PAGE_SIZE - 1) // self.PAGE_SIZE
        print(f"共找到 {total} 条结果, {pages} 页。")
        success_text = '共查询到' + str(total) + \
            '条数据,' + str(pages) + '页' + '\n'
        self.trigger.emit(success_text)

    def _close_extra_windows(self, main_window):
        """Close every window except ``main_window`` and switch back to it."""
        for handle in driver.window_handles:
            if handle != main_window:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)

    def _scrape_entry(self, count, row, link):
        """Scrape one result entry and append it to ``save_path``.

        Args:
            count: Sequence number written as the first column.
            row: 1-based row of the entry in the current result table.
            link: Title element to click to open the detail page.

        Returns:
            ``True`` if the entry was written, ``False`` if a browser error
            made it impossible to scrape (the entry is then skipped).
        """
        main_window = driver.current_window_handle
        try:
            # Columns 2-6 of the result table.
            title, authors, source, date, database = [
                self._cell_text(row, col) for col in range(2, 7)]
            # Open the entry and move to the newest window.
            link.click()
            driver.switch_to.window(driver.window_handles[-1])
            abstract = self._wait(EC.presence_of_element_located(
                (By.CLASS_NAME, "abstract-text"))).text
            # Institute and keywords are optional on the detail page.
            institute = self._optional_text(
                (By.XPATH,
                 "/html/body/div[2]/div[1]/div[3]/div/div/div[3]/div/h3[2]/span/a"))
            if institute is None:
                institute = '无'
            keywords = self._optional_text((By.CLASS_NAME, "keywords"))
            # The last character of the keyword text is dropped, as before.
            keywords = '无' if keywords is None else keywords[:-1]
            url = driver.current_url
            res = f"{count}\t{title}\t{authors}\t{institute}\t{date}\t{source}\t{database}\t{keywords}\t{abstract}\t{url}".replace(
                "\n", "") + "\n"
            # errors='replace': a character GBK cannot encode used to raise
            # and silently drop the whole entry.
            with open(save_path, 'a', encoding='gbk', errors='replace') as f:
                f.write(res)
            success_text = '第' + str(count) + '条数据抓取成功' + '\n'
            print(success_text)
            self.trigger.emit(success_text)
            return True
        except WebDriverException:
            # Skip this entry but tell the user, instead of failing silently.
            failure_text = '第' + str(count) + '条数据抓取失败' + '\n'
            print(failure_text)
            self.trigger.emit(failure_text)
            return False
        finally:
            # Return to the result list whatever happened above.
            self._close_extra_windows(main_window)

    def _goto_next_page(self):
        """Click the "next page" link; return ``False`` if that is not possible."""
        try:
            self._wait(EC.presence_of_element_located(
                (By.XPATH, "//a[@id='PageNext']"))).click()
        except WebDriverException:
            return False
        return True

    def _crawl(self):
        """Run the search and visit entries until ``papers_need`` is reached."""
        self._open_results()
        self._report_totals()

        # ``count`` is the sequence number of the entry being attempted and
        # advances on failures too; ``saved`` counts the entries written.
        count = 1
        saved = 0
        while count <= papers_need:
            try:
                title_list = self._wait(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, "fz14")))
            except TimeoutException:
                # No result links appeared: nothing (more) to scrape.
                break
            for row, link in enumerate(title_list, start=1):
                if self._scrape_entry(count, row, link):
                    saved += 1
                count += 1
                if count > papers_need:
                    break
            if count > papers_need:
                # Target reached: do not click through to another page.
                break
            if not self._goto_next_page():
                # Ran out of result pages before reaching the target.
                break
            # NOTE(review): nothing waits for the next page to replace the
            # current rows before they are looked up again - confirm that the
            # explicit waits above are sufficient for this site.

        success_text = '抓取数据结束,共抓取' + str(saved) + '条数据' + '\n'
        print(success_text)
        self.trigger.emit(success_text)
def main():
    """Create the Qt application, show the dialog and run the event loop.

    Returns:
        The exit code returned by ``QApplication.exec()``.
    """
    app = QApplication(sys.argv)
    pr = Cnkiprogrammer()
    pr.show()
    return app.exec()


if __name__ == '__main__':
    # Pass the event loop's exit code on to the operating system.
    sys.exit(main())
High arsenic (As) groundwater in reduced shallow Holocene and Pleistocene aquifers has been intensively investigated, but the occurrence and genesis mechanisms of high-As groundwater in deep Pliocene aquifers affected by geothermal activity remain unclear. To address these issues, the geochemical characteristics of groundwater and aquifer sediments in both the middle-Pliocene and Quaternary aquifers of the Guide basin were investigated to clarify groundwater–sediment interactions and the causes of As enrichment in groundwater from the middle-Pliocene aquifer. Higher As and Na⁺ concentrations were observed in groundwater from the middle-Pliocene aquifer (GPA) than in groundwater from the Quaternary aquifer (GQA), while GPA had lower Ca²⁺ concentrations than GQA. The results showed that middle-Pliocene aquifer sediments had lower contents of carbonate minerals and water-soluble Ca-bearing minerals than Quaternary aquifer sediments, which explains the higher Ca²⁺ concentrations in GQA than in GPA. Na⁺ derived from the weathering of silicates, denoted (Na⁺)* and calculated from mass balance, accounted for a high proportion of dissolved Na⁺ (up to 68%) in GPA. Weathering of silicates was related to As accumulation in GPA, as evidenced by a positive correlation between As and (Na⁺)* in GPA and by the high proportion of As bound to unweathered silicates (up to 65.7%) in middle-Pliocene aquifer sediments. The weathering of silicate minerals directly released silicate-bound As into GPA, and indirectly led to As desorption from solid surfaces by increasing pH and the concentrations of HCO₃⁻ and CO₃²⁻. Both (Na⁺)* and As in groundwater increased with increasing groundwater temperature, indicating that high temperature was conducive to the weathering of silicates and to As enrichment. This paper links high groundwater temperature to high As concentrations through the weathering of silicates in aquifers.