使用requests爬取图片
- 简单,就是速度较慢
'''
Description: henggao_note
version: v1.0.0
Date: 2022-03-17 17:12:46
LastEditors: henggao
LastEditTime: 2022-06-14 16:42:54

Synchronous image scraper: walks the site's update list, follows each
gallery's pagination, and saves every image under ./data/<gallery name>/.
'''
import re
import requests
from lxml import etree
import time
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
url = 'https://www.xiurenb.com'

resp = requests.get(url=url, headers=headers)
resp.encoding = 'utf-8'
tree = etree.HTML(resp.text)
# One <li> per gallery on the update-list page.
li_list = tree.xpath("//ul[@class='update_area_lists cl']//li")

for li in li_list:
    girl_url = li.xpath('./a/@href')[0]
    # Absolute URL of the gallery's detail page.
    box_url = url + girl_url
    # Gallery name from the index page; used as the folder name.
    girl_name = li.xpath('./a/div/span/text()')[0]
    girl_dir = './data/' + girl_name
    # makedirs(exist_ok=True) also creates a missing ./data parent and
    # does not crash when the folder already exists (os.mkdir did both).
    os.makedirs(girl_dir, exist_ok=True)

    # Fetch the detail page to discover its pagination links.
    page_detail = requests.get(url=box_url, headers=headers)
    page_detail.encoding = 'utf-8'
    detail_tree = etree.HTML(page_detail.text)
    page_div = detail_tree.xpath("//div[@class='page']")[0]
    page_links = page_div.findall("a")
    # Drop the leading "previous" and trailing "next" navigation links.
    page_links.pop(0)
    page_links.pop()

    for link in page_links:
        page_url = url + link.get('href')
        page_detail = requests.get(url=page_url, headers=headers)
        page_detail.encoding = 'utf-8'
        detail_tree = etree.HTML(page_detail.text)
        # Image sources, e.g. /uploadfile/202206/13/1B154140463.jpg
        img_srcs = detail_tree.xpath('/html/body/div[3]/div/div/div[5]/p/img/@src')
        # Gallery name as shown on this page; it is not always identical
        # to girl_name from the index page.
        girl_page_name = detail_tree.xpath('/html/body/div[3]/div/div/div[2]/div/a[3]/span/text()')[0]
        print(girl_page_name)
        # Images below are stored under the page-level name, so make sure
        # that folder exists too (the original only created girl_dir and
        # crashed with FileNotFoundError when the two names differed).
        os.makedirs('./data/' + girl_page_name, exist_ok=True)
        # Download the (typically three) images of this page.
        for src in img_srcs:
            print(src)
            # Everything after the last "/" (slash included) is the file name.
            file_name = src[src.rfind("/"):]
            img_path = './data/' + girl_page_name + file_name
            img_url = url + src
            # Fetch and write the image bytes.
            img_bytes = requests.get(url=img_url, headers=headers).content
            with open(img_path, 'wb') as fp:
                fp.write(img_bytes)
使用异步优化
什么是 aiohttp?一个异步的 HTTP 客户端/服务端框架,基于 asyncio 的异步模块。可用于实现异步爬虫,更快于 requests 的同步爬虫。
pip install aiohttp
实战
'''
Description: henggao_note
version: v1.0.0
Date: 2022-03-17 17:12:46
LastEditors: henggao
LastEditTime: 2022-06-15 11:44:31

Async image scraper: the gallery list is read once, then every gallery is
crawled concurrently with aiohttp, bounded by a semaphore.
'''
import re
import requests
from lxml import etree
import os
import aiohttp
import asyncio
import time

home_url = "https://www.xiurenb.com/new.html"
url = 'https://www.xiurenb.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
# Cap concurrent gallery downloads at 500 (the original comment said 100,
# which contradicted the actual value).
semaphore = asyncio.Semaphore(500)


async def RealDownLoad(img_url, filename):
    """Download every image URL in img_url into ./data/<filename>/."""
    async with semaphore:
        async with aiohttp.ClientSession() as sess:
            for per_img_url in img_url:
                # Everything after the last "/" (slash included) is the
                # image file name.
                per_name = per_img_url[per_img_url.rfind("/"):]
                down_gril_name = './data/' + filename + per_name
                # sess.get() is an async context manager itself; the extra
                # `await` in the original was redundant.
                async with sess.get(url=per_img_url, headers=headers) as response:
                    print(down_gril_name)
                    img_bytes = await response.read()
                    with open(down_gril_name, 'wb') as img:
                        img.write(img_bytes)


async def DownLoad(page_url_list, filename):
    """Collect all image URLs from the given pages, then download them."""
    img_url = []
    # One session reused for every page (the original opened a fresh
    # ClientSession per page, which is needlessly expensive).
    async with aiohttp.ClientSession() as sess:
        for page in page_url_list:
            async with sess.get(url=page, headers=headers) as response:
                page_detail = await response.text()
                detail_tree = etree.HTML(page_detail)
                list_url = detail_tree.xpath('/html/body/div[3]/div/div/div[5]/p/img/@src')
                # Each page carries a handful of images; build absolute URLs.
                for per_url in list_url:
                    img_url.append(url + per_url)
    return await RealDownLoad(img_url, filename)


async def GetInDetailPage(boxes):
    """Resolve one gallery's pagination links and download all its pages.

    boxes: dict with keys 'url' (detail-page URL) and 'girl_name'.
    """
    page_url_list = []
    # Fetch the detail page with aiohttp so the event loop is never
    # blocked (the original called the synchronous requests.get here,
    # which stalls every other running task).
    async with aiohttp.ClientSession() as sess:
        async with sess.get(url=boxes['url'], headers=headers) as response:
            detail_tree = etree.HTML(await response.text())
    page_div = detail_tree.xpath("//div[@class='page']")[0]
    page_href = page_div.findall("a")
    # Drop the leading "previous" and trailing "next" navigation links.
    page_href.pop(0)
    page_href.pop()
    for k in page_href:
        page_url_list.append(url + k.get('href'))
    return await DownLoad(page_url_list, boxes['girl_name'])


async def main():
    """Read the update list, create the folders, and crawl concurrently."""
    tasks = []
    # The single index request happens before any task is scheduled, so a
    # synchronous requests.get is acceptable here.
    resp = requests.get(url=home_url, headers=headers)
    resp.encoding = 'utf-8'
    tree = etree.HTML(resp.text)
    li_list = tree.xpath("//ul[@class='update_area_lists cl']//li")
    box_url_list = []
    for li in li_list:
        gril_url = li.xpath('./a/@href')[0]
        girl_name = li.xpath('./a/div/span/text()')[0]
        # makedirs(exist_ok=True) also creates a missing ./data parent and
        # tolerates an already-existing folder (os.mkdir did neither).
        os.makedirs('./data/' + girl_name, exist_ok=True)
        box_url_list.append({'url': url + gril_url, 'girl_name': girl_name})
    # One task per gallery, all awaited together.
    for boxes in box_url_list:
        tasks.append(asyncio.ensure_future(GetInDetailPage(boxes)))
    await asyncio.wait(tasks)


loop = asyncio.get_event_loop()
start = time.time()
loop.run_until_complete(main())
print("用时: ", time.time() - start)