# NOTE: This was meant to crawl asynchronously, but the async part does not seem to
# take effect — the listing pages are still fetched synchronously with requests.
import requests
from lxml import etree
import os
import asyncio
import aiohttp


def get_ppt_list(page_need, headers):
    """Walk the listing pages and download every PPT found on each of them.

    Each listing page is fetched synchronously with ``requests`` (this is why
    the crawl is not fully asynchronous), then one download task per PPT on
    that page is run concurrently on the event loop.

    Args:
        page_need: number of listing pages to process (digit string or int).
        headers:   HTTP headers (at least a User-Agent) reused for every request.
    """
    # Reuse one event loop for the whole crawl instead of fetching it per page.
    loop = asyncio.get_event_loop()
    for i in range(1, int(page_need) + 1):
        # Page 1 has no numeric suffix on this site; later pages do.
        # This collapses the two previously-duplicated branches into one body.
        if i == 1:
            url = 'http://sc.chinaz.com/ppt/free.html'
        else:
            url = 'http://sc.chinaz.com/ppt/free_{}.html'.format(i)
        list_page_text = requests.get(url=url, headers=headers).text
        list_page_tree = etree.HTML(list_page_text)
        list_ppt_url = list_page_tree.xpath('//*[@id="main"]/div/div/a/@href')
        # One task per detail page; the whole page's downloads run concurrently.
        tasks = [asyncio.ensure_future(get_ppt(ppt_url, headers))
                 for ppt_url in list_ppt_url]
        if tasks:  # asyncio.wait() raises ValueError on an empty task set
            loop.run_until_complete(asyncio.wait(tasks))


async def get_ppt(url, headers):
    """Download a single PPT: open its detail page, find the download link, save the .rar."""
    async with aiohttp.ClientSession() as session:
        # session.get() already returns an async context manager;
        # the original's extra "await" before it was redundant.
        async with session.get(url=url, headers=headers) as ppt_page:
            ppt_page_text = await ppt_page.text()
        ppt_page_tree = etree.HTML(ppt_page_text)
        title = ppt_page_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')[0]
        title = title + '.rar'
        ppt_path = './站长PPT/' + title
        # Third mirror in the download list — TODO confirm it stays at li[3].
        down_url = ppt_page_tree.xpath('//*[@id="down"]/div[2]/ul/li[3]/a/@href')[0]
        async with session.get(url=down_url, headers=headers) as ppt_resp:
            data = await ppt_resp.read()
        with open(ppt_path, 'wb') as fp:
            fp.write(data)
        print("{} 下载成功!!!".format(title))


# ---- script entry: ask how many pages to crawl, then run ----
url = 'http://sc.chinaz.com/ppt/free.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
}
# Read the total page count from the pagination widget of the first listing page.
list_page_text = requests.get(url=url, headers=headers).text
list_page_tree = etree.HTML(list_page_text)
page_limit = list_page_tree.xpath('//div[@class="pagination fr clearfix clear"]/a[8]/b/text()')[0]
while True:
    print("该模板页数上限是{}, 请输入您所需要的总页数".format(page_limit))
    page_need = input("<<<")
    if page_need.isdigit() and 1 <= int(page_need) <= int(page_limit):
        if not os.path.exists('./站长PPT'):
            os.mkdir('./站长PPT')
        get_ppt_list(page_need, headers)
        break
    else:
        print("请重新输入正确的页数")
        continue