Batch-download the free resume templates from Chinaz (sc.chinaz.com); you can choose how many listing pages to scrape.
from lxml import etree
import requests
import os


# Parse one listing page and download every resume template on it
def cv_down(tree, headers):
    cv_href = tree.xpath('//div[@class="sc_warp mt20"]/div/div/div/a/@href')
    for href in cv_href:
        act_response = requests.get(url=href, headers=headers).text
        act_tree = etree.HTML(act_response)
        cv_title = act_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
        # requests guesses ISO-8859-1 for this page, so re-decode the title as UTF-8
        cv_title = cv_title[0].encode('ISO-8859-1').decode('utf-8') + '.rar'
        down_url = act_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')[0]
        doc = requests.get(url=down_url, headers=headers).content
        cv_path = './免费简历/' + cv_title
        with open(cv_path, 'wb') as fp:
            fp.write(doc)
        print(cv_title, 'downloaded!')


# Create the output folder if it does not already exist
if not os.path.exists('./免费简历'):
    os.mkdir('./免费简历')

first_page_url = 'http://sc.chinaz.com/jianli/free.html'
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
}
tree = etree.HTML(requests.get(url=first_page_url, headers=headers).text)
# The last pagination link holds the total number of pages
page_limit = tree.xpath('//div[@class="pagination fr clearfix clear"]/a[8]/b/text()')[0]
while True:
    print(page_limit + " pages at most")
    page = input("Please enter how many pages you want: ")
    if page.isdigit() and 1 <= int(page) <= int(page_limit):
        for i in range(1, int(page) + 1):
            if i == 1:
                cv_down(tree, headers)
            else:
                # Pages after the first follow a different URL pattern, so rebuild the link
                other_page_url = 'http://sc.chinaz.com/jianli/free_' + str(i) + '.html'
                response = requests.get(url=other_page_url, headers=headers).text
                tree = etree.HTML(response)
                cv_down(tree, headers)
        break
    else:
        print("Please enter a number between 1 and " + page_limit + "!")
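A note on the encode('ISO-8859-1').decode('utf-8') round-trip: requests falls back to ISO-8859-1 when the server's response omits a charset, which is why the title has to be re-decoded by hand. A cleaner alternative is to let requests re-detect the encoding from the raw bytes before reading .text. The sketch below is a hypothetical helper (fetch_tree is not part of the original script, and it assumes the server still omits the charset header):

import requests
from lxml import etree

def fetch_tree(url, headers):
    """Fetch a page and return an lxml tree with the encoding fixed up front."""
    resp = requests.get(url=url, headers=headers, timeout=10)
    # apparent_encoding runs charset detection on the response body,
    # e.g. returning 'utf-8' instead of the ISO-8859-1 default
    resp.encoding = resp.apparent_encoding
    return etree.HTML(resp.text)

With this helper, cv_down could build act_tree via fetch_tree(href, headers) and drop the encode/decode line entirely, since the XPath results would already come back as proper UTF-8 strings.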