The true 4K images require logging in, and even then the site only allows one download per day, so this script only parses and downloads the HD images from the content pages (around 1200x633).
The code handles every odd user input I could think of so far, such as picking a category the site doesn't have, or asking for more pages than the category actually contains. Without further ado, here is the code.
```python
# By.Zeno
import requests
from lxml import etree
import os


# Parse a category (list) page: ask how many pages to crawl, then hand each page to crawler()
def name(url, type_name, headers):
    url = url + page_list_url[int(type_name)]
    # The site serves GBK but requests guesses ISO-8859-1, hence the re-encode/decode round-trip
    type_label = page_list_name[int(type_name)].encode('ISO-8859-1').decode('gbk')
    doc_name = './' + type_label
    page_list_text = requests.get(url=url, headers=headers).text
    page_list_tree = etree.HTML(page_list_text)
    # The last pagination link holds the category's total page count
    page_list_limit = page_list_tree.xpath('//*[@id="main"]/div[4]/a[7]/text()')[0]
    while True:
        print("{} has at most {} pages".format(type_label, page_list_limit))
        page_need = input("How many pages of {} do you want to crawl: ".format(type_label))
        # Validate the input: digits only, and within the category's page range
        if page_need.isdigit() and 1 <= int(page_need) <= int(page_list_limit):
            if not os.path.exists(doc_name):
                os.mkdir(doc_name)
            for i in range(1, int(page_need) + 1):
                if i == 1:  # page 1 is the category index itself
                    crawler(page_list_tree, doc_name, headers)
                else:  # pages 2 and up follow the index_N.html pattern
                    page_url = url + 'index_' + str(i) + '.html'
                    page_list_text = requests.get(url=page_url, headers=headers).text
                    page_list_tree = etree.HTML(page_list_text)
                    crawler(page_list_tree, doc_name, headers)
            break
        else:
            print("Please enter a valid number")


# Parse the image links on a list page, open each content page, and save the HD image
def crawler(page_list_tree, doc_name, headers):
    img_list_url = page_list_tree.xpath('//*[@id="main"]/div[3]/ul/li/a/@href')
    img_name = page_list_tree.xpath('//*[@id="main"]/div[3]/ul/li/a/b/text()')
    for i in range(len(img_name)):
        img_url = 'http://pic.netbian.com' + img_list_url[i]
        name = img_name[i].encode('ISO-8859-1').decode('gbk') + '.jpg'
        img_page = requests.get(url=img_url, headers=headers).text
        img_page_tree = etree.HTML(img_page)
        # The HD image on the content page sits in the element with id="img"
        page_img_src = img_page_tree.xpath('//*[@id="img"]/img/@src')[0]
        page_img_src = 'http://pic.netbian.com' + page_img_src
        img = requests.get(url=page_img_src, headers=headers).content
        img_path = doc_name + '/' + name
        with open(img_path, 'wb') as fp:
            fp.write(img)
        print(name + ' downloaded!')


url = 'http://pic.netbian.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'
}
# Grab the home page once to collect the category links and names
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
page_list_url = tree.xpath('//*[@id="main"]/div[2]/a/@href')
page_list_name = tree.xpath('//*[@id="main"]/div[2]/a/text()')
while True:
    print("0.Scenery 1.Beauties 2.Games 3.Anime 4.Film&TV 5.Celebrities 6.Cars 7.Animals 8.People 9.Food 10.Religion 11.Backgrounds")
    type_name = input("Enter the corresponding number: ")
    # Validate: digits only, and within the range of crawlable categories
    if type_name.isdigit() and 0 <= int(type_name) <= 11:
        name(url, type_name, headers)
        break
    else:
        print("Please enter a valid number")
```
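One thing worth calling out: the `.encode('ISO-8859-1').decode('gbk')` round-trips exist because pic.netbian.com serves GBK-encoded pages without declaring a charset, so requests falls back to ISO-8859-1 when building `.text`. A minimal sketch of a cleaner alternative, assuming the site still serves GBK, is to set the response encoding explicitly so every extracted string decodes correctly up front:

```python
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('http://pic.netbian.com', headers=headers)
resp.encoding = 'gbk'  # declare the real charset so .text decodes correctly

tree = etree.HTML(resp.text)
# category names now come out as proper Chinese strings,
# with no per-string encode/decode round-trip needed
page_list_name = tree.xpath('//*[@id="main"]/div[2]/a/text()')
print(page_list_name)
```

With that change, the folder names, prompts, and file names would all be usable directly, and the individual round-trips scattered through the script could be dropped.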