Today I'm sharing a Baidu search scraper built with the Python BaiduSpider library.
To change the search keyword, just edit the query argument. Right now only the Baidu result links are extracted; if you need other fields, set a breakpoint, inspect the response, and clean and store the extra data yourself (see the sketch after the code).
import queue
import threading

from baiduspider import BaiduSpider


class MyThread(threading.Thread):
    """Worker thread: keeps pulling page numbers from the queue and scraping them."""

    def __init__(self, page_queue):
        super().__init__()
        self.page = page_queue

    def run(self) -> None:
        while True:
            if self.page.empty():
                break
            self.get_info()

    def get_info(self):
        try:
            # Use get_nowait() so a thread cannot block forever if another
            # worker drained the queue between the empty() check and here.
            page = self.page.get_nowait()
        except queue.Empty:
            return
        spider = BaiduSpider()
        response = spider.search_web(query="欧阳靖", pn=page)
        data = []
        for result in response["results"]:
            url = result.get("url")
            if url:
                data.append(url)
                continue
            result_type = result.get("type")
            # Baike (encyclopedia) results nest the URL one level deeper
            if result.get("result") and result_type != "total":
                data.append(result.get("result").get("url"))
                continue
            # Other result types (except "total" and "related") carry a list of sub-results
            if result_type != "total" and result_type != "related":
                for sub in result.get("results"):
                    data.append(sub.get("url"))
        print(data)


# Fill the work queue with the page numbers to crawl
work_queue = queue.Queue()
for i in range(1, 50):
    work_queue.put(i)

# Start 10 worker threads and wait for all of them to finish
threads = []
for _ in range(10):
    worker = MyThread(work_queue)
    worker.start()
    threads.append(worker)
for worker in threads:
    worker.join()
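If you want to take the extra step of cleaning and storing the data instead of just printing it, here is a minimal sketch. It assumes SQLite is an acceptable store and assumes the result dicts may also expose a "title" key; only "url" and "type" are confirmed by the code above, so check the real response with a breakpoint first. The save_results helper and the baidu_results.db filename are hypothetical names introduced here for illustration.

import sqlite3


def save_results(rows, db_path="baidu_results.db"):
    # Hypothetical helper: persist (url, title) pairs instead of printing them.
    # Each call opens its own short-lived connection, so it is safe to call
    # from the worker threads; the timeout gives SQLite room to serialise
    # concurrent writes from multiple threads.
    conn = sqlite3.connect(db_path, timeout=30)
    conn.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, title TEXT)")
    conn.executemany("INSERT INTO results (url, title) VALUES (?, ?)", rows)
    conn.commit()
    conn.close()


# Inside get_info() you would collect tuples instead of bare URLs, e.g.:
#     rows = [(r.get("url"), r.get("title")) for r in response["results"] if r.get("url")]
#     save_results(rows)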