多线程爬取糗事百科段子
上一篇中有一个爬取糗事百科段子的demo,但是如果需要请求的url太多的情况下,一个一个请求肯定会很慢,影响效率,而且耗时主要是在网络请求中,属于IO密集型的代码,所以GIL锁在这里的影响不会很大。
主要用到了threading模块和queue相关的内容,而这部分内容网上一搜一大把- -不多解释。
以下为改写为多线程的代码:
（原文代码清单带有 1–98 的行号标注，此处已略去。）
| import requests from lxml import html import threading from queue import Queue import os
class QuibaiSpdier:
    """Multi-threaded spider for the qiushibaike.com "hot" joke pages.

    Work flows through a three-stage pipeline of queues:
    url_queue -> html_queue -> content_queue, with daemon worker threads
    at each stage. run() blocks until every queue has drained.
    """

    def __init__(self):
        # Hot-page URL template; pages 1-13 are crawled (see get_url_list).
        self.url_temp = "https://www.qiushibaike.com/hot/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # raw HTML strings waiting to be parsed
        self.content_queue = Queue()  # parsed item lists waiting to be saved

    def get_url_list(self):
        """Seed url_queue with the 13 hot-page URLs."""
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        """Worker loop: take a URL from url_queue, fetch it, push the decoded
        HTML onto html_queue."""
        while True:
            url = self.url_queue.get()
            print("request:" + url)
            # FIX: a timeout keeps a stalled connection from hanging this
            # worker (and the whole pipeline) forever.
            response = requests.get(url, headers=self.headers, timeout=10)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):
        """Worker loop: parse one HTML page from html_queue into a list of
        item dicts and push it onto content_queue."""
        while True:
            html_str = self.html_queue.get()
            html_elements = html.etree.HTML(html_str)
            div_list = html_elements.xpath("//div[@id='content-left']/div")
            content_list = []
            for div in div_list:
                item = {}
                item["content"] = div.xpath(".//div[@class='content']/span/text()")
                # Gender is encoded in a CSS class like "articleGender manIcon";
                # keep only the "man"/"women" part.
                gender_cls = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = (gender_cls[0].split(" ")[-1].replace("Icon", "")
                                         if gender_cls else None)
                img = div.xpath(".//div[@class='thumb']/a/img/@src")
                item["content_img"] = "https:" + img[0] if img else None
                author_img = div.xpath(".//div[@class='author clearfix']//img/@src")
                item["author_img"] = "https:" + author_img[0] if author_img else None
                vote = div.xpath(".//span[@class='stats-vote']/i/text()")
                item["stats_vote"] = vote[0] if vote else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        """Worker loop: append each parsed item list to qiubai.txt."""
        while True:
            content_list = self.content_queue.get()
            with open("qiubai.txt", "a", encoding='utf-8') as f:
                for content in content_list:
                    # FIX: stats_vote may be None when the <i> element is
                    # missing; the original crashed with TypeError here.
                    f.write("\n".join(content["content"])
                            + "点赞数:" + (content["stats_vote"] or ""))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        """Start all worker threads as daemons, then block until every queue
        has been fully processed."""
        thread_list = [threading.Thread(target=self.get_url_list)]
        for _ in range(7):
            thread_list.append(threading.Thread(target=self.parse_url))
        for _ in range(3):
            thread_list.append(threading.Thread(target=self.get_content_list))
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # FIX: Thread.setDaemon() is deprecated (removed in Python 3.13);
            # assign the daemon attribute instead. Daemon threads let the
            # process exit once the queues below are drained.
            t.daemon = True
            t.start()
        # Queue.join() returns once every put() item has had task_done()
        # called for it — i.e. the stage is fully processed.
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()
        print("主线程结束")
if __name__ == '__main__':
    # Remove any previous output so each run starts with a fresh file
    # (the workers open qiubai.txt in append mode).
    if os.path.exists("qiubai.txt"):
        os.remove("qiubai.txt")
    QuibaiSpdier().run()
|