Multithreaded Crawler

Crawling Qiushibaike Jokes with Multiple Threads

The previous post included a demo that crawls jokes from Qiushibaike, but when there are many URLs to request, fetching them one by one is slow. Since most of the time is spent waiting on network responses, this is I/O-bound code, and the GIL has little impact here: a thread blocked on network I/O releases the GIL, so other threads can run in the meantime.
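To see why threads help despite the GIL, compare fetching a few pages sequentially versus in parallel. This is a minimal timing sketch, not part of the spider; the URLs and one-thread-per-URL approach are arbitrary choices for illustration:

# coding=utf-8
import threading
import time
import requests

urls = ["https://www.qiushibaike.com/hot/page/{}/".format(i) for i in range(1, 6)]
headers = {"User-Agent": "Mozilla/5.0"}

def fetch(url):
    requests.get(url, headers=headers)  # waiting for the response releases the GIL

# sequential: total time is roughly the sum of all round trips
start = time.time()
for url in urls:
    fetch(url)
print("sequential: %.2fs" % (time.time() - start))

# threaded: the round trips overlap, total time is roughly the slowest one
start = time.time()
threads = [threading.Thread(target=fetch, args=(url,)) for url in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("threaded:   %.2fs" % (time.time() - start))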

It mainly uses the threading module and the Queue class; there is plenty of material on both online, so I won't explain them in detail.
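The core idea is the producer-consumer pattern: worker threads loop forever, get() an item from a Queue, process it, and call task_done(); queue.join() then blocks until every put() has been matched by a task_done(). A stripped-down sketch of the pattern the spider uses (the squaring "work" is just a placeholder):

# coding=utf-8
import threading
from queue import Queue

q = Queue()

def worker():
    while True:
        n = q.get()       # blocks until an item is available
        print(n * n)      # "process" the item
        q.task_done()     # tell the queue this item is finished

for _ in range(3):        # three consumer threads share the work
    t = threading.Thread(target=worker)
    t.daemon = True       # daemon threads die when the main thread exits
    t.start()

for n in range(10):       # producer
    q.put(n)

q.join()                  # blocks until all ten items are task_done()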

Here is the demo rewritten to use multiple threads:

# coding=utf-8
import requests
from lxml import etree
import threading
from queue import Queue
import os

class QuibaiSpdier:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/hot/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36"}
        self.url_queue = Queue()      # URLs waiting to be requested
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # extracted items waiting to be saved

    def get_url_list(self):  # producer: enqueue the 13 hot-page URLs
        for i in range(1, 14):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):  # consumer of url_queue, producer of html_queue
        while True:
            url = self.url_queue.get()
            print("request:" + url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):  # extract the data from each HTML page
        while True:
            html_str = self.html_queue.get()
            html_element = etree.HTML(html_str)
            div_list = html_element.xpath("//div[@id='content-left']/div")
            content_list = []
            for div in div_list:
                item = {}
                item["content"] = div.xpath(".//div[@class='content']/span/text()")
                item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(item["author_gender"]) > 0 else None
                item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
                item["content_img"] = "https:" + item["content_img"][0] if len(item["content_img"]) > 0 else None
                item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
                item["author_img"] = "https:" + item["author_img"][0] if len(item["author_img"]) > 0 else None
                item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
                item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):  # save the data
        while True:
            content_list = self.content_queue.get()
            with open("qiubai.txt", "a", encoding='utf-8') as f:
                for content in content_list:
                    # stats_vote can be None, so fall back to "0" before concatenating
                    f.write("\n".join(content["content"]) + " votes: " + (content["stats_vote"] or "0"))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):  # wire the pipeline together
        thread_list = []
        # 1. build the url list
        t_geturl = threading.Thread(target=self.get_url_list)
        thread_list.append(t_geturl)
        # 2. request each url: seven threads share the network work
        for i in range(7):
            t_parseurl = threading.Thread(target=self.parse_url)
            thread_list.append(t_parseurl)
        # 3. extract the data: three parser threads
        for i in range(3):
            t_getcontent = threading.Thread(target=self.get_content_list)
            thread_list.append(t_getcontent)
        # 4. save the data
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.daemon = True  # daemon threads are killed when the main thread exits
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block until every item put on this queue has been marked task_done
        print("main thread finished")


if __name__ == '__main__':
    if os.path.exists("qiubai.txt"):  # start with a fresh output file
        os.remove("qiubai.txt")
    qiubai = QuibaiSpdier()
    qiubai.run()
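As an aside, since Python 3.2 the standard library's concurrent.futures gives the same request-stage parallelism without managing threads and queues by hand. This is only a sketch of the fetch stage, not a drop-in replacement for the whole pipeline; max_workers=7 mirrors the seven request threads above:

# coding=utf-8
from concurrent.futures import ThreadPoolExecutor
import requests

headers = {"User-Agent": "Mozilla/5.0"}
urls = ["https://www.qiushibaike.com/hot/page/{}/".format(i) for i in range(1, 14)]

def fetch(url):
    return requests.get(url, headers=headers).content.decode()

# the pool starts the threads, distributes the urls, and joins for us
with ThreadPoolExecutor(max_workers=7) as pool:
    html_list = list(pool.map(fetch, urls))
print(len(html_list), "pages fetched")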