多线程爬斗图啦

趁热打铁吧,再写一个爬虫练手,也是再次熟悉下BeautifulSoup库的使用。

（以下为完整源代码）
import requests
from bs4 import BeautifulSoup
import os
import re
import threading

# Name of the folder that downloaded images are saved into.
dirs="斗图"
# Spoof a desktop Chrome User-Agent so the site serves normal pages
# instead of blocking the default requests client.
headers ={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

def mkdir():
    """Ensure the download directory exists and make it the working directory.

    Returns:
        True if the directory was freshly created, False if it already
        existed (a notice is printed in that case).

    Note: the original version only chdir'd into ``dirs`` when it was
    freshly created, so on a re-run every image landed in the current
    directory instead. We now chdir unconditionally.
    """
    if not os.path.exists(dirs):
        os.mkdir(dirs)
        created = True
    else:
        print("文件夹已存在")
        created = False
    # Always switch into the folder so downloads go to the right place.
    os.chdir(dirs)
    return created

def get_one_page(url):
    """Scrape one listing page and return a list of (title, image_url) tuples.

    Args:
        url: URL of a doutula listing page.

    Returns:
        List of ``(alt_text, lazy_load_url)`` tuples, one per meme image
        found on the page; an empty list if the page could not be fetched.
    """
    try:
        # timeout so one hung page can't stall the whole crawl;
        # raise_for_status so an HTTP error page isn't parsed as content.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as e:
        # Best-effort crawler: skip a bad page instead of aborting the run.
        print("页面抓取失败", url, e)
        return []
    soup = BeautifulSoup(res.content, 'lxml')
    img_list = soup.find_all('img', attrs={'class': 'img-responsive lazy image_dta'})
    # The real image URL lives in data-original (lazy loading); alt is the title.
    return [(img['alt'], img['data-original']) for img in img_list]

def download_img(img, name, suf, i):
    """Write one already-downloaded image to disk in the current directory.

    Args:
        img:  response object whose ``.content`` holds the image bytes.
        name: sanitized file name, without extension.
        suf:  file extension without the dot, e.g. ``'jpg'``.
        i:    page number — unused, kept for interface compatibility.
    """
    print("正在下载" + name)
    # 'wb' instead of 'wb+': we only write, never read the file back.
    with open(name + "." + suf, 'wb') as f:
        f.write(img.content)

if __name__ == "__main__":
    mkdir()
    # Crawl every listing page; each page's downloads are written by a
    # batch of threads that we start and then join before moving on.
    for page in range(1, 2605):
        one_page_list = get_one_page('http://www.doutula.com/photo/list/?page=' + str(page))
        threads = []
        for alt, img_url in one_page_list:
            # Strip characters that are illegal/awkward in file names.
            name = re.sub(r'[\/:*?"<>|_]', '', alt)
            suf = img_url[-3:]  # 获取后缀 (file extension, e.g. 'jpg')
            img = requests.get(img_url)
            t = threading.Thread(target=download_img, args=[img, name, suf, page])
            threads.append(t)
        # BUG FIX: the original looped over range(0, len(threads)-1),
        # which never started or joined the LAST thread of each page,
        # and reused `i` as the index, shadowing the page counter.
        for t in threads:
            t.start()
        for t in threads:
            t.join()