Python crawler: grabbing a batch of beauty photos

I recently watched a few videos on Python crawlers and decided to try writing one myself. The plan was to scrape every photo in one section of a site, but a rough estimate put that at several hundred GB, so I settled for downloading a small sample first.
This crawler uses Python 2.7. The target site does not load its images through JavaScript, so no JS parsing is involved: everything works by fetching the HTML and using regular expressions, step by step, to extract the image URLs and then save the files.
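
The core of the whole thing is just that fetch-and-match loop. Here is a minimal sketch of the idea (the listing URL and the GBK decoding are taken from the full script below; the rest is illustrative):

# -*- coding: utf8 -*-
import urllib
import re

# fetch one page, decode it (the site serves GBK), and list the <img> URLs on it
page = urllib.urlopen("http://www.5442.com/meinv/")
html = unicode(page.read(), "gbk").encode("utf8")
for src in re.findall(r'<img src="(.*?)"', html):
    print src
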
The site has many categories. Under the photo-gallery subsection there are many list pages, each list page holds several albums, each album spans several pages, and each of those pages contains several photos.
The flow is roughly this:

Find the total number of list pages
—- iterate over all list pages
—- iterate over all albums on the current page (creating a directory per album)
—- iterate over all images in the current album (walk every page of the album and, on each page, find the photo URLs)
—- save each image once its URL is found (the URL patterns this walk relies on are sketched below)
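
For reference, a sketch of the two URL patterns the walk depends on (the list-page pattern is from the code below; the album URL is a made-up example that merely fits the meinv/2.../....html pattern the script matches):

# list pages: page 1 is the section root, later pages are list_1_N.html
url = "http://www.5442.com/meinv/"                       # page 1
url = "http://www.5442.com/meinv/list_1_%s.html" % 2     # pages 2, 3, ...

# album pages: strip the trailing ".html" and append _N for page N
album_url = "http://www.5442.com/meinv/2017/12345.html"  # hypothetical album
page_url = album_url[0:-5] + "_%d" % 3 + ".html"         # page 3 of that album
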
Enough talk; on to the code.
This version is the one that runs on Windows.

# -*- coding: utf8 -*-

import urllib
import re
import os
import time
import socket

x = 0  # global counter used by get_img

def get_html(url):
    socket.setdefaulttimeout(10)
    page = urllib.urlopen(url)
    html = page.read()
    # the site serves GBK; re-encode to UTF-8 so the Chinese regexes below match
    html = unicode(html, "gbk").encode("utf8")
    return html

# early experiment: grab every <img> on a page (not used in the main flow below)
def get_img(html):
    imgre = re.compile(r'<img src="(.*?)"')
    imglist = re.findall(imgre, html)
    global x
    for imgurl in imglist:
        print imgurl
        urllib.urlretrieve(imgurl, '.\\photo\\%05d.jpg' % x)
        x += 1
        print("downloading image %s" % x)

# also unused in the main flow: collect the tag links on a page
def get_tag_list(html):
    szurlre = re.compile(r'<a href="(http://www.5442.com/tag/.*?\.html)" class')
    tag_list = re.findall(szurlre, html)
    return tag_list

# total page count of the section: the "末页" (last page) link carries it
def get_page_num(html):
    szurlre = re.compile(r'(\d+)\.html\'>末页')
    szresult = re.findall(szurlre, html)
    if len(szresult) == 0:
        page_num = 0
    else:
        page_num = int(szresult[0])
    print page_num
    return page_num

# page count of a single album, taken from the "共N页" (N pages in total) label
def get_page_num2(html):
    szurlre = re.compile(r'共(\d+)页')
    szresult = re.findall(szurlre, html)
    if len(szresult) == 0:
        page_num = 0
    else:
        page_num = int(szresult[0])
    print page_num
    return page_num

# collect the album links on one list page
def get_album_list(html):
    szurlre = re.compile(r'(http://www.5442.com/meinv/2\d+/\d+\.html)" target=')
    album_list = re.findall(szurlre, html)
    return album_list

# extract the album title (unused; see the note in the main loop)
def get_album_name(html):
    szurlre = re.compile(r'<title>(\S+)</title>')
    album_name = re.findall(szurlre, html)
    return album_name[0]

# download every photo on one album page
def get_photo(html, dirname, photo_num):
    imgre = re.compile(r'点击图片进入下一页\' ><img src=\'(http://\S+\.jpg)\' alt=')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        try:
            socket.setdefaulttimeout(2)
            urllib.urlretrieve(imgurl, unicode('.\\photo\\%s\\%05d.jpg' % (dirname, photo_num), "utf8"))
            print("downloading image %s" % photo_num)
            photo_num = photo_num + 1
        except:
            continue
    return photo_num

url = "http://www.5442.com/meinv/"
baseurl = "http://www.5442.com"
html = get_html(url)
page_num = get_page_num(html)
print("%s pages in total" % page_num)
album_num = 0
try:
    os.mkdir("photo")
except OSError:
    print "directory already exists, resuming download"
# iterate over every list page (+1 so the last page is included)
for i in range(1, page_num + 1):
    if i != 1:
        url = "http://www.5442.com/meinv/list_1_%s.html" % i
        try:
            html = get_html(url)
        except:
            continue
    album_list = get_album_list(html)
    # iterate over every album on the current page
    for album_url in album_list:
        album_num = album_num + 1
        try:
            photo_html = get_html(album_url)
        except:
            continue
        url_part = album_url[0:-5]  # strip the trailing ".html"
        photo_page_num = get_page_num2(photo_html)
        # extracting the album title proved unreliable, so number the
        # directories instead; that also keeps them easy to sort
        # album_name = get_album_name(photo_html)
        album_name = "编程资料" + "%05d" % album_num
        print album_name
        photo_num = 0
        # create a directory for this album
        ui_album_name = unicode(album_name, "utf8")
        try:
            os.mkdir(".\\photo\\" + ui_album_name)
        except OSError:
            continue
        # walk the album's pages (page 1 is the photo_html fetched above)
        for j in range(1, photo_page_num + 1):
            if j != 1:
                album_url = url_part + "_%d" % j + ".html"
                try:
                    photo_html = get_html(album_url)
                except:
                    continue
            # save the photos on this page
            photo_num = get_photo(photo_html, album_name, photo_num)
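
One small thing worth noting: the script imports time but never calls it. If the site ever starts dropping connections, the first thing I would try is a polite pause between requests; a sketch (the helper name is mine, not part of the script above):

import time
import urllib

def get_html_polite(url, delay=0.5):
    # hypothetical variant of get_html: pause before each request
    # so the crawler does not hammer the server
    time.sleep(delay)
    page = urllib.urlopen(url)
    return unicode(page.read(), "gbk").encode("utf8")
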

A screenshot of it running:

So it runs successfully.

Below is the version that runs on Linux; the main differences are the encoding handling and the path format used when saving files.
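
Those two differences are exactly what os.path.join is for; a sketch of how the two scripts could share one path helper (the function is mine, not part of either version):

import os

def photo_path(album_name, photo_num):
    # yields .\photo\<album>\00001.jpg on Windows and ./photo/<album>/00001.jpg on Linux
    return os.path.join(".", "photo", album_name, "%05d.jpg" % photo_num)
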

#!/usr/bin/python
# -*- coding:utf8 -*-

import urllib
import re
import os
import time
import socket

x = 0  # global counter used by get_img

def get_html(url):
    socket.setdefaulttimeout(2)
    page = urllib.urlopen(url)
    html = page.read()
    # the site serves GBK; re-encode to UTF-8 so the Chinese regexes below match
    html = unicode(html, "gbk").encode("utf8")
    return html

# early experiment: grab every <img> on a page (not used in the main flow below)
def get_img(html):
    imgre = re.compile(r'<img src="(.*?)"')
    imglist = re.findall(imgre, html)
    global x
    for imgurl in imglist:
        print imgurl
        urllib.urlretrieve(imgurl, './photo/%05d.jpg' % x)
        x += 1
        print("downloading image %s" % x)

# also unused in the main flow: collect the tag links on a page
def get_tag_list(html):
    szurlre = re.compile(r'<a href="(http://www.5442.com/tag/.*?\.html)" class')
    tag_list = re.findall(szurlre, html)
    return tag_list

# total page count of the section: the "末页" (last page) link carries it
def get_page_num(html):
    szurlre = re.compile(r'(\d+)\.html\'>末页')
    szresult = re.findall(szurlre, html)
    if len(szresult) == 0:
        page_num = 0
    else:
        page_num = int(szresult[0])
    print page_num
    return page_num

# page count of a single album, taken from the "共N页" (N pages in total) label
def get_page_num2(html):
    szurlre = re.compile(r'共(\d+)页')
    szresult = re.findall(szurlre, html)
    if len(szresult) == 0:
        page_num = 0
    else:
        page_num = int(szresult[0])
    print page_num
    return page_num

# collect the album links on one list page
def get_album_list(html):
    szurlre = re.compile(r'(http://www.5442.com/meinv/2\d+/\d+\.html)" target=')
    album_list = re.findall(szurlre, html)
    return album_list

# extract the album title (unused; see the note in the main loop)
def get_album_name(html):
    szurlre = re.compile(r'<title>(\S+)</title>')
    album_name = re.findall(szurlre, html)
    return album_name[0]

# download every photo on one album page
def get_photo(html, dirname, photo_num):
    imgre = re.compile(r'点击图片进入下一页\' ><img src=\'(http://\S+\.jpg)\' alt=')
    imglist = re.findall(imgre, html)
    for imgurl in imglist:
        try:
            socket.setdefaulttimeout(2)
            urllib.urlretrieve(imgurl, './photo/%s/%05d.jpg' % (dirname, photo_num))
            print("downloading image %s" % photo_num)
            photo_num = photo_num + 1
        except:
            continue
    return photo_num

url = "http://www.5442.com/meinv/"
baseurl = "http://www.5442.com"
html = get_html(url)
page_num = get_page_num(html)
print("%s pages in total" % page_num)
album_num = 0
try:
    os.mkdir("./photo")
except OSError:
    print "directory already exists"
# iterate over every list page (+1 so the last page is included)
for i in range(1, page_num + 1):
    if i != 1:
        url = "http://www.5442.com/meinv/list_1_%s.html" % i
        try:
            html = get_html(url)
        except:
            continue
    album_list = get_album_list(html)
    # iterate over every album on the current page
    for album_url in album_list:
        album_num = album_num + 1
        try:
            photo_html = get_html(album_url)
        except:
            continue
        url_part = album_url[0:-5]  # strip the trailing ".html"
        photo_page_num = get_page_num2(photo_html)
        album_name = "编程资料" + "%05d" % album_num
        print album_name
        photo_num = 0
        # create a directory for this album (no unicode() dance needed on Linux)
        ui_album_name = album_name
        try:
            os.mkdir("./photo/" + ui_album_name)
        except OSError:
            continue
        # walk the album's pages (page 1 is the photo_html fetched above)
        for j in range(1, photo_page_num + 1):
            if j != 1:
                album_url = url_part + "_%d" % j + ".html"
                try:
                    photo_html = get_html(album_url)
                except:
                    continue
            # save the photos on this page
            photo_num = get_photo(photo_html, album_name, photo_num)

The result:

The save directory:
