|
python3抓取http://www.mzitu.com/的图片无法显示怎么解决?
似乎是防盗链
- #!/usr/bin/env python3
- import requests
- from bs4 import BeautifulSoup
- import os
- # 爬取目标
- url = 'http://www.mzitu.com/page/'
- parser = 'html.parser'
- cur_path = 'D:/LXYWorking/PythonFiles/Datamining/'
- # 设置报头,Http协议
- header = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'}
- # 爬取的预览页面数量
- preview_page_cnt = 2
- for cur_page in range(1, int(preview_page_cnt) + 1):
- cur_url = url + str(cur_page)
- cur_page = requests.get(cur_url, headers=header)
- # 解析网页
- soup = BeautifulSoup(cur_page.text, parser)
- # 图片入口和文字入口取一个即可
- preview_link_list = soup.find(id='pins').find_all('a', target='_blank')[1::2]
- for link in preview_link_list:
- dir_name = link.get_text().strip().replace('?', '')
- link = link['href']
- soup = BeautifulSoup(requests.get(link).text, parser)
- # 获取图片数量
- pic_cnt = soup.find('div', class_='pagenavi').find_all('a')[4].get_text()
- # 创建目录
- pic_path = cur_path
- if os.path.exists(pic_path):
- print('directory exist!')
- else:
- os.mkdir(pic_path)
- os.chdir(pic_path) # 进入目录,开始下载
- print('下载' + dir_name + '...')
- # 遍历获取每页图片的地址
- for pic_index in range(1, int(pic_cnt) + 1):
- pic_link = link + '/' + str(pic_index)
- cur_page = requests.get(pic_link, headers=header)
- soup = BeautifulSoup(cur_page.text, parser)
- pic_src = soup.find('div', 'main-image').find('img')['src']
- pic_name = pic_src.split('/')[-1]
- f = open(pic_name, 'wb')
- f.write(requests.get(pic_src, headers=header).content)
- f.close()
- os.chdir(cur_path) # 完成下载,退出目录
- print('下载完成')
复制代码 请问有什么解决的方法?
|
|