|
写了一段从百度贴吧爬图片的程序,之前是因为有url没有"http:"而出现报错因此写了一个小循环,如果不存在http:则加上将url加上http:
import re
import urllib.request
def getHtml(url):
page = urllib.request.urlopen(url)
html = page.read()
return html
def getImg(html):
reg=r'src="(.*?\.jpg)"'
imgre=re.compile(reg)
html=html.decode('utf-8')#python3
imglist=re.findall(imgre,html)
x=0
for wd1 in imglist:
if'http:'not in wd1:
wd2='http:'+wd1
urllib.request.urlretrieve(wd2,"%s.jpg" %x)
else:
urllib.request.urlretrieve(wd1,"%s.jpg" %x)
x+=1
html=getHtml("https://tieba.baidu.com/p/3589472971?red_tag=0137538703")
print(getImg(html))
但是加上之后报错了,具体错误会如下
URLError Traceback (most recent call last)<ipython-input-34-9e85601cc9f3> in <module>() 22 23html=getHtml("https://tieba.baidu.com/p/3589472971?red_tag=0137538703")---> 24print(getImg(html))<ipython-input-34-9e85601cc9f3> in getImg(html) 16 if'http:'not in wd1: 17 wd2='http:'+wd1---> 18urllib.request.urlretrieve(wd2,"%s.jpg" %x) 19 else: 20 urllib.request.urlretrieve(wd1,"%s.jpg" %x)D:\Anaconda\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data) 246 url_type, path = splittype(url) 247 --> 248with contextlib.closing(urlopen(url, data)) as fp: 249 headers =fp.info() 250 D:\Anaconda\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context) 221 else: 222 opener = _opener--> 223return opener.open(url, data, timeout) 224 225 def install_opener(opener):D:\Anaconda\lib\urllib\request.py inopen(self, fullurl, data, timeout) 522 for processor in self.process_request.get(protocol, []): 523 meth = getattr(processor,meth_name)--> 524req = meth(req) 525 526 resp**e = self._open(req, data)D:\Anaconda\lib\urllib\request.py in do_request_(self, request) 1239 host = request.host 1240 if not host:-> 1241raise URLError('no host given') 1242 1243 if request.data is notNone: # POSTURLError: <urlopen error no host given>请大神看一下,错误出在哪里,用的是python3.6版本
|
|