|
最近在试一下怎么写爬虫,尝试着从有趣网站之家保存所有的有趣网站名及链接,并保存在txt文件中,可是遇到了一个问题,就是在进行递归下一页的链接地址的时候出现了一个AttributeError,看了好久都找不到原因,求助各位大神- #!/usr/bin/python
- # -*- coding:utf-8 -*-
- import urllib
- import re
- url = 'http://youquhome.com/'
- def info_get(urls):
- urlcode = urllib.urlopen(urls).read()
- titlere = r'<a href="http://youquhome.com/.*?/" rel="bookmark">(.*?[\u4e00-\u9fa5]*?.*?)</a>' #网页标题正则
- urlre = r'<a href="(.*?)" target="_blank" rel="external nofollow">' #网页链接正则
- nextre = r'<a class="nextpostslink" rel="next" href="(.*?)">' #下一页正则
- global codelist
- global urllist
- codelist = re.findall(titlere,urlcode)
- urllist = re.findall(urlre,urlcode)
- nexturl = re.findall(nextre,urlcode) #下一页的链接
- return nexturl
- def info_write():
- x = 0
- fo = open('youquurl.txt','a')
- while True:
- fo.write(codelist[x]+': ')
- fo.write(urllist[x]+'\n')
- x+=1
- if x==len(codelist) or x==len(urllist): #比较的作用是判断列表是否到了最后一个元素
- break
- fo.close()
- if __name__ == '__main__':
- while url != []:
- url = info_get(url)
- info_write()
- print 'Done!'
复制代码 下面是错误信息:
Traceback (most recent call last):
File "youqu_urlget.py", line 33, in <module>
url = info_get(url)
File "youqu_urlget.py", line 9, in info_get
urlcode = urllib.urlopen(urls).read()
File "/usr/lib/python2.7/urllib.py", line 87, in urlopen
return opener.open(url)
File "/usr/lib/python2.7/urllib.py", line 180, in open
fullurl = unwrap(toBytes(fullurl))
File "/usr/lib/python2.7/urllib.py", line 1060, in unwrap
url = url.strip()
AttributeError: 'list' object has no attribute 'strip'
各位大神帮帮忙。。。
|
|