|
本帖最后由 asd5412487 于 2017-12-7 22:34 编辑
- from bs4 import BeautifulSoup
- import requests
- import time
- import urllib
# Fetch the variety index page once; the resulting `soup` is shared
# (as a module-level global) by get_url() and get_item_info() below.
url = 'http://www.chinaseed114.com/seed/pzdq/'
index_response = requests.get(url)
# apparent_encoding guesses the real charset (the site is not utf-8).
index_response.encoding = index_response.apparent_encoding
soup = BeautifulSoup(index_response.text, 'lxml')
def get_url(url):
    """Return the href of every table link on the index page.

    NOTE(review): this reads the module-level ``soup`` (the index page);
    the ``url`` argument is never actually used.
    """
    return [anchor.get('href') for anchor in soup.select('tr > td > a')]
def get_single_url(url):
    """Collect variety names from pages 1..79 under one category url.

    Each page is ``<url><page>.html``; names are gathered via
    get_detail_name() and concatenated in page order.
    """
    collected = []
    for page in range(1, 80):
        page_url = '{}{}.html'.format(url, page)
        collected.extend(get_detail_name(page_url))
    return collected
def get_detail_name(url):
    """Return the list of variety names found on one listing page.

    Seedling ('miaomu') pages use a different layout and are delegated to
    get_miaomu_name(). On a failed request this returns [] — the original
    bare ``except: pass`` swallowed every error and fell through to an
    implicit ``return None``, which made get_single_url()'s
    ``c.extend(...)`` raise TypeError.
    """
    try:
        r = requests.get(url, timeout=8)
    except requests.RequestException:
        # Network/timeout problems: skip this page, keep the crawl going.
        return []
    r.encoding = r.apparent_encoding
    page = BeautifulSoup(r.text, 'lxml')
    if 'miaomu' in url:
        # NOTE(review): get_miaomu_name re-fetches the same url (kept from
        # the original behaviour).
        return get_miaomu_name(url)
    return [a.get_text() for a in page.select('tr > td > ul > li.t_c > a.px14')]
-
def get_miaomu_name(url):
    """Return variety names from a seedling ('miaomu') page.

    These pages list names under ``ul > li.catlist_li > a`` instead of the
    table layout used elsewhere.
    """
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    page = BeautifulSoup(response.text, 'lxml')
    return [link.get_text() for link in page.select('ul > li.catlist_li > a')]
def get_item_info(url):
    """Crawl every category and append one line per category to test.txt.

    FIX for the reported ``UnicodeEncodeError: 'gbk' codec can't encode
    character '\\ufffd'``: on Windows, ``open()`` defaults to the locale
    codec (gbk), which cannot encode characters such as U+FFFD that
    ``apparent_encoding`` decoding produces.  Opening the file with an
    explicit ``encoding='utf-8'`` makes every scraped character writable.
    """
    names = soup.select('tr > td > a')  # module-level soup = the index page
    urls = get_url(url)
    # Open once (still append mode) instead of re-opening per iteration,
    # and always with utf-8 so any scraped character can be written.
    with open("test.txt", 'a+', encoding='utf-8') as f:
        for name, url1 in zip(names, urls):
            # The original fetched url1 into an unused soup1 here — dead
            # code and a wasted request; get_single_url does its own fetching.
            data = {
                '品种': name.get_text(),
                '详细': get_single_url(url1)
            }
            f.write('品种:{}\t详细:{}\n'.format(data['品种'], data['详细']))

get_item_info(url)
复制代码 代码的模块是没有错的,主要问题是后面的文件保存那里。
出现的错误是:UnicodeEncodeError: 'gbk' codec can't encode character '\ufffd' in position 1501: illegal multibyte sequence
我看不懂错误的原因，也不懂该如何修改。
请大神们指教。 |
|