|
我的spider
- # -*- coding: utf-8 -*-
- from scrapy.spiders import Spider
- from scrapy.selector import Selector
-
- from scrapy.http import Request
- from Myspider.items import MyspiderItem
- class BaiduSpider(Spider):
- name = "w3school"
- allowed_domains = ["263zw.com"]
- start_urls = [
- "http://www.263zw.com/53103/7518437.html"
- ]
-
- def parse(self, response):
- sel = Selector(response)
- items = []
- maindiv=sel.xpath('//div[@class="main"]');
- print maindiv.extract()
-
- title=maindiv[0].xpath('div/h3/text()').extract()
- desc=maindiv[0].xpath('//div[@id="chapterContent"]/text()').extract()
- encdesc=desc[0].decode('gbk','ignore')
- enctitle=title[0]
- print u'title:'+enctitle
- print u'desc:'+encdesc
- item = MyspiderItem()
- item['title']=enctitle
- item['desc']=encdesc
-
- return item
-
- #items = []
-
- #log.msg("Append done.",level='INFO')
- #return items
- #获得下一篇文章的url
- #urls = sel.xpath('//div[@class="page"]/a/@href').extract()
- #url="http://www.263zw.com" +urls[2]
-
- #print url
- #yield Request(url, callback=self.parse)#再次请求新的URL
-
复制代码
pipeline是
- # -*- coding: utf-8 -*-
- # Define your item pipelines here
- #
- # Don't forget to add your pipeline to the ITEM_PIPELINES setting
- # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
- import json
- import codecs
- class MyspiderPipeline(object):
-
- def __init__(self):
- self.file = codecs.open('D://w3school_data_utf8.json', 'wb', encoding='utf-8')
-
-
- def process_item(self, item, spider):
- print 'hello '+item['desc'].encode("GBK","ignore")
- #self.file.write(line['title'].decode("UTF-8", "ignore"))
- #print line
- self.file.write(item['desc'].encode("unicode_escape"))
- return item
复制代码
我爬取的网站是一个小说站,http://www.263zw.com/53103/7518437.html , 不知道是不是有防爬取的代码,
错误在spider的 :encdesc=desc[0].decode('gbk','ignore') ,
报错是 UnicodeEncodeError: 'ascii' codec can't encode characters in position 4-57: ordinal not in range(128)
如果我不加.decode('gbk','ignore') , 就会报 gbk什么错误illegal multibyte sequence python, 也是编码的问题,
不知道该怎么办了, 有知道的大神吗, 求救啊
|
|