I wrote a crawler that scrapes a cosmetics shopping site, but I ran into a problem with pagination:
The strange part is: if I type the next-page URL in directly as a string, it works and the spider moves on, but when I build the URL with response.css + urllib.parse.urljoin, it never turns the page (I have already tested in the shell and the CSS selector is correct, see below):
>>> next_urlid = str(response.css(".module-pagination-main.myaccount-product-list a:nth-child(3)::attr(href)").extract()[1])
>>> next_url1 = "https://www.sephora.cn/brand/givenchy-190/page3/?hasInventory=0&sortField=1&sortMode=desc"
>>> next_url2 = parse.urljoin(response.url, next_urlid)
>>> next_url2
'https://www.sephora.cn/brand/givenchy-190/page2/?hasInventory=0&sortField=1&sortMode=desc'
>>>
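For reference, urljoin resolves a root-relative href against the current page's URL. A minimal standalone sketch of what the call above does (the href value is an assumption, inferred from the urljoin output shown in the session):

from urllib.parse import urljoin

# Base URL of the page being parsed (the spider's start URL).
base = "https://www.sephora.cn/brand/givenchy-190/page1/?hasInventory=0&sortField=1&sortMode=desc"
# Assumed value of next_urlid: an href beginning with "/" replaces the
# entire path and query string of the base URL.
href = "/brand/givenchy-190/page2/?hasInventory=0&sortField=1&sortMode=desc"

print(urljoin(base, href))
# -> https://www.sephora.cn/brand/givenchy-190/page2/?hasInventory=0&sortField=1&sortMode=desc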
The relevant pagination code:
import scrapy
from scrapy import Request
from urllib import parse
from selenium import webdriver

# Project-specific imports; the actual module paths may differ in my project.
from items import ArticleItem
from utils.common import get_md5


class SpGivSpider(scrapy.Spider):
    name = 'sp_giv'
    allowed_domains = ['www.sephora.cn']
    start_urls = ['https://www.sephora.cn/brand/givenchy-190/page1/?hasInventory=0&sortField=1&sortMode=desc']

    def __init__(self):
        self.browser = webdriver.Chrome(executable_path="D:/sp_test/chromedriver.exe")
        super().__init__()
        # dispatcher.connect(self.spider_closed, signals.spider_closed)

    def start_requests(self):
        url = "https://www.sephora.cn/brand/givenchy-190/page1/?hasInventory=0&sortField=1&sortMode=desc"
        yield scrapy.Request(url, callback=self.parse)

    def close(self, spider):
        self.browser.quit()
    # def spider_closed(self, spider):
    #     # close Chrome when the spider exits
    #     print("spider closed")
    #     self.browser.quit()

    def parse(self, response):
        # extract the detail-page link and thumbnail image for each product
        article_url = response.css("ul.cate_prod .p_img a")
        for post_url in article_url:
            post_img = post_url.css("img::attr(src)").extract_first("")
            post_url = post_url.css("::attr(href)").extract_first("")
            yield scrapy.Request(post_url, meta={"img_url": post_img}, callback=self.parse_detail)

        # next_urlid = "/brand/givenchy-190/page3/?hasInventory=0&sortField=1&sortMode=desc"
        next_urlid = response.css(".module-pagination-main.myaccount-product-list a:nth-child(3)::attr(href)").extract()[1]
        # next_url1 = "https://www.sephora.cn/brand/givenchy-190/page3/?hasInventory=0&sortField=1&sortMode=desc"
        # next_url = "https://www.sephora.cn" + next_urlid
        next_url = parse.urljoin(response.url, next_urlid)
        if next_urlid:
            response = scrapy.Request(url=next_url, callback=self.parse)
            yield Request

    def parse_detail(self, response):
        article_item = ArticleItem()
        post_img_url = response.meta.get("img_url", "")
        sp_select = response.xpath('//*[@id="root"]/div/div[4]/div/div[3]/div[1]/div/p[1]/text()').extract()
        # t_selector = Selector(text=self.browser.get(Request.url))
        article_item["price"] = int(response.xpath('//p[contains(@class,"three")]/text()').extract()[0])
        article_item["points"] = response.css('.points .nowPoint::text').extract()
        article_item["url_object_id"] = get_md5(response.url)
        article_item["sp_select"] = sp_select
        article_item["post_img_url"] = post_img_url
        article_item["url"] = response.url
        yield article_item
If I use next_url1 in the highlighted (blue) line, the next_url assignment, pagination succeeds. (Note that in the shell session above, next_url1 points at page3 while the urljoin result points at page2.)
Could someone kindly take a look? This has been bothering me for days. Thanks!
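P.S. In case it helps, a small diagnostic sketch (hypothetical; meant to be run in scrapy shell on the listing page) that prints every href the pagination selector matches, since extract()[1] silently picks only the second match:

from urllib import parse

# Hypothetical diagnostic: list every pagination href the selector matches,
# to confirm which index actually holds the "next page" link.
hrefs = response.css(".module-pagination-main.myaccount-product-list a::attr(href)").extract()
for i, href in enumerate(hrefs):
    print(i, parse.urljoin(response.url, href))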