I wrote a crawler that scrapes a cosmetics shopping site, but I ran into a problem with pagination:
The strange part is: if I type the next-page URL in directly as a string, it works and the spider moves on, but when I build the URL with response.css + urllib.parse.urljoin, it never turns the page (I have already tested in the shell and the CSS selector is correct, see below):
>>> next_urlid = str(response.css(".module-pagination-main.myaccount-product-list a:nth-child(3)::attr(href)").extract()[1])
>>> next_url1 = "https://www.sephora.cn/brand/givenchy-190/page3/?hasInventory=0&sortField=1&sortMode=desc"
>>> next_url2 = parse.urljoin(response.url, next_urlid)
>>> next_url2
'https://www.sephora.cn/brand/givenchy-190/page2/?hasInventory=0&sortField=1&sortMode=desc'
>>>
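For reference, urljoin resolves a root-relative href against the current page's URL. A minimal standalone sketch of what the call above does (the href value is an assumption, inferred from the urljoin output shown in the session):

from urllib.parse import urljoin

# Base URL of the page being parsed (the spider's start URL).
base = "https://www.sephora.cn/brand/givenchy-190/page1/?hasInventory=0&sortField=1&sortMode=desc"
# Assumed value of next_urlid: an href beginning with "/" replaces the
# entire path and query string of the base URL.
href = "/brand/givenchy-190/page2/?hasInventory=0&sortField=1&sortMode=desc"

print(urljoin(base, href))
# -> https://www.sephora.cn/brand/givenchy-190/page2/?hasInventory=0&sortField=1&sortMode=desc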
The relevant pagination code:
import scrapy
from scrapy import Request
from urllib import parse
from selenium import webdriver

# Project-specific imports; the actual module paths may differ in my project.
from items import ArticleItem
from utils.common import get_md5


class SpGivSpider(scrapy.Spider):
    name = 'sp_giv'
    allowed_domains = ['www.sephora.cn']
    start_urls = ['https://www.sephora.cn/brand/givenchy-190/page1/?hasInventory=0&sortField=1&sortMode=desc']

    def __init__(self):
        self.browser = webdriver.Chrome(executable_path="D:/sp_test/chromedriver.exe")
        super().__init__()
        # dispatcher.connect(self.spider_closed, signals.spider_closed)

    def start_requests(self):
        url = "https://www.sephora.cn/brand/givenchy-190/page1/?hasInventory=0&sortField=1&sortMode=desc"
        yield scrapy.Request(url, callback=self.parse)

    def close(self, spider):
        self.browser.quit()
    # def spider_closed(self, spider):
    #     # close Chrome when the spider exits
    #     print("spider closed")
    #     self.browser.quit()

    def parse(self, response):
        # extract the detail-page link and thumbnail image for each product
        article_url = response.css("ul.cate_prod .p_img a")
        for post_url in article_url:
            post_img = post_url.css("img::attr(src)").extract_first("")
            post_url = post_url.css("::attr(href)").extract_first("")
            yield scrapy.Request(post_url, meta={"img_url": post_img}, callback=self.parse_detail)

        # next_urlid = "/brand/givenchy-190/page3/?hasInventory=0&sortField=1&sortMode=desc"
        next_urlid = response.css(".module-pagination-main.myaccount-product-list a:nth-child(3)::attr(href)").extract()[1]
        # next_url1 = "https://www.sephora.cn/brand/givenchy-190/page3/?hasInventory=0&sortField=1&sortMode=desc"
        # next_url = "https://www.sephora.cn" + next_urlid
        next_url = parse.urljoin(response.url, next_urlid)
        if next_urlid:
            response = scrapy.Request(url=next_url, callback=self.parse)
            yield Request

    def parse_detail(self, response):
        article_item = ArticleItem()
        post_img_url = response.meta.get("img_url", "")
        sp_select = response.xpath('//*[@id="root"]/div/div[4]/div/div[3]/div[1]/div/p[1]/text()').extract()
        # t_selector = Selector(text=self.browser.get(Request.url))
        article_item["price"] = int(response.xpath('//p[contains(@class,"three")]/text()').extract()[0])
        article_item["points"] = response.css('.points .nowPoint::text').extract()
        article_item["url_object_id"] = get_md5(response.url)
        article_item["sp_select"] = sp_select
        article_item["post_img_url"] = post_img_url
        article_item["url"] = response.url
        yield article_item
If I use next_url1 in the highlighted (blue) line, the next_url assignment, pagination succeeds. (Note that in the shell session above, next_url1 points at page3 while the urljoin result points at page2.)
Could someone kindly take a look? This has been bothering me for days. Thanks!
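P.S. In case it helps, a small diagnostic sketch (hypothetical; meant to be run in scrapy shell on the listing page) that prints every href the pagination selector matches, since extract()[1] silently picks only the second match:

from urllib import parse

# Hypothetical diagnostic: list every pagination href the selector matches,
# to confirm which index actually holds the "next page" link.
hrefs = response.css(".module-pagination-main.myaccount-product-list a::attr(href)").extract()
for i, href in enumerate(hrefs):
    print(i, parse.urljoin(response.url, href))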