|
大家好,这是我写的爬取糗事百科的内容,图片以及评论数的爬虫,但是只能爬取到第4页的内容,不能爬取1-3页的内容,求助大家帮我看看问题。
from lxml import etree
import requests
import json
class QiubaiSpider:
    """Spider for qiushibaike.com: crawls pages 1-4 and saves each post's
    author, text content, photo URL, vote count and comment count as one
    JSON line per post in qiubai_1.txt."""

    def __init__(self):
        # '{}' placeholder is filled with the page number by get_url_list().
        self.url_temp = 'https://www.qiushibaike.com/8hr/page/{}/'
        # Browser-like User-Agent so the site does not reject the request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}

    def get_url_list(self):
        """Build the list of page URLs (pages 1 through 4)."""
        return [self.url_temp.format(i) for i in range(1, 5)]

    def parse_url(self, url):
        """Fetch one page and return its decoded HTML text.

        BUG FIX: the identifier had been censored/mangled to `resp**e`,
        which is a syntax error; restored to `response`. A timeout is
        added so the spider cannot hang forever on a dead connection.
        """
        print('Now parsing:', url)
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract one dict per post from the page HTML.

        Returns a list of dicts with keys: author_name, content, photo,
        stats_vote, stats_comments (missing fields become None / []).
        """
        html = etree.HTML(html_str)
        # One <div> per post under the content-left container.
        div_list = html.xpath('.//div[@id = "content-left"]/div')
        content_list = []
        for div in div_list:
            item = {}
            # Each xpath is evaluated once and reused, instead of twice
            # (once for the len() guard and once for the value).
            authors = div.xpath('.//h2/text()')
            item['author_name'] = authors[0].strip() if authors else None
            item['content'] = [s.strip() for s in div.xpath('.//div[@class="content"]/span/text()')]
            photos = div.xpath('.//div[@class="thumb"]/a/img/@src')
            # The site serves protocol-relative URLs ("//pic...."); add scheme.
            item['photo'] = 'https:' + photos[0] if photos else None
            votes = div.xpath('.//span[@class="stats-vote"]/i/text()')
            item['stats_vote'] = votes[0] if votes else None
            comments = div.xpath('.//span[@class="stats-comments"]/a/i/text()')
            item['stats_comments'] = comments[0] if comments else None
            content_list.append(item)
        # NOTE: a stray `print(div_list)` that followed this return was
        # unreachable dead code and has been removed.
        return content_list

    def save_content_list(self, content_list):
        """Append each extracted item as one JSON line to qiubai_1.txt."""
        with open('qiubai_1.txt', 'a', encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')
        print('保存成功')

    def run(self):
        """Main flow: build the URL list, then fetch / extract / save
        once PER PAGE.

        BUG FIX (the reported "only page 4 is saved" symptom): the
        extract and save steps must be indented INSIDE the for-loop.
        If they sit after the loop, `html_str` only holds the last
        fetched page, so pages 1-3 are silently dropped.
        """
        # 1. Build url_list from the page-number pattern.
        url_list = self.get_url_list()
        for url in url_list:
            # 2. Fetch this page.
            html_str = self.parse_url(url)
            # 3. Extract the posts from it.
            content_list = self.get_content_list(html_str)
            # 4. Save them before moving to the next page.
            self.save_content_list(content_list)
if __name__ == '__main__':
    # Script entry point: crawl and persist all configured pages.
    spider = QiubaiSpider()
    spider.run()
|
|