|
一是先得到这个li[@class="entry"]列表,接着循环在每一个标签下获取内容,但xpath直接就得到了所有的内容,按道理应该是每次得到一个字符串呢
这是一个ajax的网页,https://www.ithome.com/html/android/321451.htm 这是获取内容的网页,代码中post后得到的。虽然得到数据了,但是这儿的逻辑我就弄不清楚,求解决
这是一部分代码,可调试
#-*-coding:utf-8-*-
__author__ = 'duolaAO'
import requests
from lxml import etree
import re
#Fetch IT hot comments
def parse_hot_comment(newsid):
    """Fetch the "hot comments" for one ithome.com article.

    POSTs ``newsID``/``type=hotcomment`` to the ajax endpoint and parses the
    returned HTML fragment.

    :param newsid: article id string extracted from the article URL
    :return: list of dicts with keys 'content', 'user', 'address', 'time',
             one per comment; empty list on request/parse failure or when
             the article has no hot comments.
    """
    info_list = []
    data = {
        'newsID': newsid,
        'type': 'hotcomment'
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
    try:
        html = requests.post('https://dyn.ithome.com/ithome/getajaxdata.aspx', headers=headers, data=data)
        html.encoding = "utf-8"
        selector = etree.HTML(html.text)
        comment_list = selector.xpath('//li[@class="entry"]')
        for comment in comment_list:
            # BUG FIX: the original used absolute paths ('//p/text()'), which
            # search the WHOLE document on every iteration and therefore return
            # every comment's text each time. A relative path ('.//') scopes
            # the query to the current <li class="entry"> element only.
            #comment content
            content = comment.xpath('.//p/text()')
            #user name
            user = comment.xpath('.//strong[2]/a/text()')
            #address
            comment_address = comment.xpath('.//span[@class="mobile android"]/text()')
            #post time
            # NOTE(review): identical selector to the address above — the post
            # time presumably lives in a different span; confirm against the
            # actual ajax markup.
            comment_time = comment.xpath('.//span[@class="mobile android"]/text()')
            info_list.append({
                'content': content[0].strip() if content else '',
                'user': user[0].strip() if user else '',
                'address': comment_address[0].strip() if comment_address else '',
                'time': comment_time[0].strip() if comment_time else '',
            })
    except Exception as e:
        print(e)
    # BUG FIX: the original never returned info_list, so callers always got
    # None and main() always printed the "no comments" message.
    return info_list
def parse_news_id(newsid, page_start):
    """Yield article ids for one ithome.com category, 10 pages per call.

    POST payload observed in the browser's dev tools::

        categoryid: 71
        type: pccategorypage
        page: 3

    :param newsid: category id string (e.g. '71' for Android)
    :param page_start: first category page to fetch
    :yield: article id strings extracted from each article link's href
    """
    #find article ids under the current category
    data = {
        'categoryid': newsid,
        'type': 'pccategorypage',
        'page': '1'
    }
    # BUG FIX: the comment promised 10 pages per batch but
    # range(page_start, page_start + 11) iterated 11; +10 matches the intent.
    for page in range(page_start, page_start + 10):
        data['page'] = str(page)
        try:
            html = requests.post('http://it.ithome.com/ithome/getajaxdata.aspx', data=data)
            selector = etree.HTML(html.content)
            news_list = selector.xpath('//li/a[@class="list_thumbnail"]')
            #yield every news id found on this page
            for news in news_list:
                # hrefs look like .../321451.htm — the digits are the news id
                yield re.search(r'\d+', news.xpath('@href')[0]).group()
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; catch Exception and stop the generator instead.
        except Exception as e:
            print(e)
            return
def main(page_start):
    """Crawl the ithome Android category starting at *page_start* and
    print every article's hot comments (or a not-found message)."""
    #'71' is the Android news category id
    android_category_id = '71'
    #lazily iterate every article id the category pages yield
    for newsid in parse_news_id(android_category_id, page_start):
        comments = parse_hot_comment(newsid)
        #guard clause: nothing to show for this article
        if not comments:
            print("评论不存在")
            continue
        for comment in comments:
            print(comment)
# Guard the entry point so importing this module doesn't start a crawl.
if __name__ == "__main__":
    main(12)
|
-
|