|
一是先得到这个li[@class="entry"]列表,接着循环在每一个标签下获取内容,但xpath直接就得到了所有的内容,按道理应该是每次得到一个字符串呢
这是一个ajax的网页,https://www.ithome.com/html/android/321451.htm 这是获取内容的网页,代码中post后得到的。虽然得到数据了,但是这儿的逻辑我就弄不清楚,求解决
这是一部分代码,可调试
#-*-coding:utf-8-*-
__author__ = 'duolaAO'
import requests
from lxml import etree
import re
#Fetch IT hot comments
def parse_hot_comment(newsid):
    """Fetch the "hot comments" for one ithome.com article.

    POSTs ``newsID``/``type=hotcomment`` to the ajax endpoint and parses the
    returned HTML fragment.

    :param newsid: article id string extracted from the article URL
    :return: list of dicts with keys 'content', 'user', 'address', 'time',
             one per comment; empty list on request/parse failure or when
             the article has no hot comments.
    """
    info_list = []
    data = {
        'newsID': newsid,
        'type': 'hotcomment'
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
    try:
        html = requests.post('https://dyn.ithome.com/ithome/getajaxdata.aspx', headers=headers, data=data)
        html.encoding = "utf-8"
        selector = etree.HTML(html.text)
        comment_list = selector.xpath('//li[@class="entry"]')
        for comment in comment_list:
            # BUG FIX: the original used absolute paths ('//p/text()'), which
            # search the WHOLE document on every iteration and therefore return
            # every comment's text each time. A relative path ('.//') scopes
            # the query to the current <li class="entry"> element only.
            #comment content
            content = comment.xpath('.//p/text()')
            #user name
            user = comment.xpath('.//strong[2]/a/text()')
            #address
            comment_address = comment.xpath('.//span[@class="mobile android"]/text()')
            #post time
            # NOTE(review): identical selector to the address above — the post
            # time presumably lives in a different span; confirm against the
            # actual ajax markup.
            comment_time = comment.xpath('.//span[@class="mobile android"]/text()')
            info_list.append({
                'content': content[0].strip() if content else '',
                'user': user[0].strip() if user else '',
                'address': comment_address[0].strip() if comment_address else '',
                'time': comment_time[0].strip() if comment_time else '',
            })
    except Exception as e:
        print(e)
    # BUG FIX: the original never returned info_list, so callers always got
    # None and main() always printed the "no comments" message.
    return info_list
def parse_news_id(newsid, page_start):
    """Yield article ids for one ithome.com category, 10 pages per call.

    POST payload observed in the browser's dev tools::

        categoryid: 71
        type: pccategorypage
        page: 3

    :param newsid: category id string (e.g. '71' for Android)
    :param page_start: first category page to fetch
    :yield: article id strings extracted from each article link's href
    """
    #find article ids under the current category
    data = {
        'categoryid': newsid,
        'type': 'pccategorypage',
        'page': '1'
    }
    # BUG FIX: the comment promised 10 pages per batch but
    # range(page_start, page_start + 11) iterated 11; +10 matches the intent.
    for page in range(page_start, page_start + 10):
        data['page'] = str(page)
        try:
            html = requests.post('http://it.ithome.com/ithome/getajaxdata.aspx', data=data)
            selector = etree.HTML(html.content)
            news_list = selector.xpath('//li/a[@class="list_thumbnail"]')
            #yield every news id found on this page
            for news in news_list:
                # hrefs look like .../321451.htm — the digits are the news id
                yield re.search(r'\d+', news.xpath('@href')[0]).group()
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; catch Exception and stop the generator instead.
        except Exception as e:
            print(e)
            return
def main(page_start):
    """Crawl the ithome Android category starting at *page_start* and
    print every article's hot comments (or a not-found message)."""
    #'71' is the Android news category id
    android_category_id = '71'
    #lazily iterate every article id the category pages yield
    for newsid in parse_news_id(android_category_id, page_start):
        comments = parse_hot_comment(newsid)
        #guard clause: nothing to show for this article
        if not comments:
            print("评论不存在")
            continue
        for comment in comments:
            print(comment)
# Guard the entry point so importing this module doesn't start a crawl.
if __name__ == "__main__":
    main(12)
|
-
|