|
本帖最后由 pythonLearner 于 2015-12-13 19:31 编辑
- __author__ = 'KS'
- # -*- coding: utf-8 -*-
- import urllib2
- import re
def getContentOrComment(urlJoin):
    """Download the page at ``urlJoin`` and return its raw body.

    The request is sent with a fixed browser-style ``User-Agent`` header
    (presumably so the site does not reject the default urllib2 agent --
    TODO confirm).

    :param urlJoin: fully formatted URL to fetch.
    :return: the response body exactly as returned by ``response.read()``.

    Errors raised by ``urllib2.urlopen`` are not caught here and propagate
    to the caller.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url=urlJoin, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        response.close()
- # 文章地址
- url = "http://www.qiushibaike.com/textnew/page/%d"
- # 评论地址
- commentUrl = "http://www.qiushibaike.com/article/%s"
- page = 0
- while True:
- getFromCustomer = raw_input("next page ? print Enter key to continue or 'exit' to stop\n")
- if getFromCustomer == "exit":
- break
- page += 1
- urlJoin = url % page
- print urlJoin
- articlePage = getContentOrComment(urlJoin)
- # print content
- # 把评论一起抠出来,文章郑泽匹配
- # group[0]是文章id
- # group[1]是文章内容
- # group[2]是注释
- rexArticle = r'<div[\s]+class\="article[\s]+block[\s]+untagged[\s]+mb15"[\s]+id\=\'qiushi_tag_(\d+?)\'\>[' \
- r'\s\S]*?\<div[' \
- r'\s]+class\="content"\>[\s]*([\S]+?)[\s]*(\<!--\d*--\>)*[\s]*\</div\>'
- rexComment = r'\<span[\s]+class\="body"\>([^\n{][\s\S]+?)\</span\>'
- patternArticle = re.compile(rexArticle)
- patternComment = re.compile(rexComment)
- for string in re.findall(patternArticle, articlePage):
- articleId = string[0]
- articleContent = string[1]
- commentTimes = 1
- print '\n'
- print articleContent
- commentPage = getContentOrComment(commentUrl % articleId)
- for comment in re.findall(patternComment, commentPage):
- print " ", commentTimes, "楼回复:", comment
- commentTimes += 1
- print '\n'
复制代码
|
|