|
- __author__ = 'KS'
- # -*- coding: utf-8 -*-
- import urllib2
- import re
- from bs4 import BeautifulSoup
def getContentOrComment(urlJoin):
    """Download a page and return its body as unicode text.

    Args:
        urlJoin: Absolute URL of the page to fetch.

    Returns:
        The response body decoded as UTF-8. Bytes that are not valid UTF-8
        are silently dropped (``'ignore'`` error handler).

    Raises:
        urllib2.URLError: If the request cannot be completed
            (``urllib2.HTTPError`` for HTTP error status codes).
    """
    # Send a browser-like User-Agent instead of urllib2's default one.
    # NOTE(review): presumably the site filters the default agent -- confirm.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url=urlJoin, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read().decode('utf-8', 'ignore')
    finally:
        # Always release the connection; this function is called once per
        # article, so unclosed responses would pile up open sockets.
        response.close()
- # 文章地址
- articleUrl = "http://www.qiushibaike.com/textnew/page/%d"
- # 评论地址
- commentUrl = "http://www.qiushibaike.com/article/%s"
- page = 0
- while True:
- getFromCustomer = raw_input("next page ? print Enter key to continue or 'exit' to stop\n")
- if getFromCustomer == "exit":
- break
- page += 1
- urlJoin = articleUrl % page
- print urlJoin
- articlePage = getContentOrComment(urlJoin)
- soupArticle = BeautifulSoup(articlePage, 'html.parser')
- # print soup.prettify()
- articleFloor = 1
- for string in soupArticle.find_all(attrs="article block untagged mb15"):
- commentId = str(string.get('id')).strip()[11:]
- print "\n"
- print articleFloor, ".", string.find(attrs="content").get_text().strip()
- articleFloor += 1
- commentPage = getContentOrComment(commentUrl % commentId)
- soupComment = BeautifulSoup(commentPage, 'html.parser')
- commentFloor = 1
- for comment in soupComment.find_all(attrs="body"):
- print " ", commentFloor, "楼回复:", comment.get_text().strip()
- commentFloor += 1
复制代码
|
|