|
# !/usr/bin/env python
# _*_ coding:utf-8 _*_
import requests
import re
url = 'http://www.jjwxc.net/onebook.php?novelid=2518'
resp**e = requests.get(url)
resp**e.encoding = 'gb2312'
html = resp**e.text
title = re.findall(r'<title>《(.*?)》雪牛_【衍生小说|纯爱小说】_晋江文学城</title>',html,re.S)[0
fb = open('%s.txt' % title, 'w', encoding='utf-8')
re_table = r'<tbody>(.*?)</tbody>'
tbody = re.findall(re_table, html, re.S|re.M)[11
chapter_info_list = re.findall(r'href="(.*?)">(.*?)<',tbody, re.S)[3
# print(chapter_info_list)
for chapter_info in chapter_info_list:
chapter_url, chapter_title = chapter_info
# print(chapter_url, chapter_title)
chapter_requests = requests.get(chapter_url)
chapter_requests.encoding = 'gb2312'
chapter_html = chapter_requests.text
# 提取章节内容
infoz = r' <div align="center" style="float:left;width:713px;padding-left: 0px; padding-top:14px;font-size:16px;">(.*?)<div id="favoriteshow_3" style="display:none" align="center">'
chapter_div = re.findall(infoz, str(chapter_html), re.S|re.M) # 1行 # chapter_div = re.findall(infoz, str(chapter_html), re.S|re.M)[0] # 2行 # 清洗数据,先将空格换成空
chapter_div = chapter_div.replace(' ', '') # 3行
print(chapter_div)
exit()
红字的这三行有疑问。(运行结果有截图,已上传图片附件)问题1:在2、3行注释后,运行正常,能提取文章的全部内容,包括html元素、空格等。问题2:在2行注解后,运行报错,上网搜索了报错没找到原因。问题3:在1行注解后,运行正常,空格去除了,但文章内容只留下了开头跟结尾。
|
|