|
import requests
from bs4 import BeautifulSoup
class Downloader(object):
    """Download a full web novel from biqukan.com, chapter by chapter."""

    def __init__(self):
        # Site root; chapter hrefs on the index page are relative to this.
        self.server = 'https://www.biqukan.com/'
        # Index page of the novel "一念永恒" (A Will Eternal).
        self.target = 'https://www.biqukan.com/1_1094/'
        self.names = []  # chapter titles, filled by get_download_url()
        self.urls = []   # absolute chapter URLs, parallel to self.names
        self.nums = 0    # number of chapters found

    def get_download_url(self):
        """Fetch the index page and populate self.names, self.urls, self.nums.

        The leading <a> tags inside the 'listmain' div are site chrome and
        "latest chapters" shortcuts, so real chapters start at a[16:].
        (The original comment said "from the 15th link" — the code actually
        skips the first 16 anchors.)
        """
        req = requests.get(self.target)
        div_bf = BeautifulSoup(req.text, "lxml")
        div = div_bf.find_all('div', {'class': 'listmain'})
        a_bf = BeautifulSoup(str(div[0]), "lxml")
        a = a_bf.find_all('a')  # each chapter link is an <a> node
        self.nums = len(a[16:])
        for each in a[16:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Fetch one chapter page and return its body text.

        target: absolute URL of a chapter page.
        Returns the chapter text with the site's 8-NBSP paragraph
        indentation replaced by blank lines.
        """
        req = requests.get(target)
        bf = BeautifulSoup(req.text, "lxml")
        texts = bf.find_all('div', {'class': 'showtxt'})
        texts = str(texts[0].text.replace('\xa0'*8, '\n\n'))
        return texts

    def writer(self, name, path, text):
        """Append one chapter (title line + body + blank separator) to *path*."""
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')
def main():
    """Crawl every chapter of the novel and append it to a local file."""
    dl = Downloader()
    dl.get_download_url()
    print('《一念永恒》开始下载:')
    # Bug fix: the original passed the whole dl.names / dl.urls lists to
    # writer() and get_contents() instead of the i-th element, so
    # requests.get() received a list, the 'showtxt' div was never found,
    # and texts[0] raised IndexError (see the pasted traceback below).
    for i in range(dl.nums):
        dl.writer(dl.names[i], '一念永恒.text', dl.get_contents(dl.urls[i]))
    print('《一念永恒》下载完成!')


if __name__ == '__main__':
    main()
结果 (run output):

    D:\python3.6.1\python.exe D:/python保存库/spider/story_allspider.py
    《一念永恒》开始下载:
    Traceback (most recent call last):
      File "D:/python保存库/spider/story_allspider.py", line 67, in <module>
        main()
      File "D:/python保存库/spider/story_allspider.py", line 60, in main
        dl.writer(dl.names, '一念永恒.text', dl.get_contents(dl.urls))
      File "D:/python保存库/spider/story_allspider.py", line 41, in get_contents
        texts = str(texts[0].text.replace('\xa0'*8, '\n\n'))
    IndexError: list index out of range

    Process finished with exit code 1
|
|