|
问题已经解决了，我把代码贴出来：
# -*- coding: utf-8 -*-
import requests as req
from lxml import etree
import csv
import time
# HTTP headers sent with every page request (presents a desktop-browser
# User-Agent instead of the default one used by the HTTP library).
header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

OUTPUT_PATH = 'douban_book_250.csv'
CSV_COLUMNS = ('book', 'writer', 'publisher', 'date', 'price', 'rate', 'reader', 'comment', 'cover')
PAGE_COUNT = 10            # number of list pages to fetch
BOOKS_PER_PAGE = 25        # entries per page; also the step of the "start" query parameter
REQUEST_TIMEOUT = 10       # seconds; without a timeout a stalled connection hangs forever
PLACEHOLDER = '空白'        # value written for a field that is missing on the page
BOOK_TABLE_XPATH = '//*[@id="content"]/div/div[1]/div/table[{}]'


def split_book_info(infos):
    """Split a '/'-separated info line into (writer, publisher, date, price).

    The fields are taken from the right-hand end of the line, so any extra
    leading fields are ignored. When the line has fewer than four fields the
    missing leading ones are filled with ``PLACEHOLDER``; this replaces the
    old fallback path that left the writer unassigned. Fields are returned
    exactly as split (surrounding whitespace is not removed).
    """
    fields = infos.split('/')[-4:]
    padding = [PLACEHOLDER] * (4 - len(fields))
    return tuple(padding + fields)


def parse_book(xml, j):
    """Extract one CSV row for the ``j``-th (1-based) book table of a page.

    ``xml`` is the parsed page returned by ``etree.HTML``. Returns a 9-tuple
    ordered like ``CSV_COLUMNS``. Raises ``IndexError`` when a mandatory node
    (anything except the one-line comment) is absent from the page.
    """
    base = BOOK_TABLE_XPATH.format(j)

    books = xml.xpath(base + '/tr/td[2]/div[1]/a/text()')[0].strip()

    infos = xml.xpath(base + '/tr/td[2]/p[1]/text()')[0].strip()
    writers, publisher, date, price = split_book_info(infos)

    ratings = xml.xpath(base + '/tr/td[2]/div[2]/span[2]/text()')[0].strip()
    # The reader count is wrapped in parentheses; drop them, then the padding.
    readers = xml.xpath(base + '/tr/td[2]/div[2]/span[3]/text()')[0].strip('()').strip()

    # The one-line comment is optional, so query once and test the result.
    comment = xml.xpath(base + '/tr/td[2]/p[2]/span/text()')
    comments = comment[0].strip() if comment else PLACEHOLDER

    imgs = xml.xpath(base + '/tr/td[1]/a/img/@src')[0].strip()

    return (books, writers, publisher, date, price, ratings, readers, comments, imgs)


def main():
    """Fetch every list page and write one CSV row per book to ``OUTPUT_PATH``.

    Each row is also echoed to stdout, one field per line.
    """
    # newline='' is what the csv module expects of the file object; the
    # ``with`` block closes the file even if a request or a parse step fails.
    with open(OUTPUT_PATH, 'w', encoding='utf-8', newline='') as f:
        doubanbook = csv.writer(f)
        doubanbook.writerow(CSV_COLUMNS)

        for i in range(PAGE_COUNT):
            print('--------------------现在开始爬取第' + str(i+1) + '页--------------------')
            url = 'https://book.douban.com/top250?start=' + str(BOOKS_PER_PAGE*i)
            response = req.get(url, headers=header, timeout=REQUEST_TIMEOUT)
            # Fail with the HTTP status rather than an obscure IndexError
            # while parsing an error page.
            response.raise_for_status()
            xml = etree.HTML(response.content)

            for j in range(1, BOOKS_PER_PAGE + 1):
                row = parse_book(xml, j)
                doubanbook.writerow(row)
                for value in row[:-1]:
                    print(value)
                print(row[-1] + '\n')

            # Short pause between consecutive page requests.
            time.sleep(0.5)
            print(3*'\n')

    print('OK')


if __name__ == '__main__':
    main()
|