|
应该是网页的原因 我重新改了一下:import requests
import re
from xlwt import Workbook
import xlrd
import time
def key_name(number):
# 获取页面的内容并返回
name = '手机'
URL_1 = "https://s.taobao.com/search?ie=utf8&initiative_id=staobaoz_20170905&stats_click=search_radio_all%3A1&js=1&imgfile=&q="
URL_2 = "&suggest=0_1&_input_charset=utf-8&wq=u&suggest_query=u&source=suggest&p4ppushleft=5%2C48&s="
URL = (URL_1 + name + URL_2 + str(number))
print(URL)
res = requests.get(URL) #requests的 get 返回值是一个 Response 对象,这个对象有很多属性text、encoding、status_code、links等,
return res.text #这个对象的text属性即(resp.text)包含了相应的HTML文本,我们想要爬的数据就是在这里啦,encoding是显示resp.text的编码,也可以修改编码
def find_date(text):
# 根据整个页面的信息,获取商品的数据所在的HTML源码并放回
reg = r'"auctions":\[({.+?\})],"recommendAuctions":'
reg = re.compile(reg) #函数re.compile将正则表达式(以字符串书写的)转换为模式对象,可以实现更加有效的匹配
info = re.findall(reg, text) #info是一个list
return info[0] #info[0]是一个string
def manipulation_data(info, N, sheet):
# 解析获取的HTML源码,获取数据
# eval函数,用于计算字符串表达式的值
Date = eval(info)
for d in Date:
sheet.write(N, 0, d['raw_title'])
sheet.write(N, 1, d['view_price'])
sheet.write(N, 2, d['view_sales'])
sheet.write(N, 3, d['nick'])
sheet.write(N, 4, d['item_loc'])
N = N + 1
return N
def main():
book = Workbook()
sheet = book.add_sheet('淘宝手机数据')
sheet.write(0, 0, '手机标题')
sheet.write(0, 1, '价格')
sheet.write(0, 2, '付款人数')
sheet.write(0, 3, '专卖店')
sheet.write(0, 4, '发货地')
# N用于记录表格的数据行数,便于写入数据
k = 0
N = 1
for i in range(0,11):
text = key_name(k+i * 48)
info = find_date(text)
N = manipulation_data(info, N, sheet)
book.save('淘宝手机数据.xls')
print('下载第' + str(i) + '页完成')
if __name__ == '__main__':
main()
但是出现以下的错误:C:\Users\Administrator\AppData\Local\Programs\Python\Python35\python.exe F:/Python代码/爬虫/抓取淘宝页面参数到表格.pyhttps://s.taobao.com/search?ie=u ... amp;imgfile=&q=手机&suggest=0_1&_input_charset=utf-8&wq=u&suggest_query=u&source=suggest&p4ppushleft=5%2C48&s=0<class 'list'>Traceback (most recent call last): File "F:/Python代码/爬虫/抓取淘宝页面参数到表格.py", line 51, in <module> main() File "F:/Python代码/爬虫/抓取淘宝页面参数到表格.py", line 47, in main N = manipulation_data(info, N, sheet) File "F:/Python代码/爬虫/抓取淘宝页面参数到表格.py", line 24, in manipulation_data Date = eval(info) File "<string>", line 1, in <module>NameError: name 'true' is not defined
这又是为什么呢
|
|