Python3爬虫爬取淘宝商品数据

anni · 发表于 2017-12-5 13:34:58

import requests
import re
from xlwt import Workbook
import xlrd
import time

def key_name( number ):
#获取页面的内容并返回
name = '手机'
URL_1 = "https://s.taobao.com/search?ie=utf8&initiative_id=staobaoz_20170905&stats_click=search_radio_all%3A1&js=1&imgfile=&q="
URL_2 = "&suggest=0_1&_input_charset=utf-8&wq=u&suggest_query=u&source=suggest&p4ppushleft=5%2C48&s="
URL = ( URL_1 + name + URL_2 + str(number))
#print(URL)
res = requests.get( URL )
return res.text

def find_date( text):
#根据整个页面的信息，获取商品的数据所在的HTML源码并放回
reg = r',"data":{"spus":\[({.+?)\]}},"header":'
reg = re.compile(reg)
info = re.findall(reg, text)
return info[0]

def manipulation_data( info, N, sheet ):
#解析获取的HTML源码，获取数据
Date = eval(info)

for d in Date:
      T = " ".join([t['tag'] for t in d['tag_info']])
      #print(d['title'] + '\t' + d['price'] + '\t' + d['importantKey'][0:len(d['importantKey'])-1] + '\t' + T)

      sheet.write(N,0,d['title'])
      sheet.write(N,1,d['price'])
      sheet.write(N,2,T)
      N = N + 1
return N

def main():

book = Workbook()
sheet = book.add_sheet('淘宝手机数据')
sheet.write(0,0,'品牌')
sheet.write(0,1,'价格')
sheet.write(0,2,'配置')
book.save('淘宝手机数据.xls')
#k用于生成链接，每个链接的最后面的数字相差48.
#N用于记录表格的数据行数，便于写入数据
k = 0
N = 1
for i in range(10+1):
      text = key_name( k + i * 48 )
      info = find_date(text)
      N = manipulation_data( info ,N, sheet )

      book.save('淘宝手机数据.xls')
      print('下载第' + str(i) + '页完成')

if __name__ == '__main__':
main()

这是从网上找的代码  之前很成功  现在却出现错误了：
C:\Users\Administrator\AppData\Local\Programs\Python\Python35\python.exe F:/Python代码/爬虫/c.py
Traceback (most recent call last):
  File "F:/Python代码/爬虫/c.py", line 63, in <module>
main()
  File "F:/Python代码/爬虫/c.py", line 55, in main
info = find_date(text)
  File "F:/Python代码/爬虫/c.py", line 24, in find_date
return info[0]
IndexError: list index out of range

大神帮忙看看  原因在哪儿

剑心无痕 · 发表于 2017-12-5 16:17:41

def find_date( text):
#根据整个页面的信息，获取商品的数据所在的HTML源码并放回
reg = r',"data":{"spus":\[({.+?)\]}},"header":'
reg = re.compile(reg)
info = re.findall(reg, text)
if info == []:
print(text) # 当报错的时候，查看报错的txt是什么，再去查看原因是为什么
return info[0]

anni · 发表于 2017-12-5 16:43:35

有没有可能没抓到数据的原因可能那个网页源代码发生了变化

anni · 发表于 2017-12-6 15:08:35

剑心无痕发表于 2017-12-5 16:17
def find_date( text):
#根据整个页面的信息，获取商品的数据所在的HTML源码并放回
reg = r',"data ...

应该是网页的原因我重新改了一下：import requests
import re
from xlwt import Workbook
import xlrd
import time
def key_name(number):
# 获取页面的内容并返回
name = '手机'
URL_1 = "https://s.taobao.com/search?ie=utf8&initiative_id=staobaoz_20170905&stats_click=search_radio_all%3A1&js=1&imgfile=&q="
URL_2 = "&suggest=0_1&_input_charset=utf-8&wq=u&suggest_query=u&source=suggest&p4ppushleft=5%2C48&s="
URL = (URL_1 + name + URL_2 + str(number))
print(URL)
res = requests.get(URL) #requests的 get 返回值是一个 Response 对象，这个对象有很多属性text、encoding、status_code、links等，
return res.text #这个对象的text属性即（resp.text）包含了相应的HTML文本，我们想要爬的数据就是在这里啦，encoding是显示resp.text的编码，也可以修改编码
def find_date(text):
# 根据整个页面的信息，获取商品的数据所在的HTML源码并放回
reg = r'"auctions":\[({.+?\})],"recommendAuctions":'
reg = re.compile(reg) #函数re.compile将正则表达式（以字符串书写的）转换为模式对象，可以实现更加有效的匹配
info = re.findall(reg, text)  #info是一个list
return info[0]  #info[0]是一个string
def manipulation_data(info, N, sheet):
# 解析获取的HTML源码，获取数据
# eval函数，用于计算字符串表达式的值
Date = eval(info)
for d in Date:
      sheet.write(N, 0, d['raw_title'])
      sheet.write(N, 1, d['view_price'])
      sheet.write(N, 2, d['view_sales'])
      sheet.write(N, 3, d['nick'])
      sheet.write(N, 4, d['item_loc'])
      N = N + 1
return N
def main():
book = Workbook()
sheet = book.add_sheet('淘宝手机数据')
sheet.write(0, 0, '手机标题')
sheet.write(0, 1, '价格')
sheet.write(0, 2, '付款人数')
sheet.write(0, 3, '专卖店')
sheet.write(0, 4, '发货地')
# N用于记录表格的数据行数，便于写入数据
k = 0
N = 1
for i in range(0,11):
      text = key_name(k+i * 48)
      info = find_date(text)
      N = manipulation_data(info, N, sheet)
      book.save('淘宝手机数据.xls')
      print('下载第' + str(i) + '页完成')
if __name__ == '__main__':
main()
但是出现以下的错误：C:\Users\Administrator\AppData\Local\Programs\Python\Python35\python.exe F:/Python代码/爬虫/抓取淘宝页面参数到表格.pyhttps://s.taobao.com/search?ie=u ... amp;imgfile=&q=手机&suggest=0_1&_input_charset=utf-8&wq=u&suggest_query=u&source=suggest&p4ppushleft=5%2C48&s=0<class 'list'>Traceback (most recent call last):  File "F:/Python代码/爬虫/抓取淘宝页面参数到表格.py", line 51, in <module> main()  File "F:/Python代码/爬虫/抓取淘宝页面参数到表格.py", line 47, in main N = manipulation_data(info, N, sheet)  File "F:/Python代码/爬虫/抓取淘宝页面参数到表格.py", line 24, in manipulation_data Date = eval(info)  File "<string>", line 1, in <module>NameError: name 'true' is not defined
这又是为什么呢

		自动登录	找回密码
密码			立即注册

[求助] Python3爬虫爬取淘宝商品数据

热心会员

默默耕耘

优秀版主