|
# -*-coding:utf-8-*-
"""Fetch one page through a cookie-aware urllib2 opener and print its <body>.

Python 2 script. Flow: build browser-like headers and a form-encoded
credential payload, send a first request through a cookie-aware opener,
re-request the same URL with that opener (and therefore the same cookie jar),
then parse the returned HTML with BeautifulSoup and print every <body> tag.
"""
import urllib2  # web scraping here = reading the resource a URL points at from the network stream
import sys  # interpreter arguments, input/output
import time
from bs4 import BeautifulSoup  # parses, traverses and maintains the "tag tree"
import cookielib
import re
import urllib
# Library needed for writing Excel spreadsheets.
from openpyxl import Workbook  # openpyxl reads/writes Excel workbooks (.xlsx, Excel 2007 and later)

# reload() is a builtin that takes an already-imported module object, not a
# name. Python 2's site module removes sys.setdefaultencoding at startup;
# reloading sys makes it available again.
reload(sys)
sys.setdefaultencoding('utf_8')  # set the interpreter-wide default encoding (once is enough)
print(sys.version)  # print the interpreter version; parenthesised form prints the same on Python 2

# Browser-like request headers.
# NOTE(review): urllib2 does not transparently decompress responses. If the
# (masked) Accept-Encoding value advertises gzip/deflate, the body read below
# may be compressed bytes rather than HTML -- confirm.
headers = {
    'Accept':'******+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding':'******',
    'Accept-Language':'*******',
    'Connection':'keep-alive',
    'Host':'*******',
    'User-Agent':'*******; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) ********15A372 Safari/604.1'
}

# Form fields sent with the first request.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file kept out of version control.
data = {
    "username":"******",
    "password":"*******"
}
post_data = urllib.urlencode(data)

# Opener that records cookies from every response in cookieJar and replays
# them on later requests made through the same opener.
cookieJar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))

#urllater = ['K3', 'SSC', 'SYX5','FC3D','PL35','KL8','PK10', 'LHC', 'BRNN',********,'BENZBMW','SICBO']
#urlfront = '*********'
# Simulate a browser request (with cookies) and print the HTML.
#for i in range(len(urllater)):
#    urlinter = urlfront + str(urllater[i])
#    print(urlinter)
urlinter = '*********'

# First request: carries the form-encoded credentials and the custom headers.
# NOTE(review): the method is overridden to HEAD even though a body is
# attached; presumably this is meant to be a login POST -- confirm against the
# target site before changing it.
req = urllib2.Request(urlinter, post_data, headers=headers)
req.get_method = lambda: 'HEAD'
login_response = opener.open(req)
# Only the cookies captured by the opener matter here; release the response
# instead of leaving it open.
login_response.close()

# Second request: plain open of the same URL through the same opener, so any
# cookies stored by the first request are sent along.
result = opener.open(urlinter)
try:
    time.sleep(1)
    html = result.read()
finally:
    # Always release the connection, even if the read fails.
    result.close()
print(html)

# The second argument selects the parser; if omitted BeautifulSoup picks one
# automatically.
bs = BeautifulSoup(html, 'html.parser', from_encoding='utf_8')
alllist1 = bs.find_all('body')  # find every <body> tag
print(alllist1)
复制代码
|
|