|
# --- Douban login: post credentials, keep cookies, solve captcha if shown ---
from urllib import request, parse
from http import cookiejar
import re
import socket
import time
from bs4 import BeautifulSoup

# Login form target and payload (credentials redacted in the original post).
main_url = 'https://accounts.douban.com/login?source=movie'
formdata = {
    "form_email": "*********@qq.com",
    "form_password": "****",
    "source": "movie",
    "redir": "https://movie.douban.com/subject/21937452/",
    "login": "登录",
}
user_agent = r'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36 Firefox/23.0'
# BUG FIX: the original key was misspelled 'User-Agnet', so the real
# User-Agent header was never sent and the site saw the Python default.
headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

# Build an opener with a CookieJar so the session cookie from the login
# response is reused on every later request.
cookie = cookiejar.CookieJar()
cookie_support = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(cookie_support)

# First login attempt: encode the form, POST it, read the response page.
logingpostdata = parse.urlencode(formdata).encode('utf-8')
req_ligin = request.Request(url=main_url, data=logingpostdata, headers=headers)
response_login = opener.open(req_ligin).read().decode('utf-8')

# If the response contains a captcha image, download it, ask the user to
# type it in, and re-submit the form with the captcha fields added.
soup = BeautifulSoup(response_login, "html.parser")
captcha_img = soup.find('img', id='captcha_image')
if captcha_img:
    captchaAddr = captcha_img['src']
    # Read the hidden captcha-id from the parsed page instead of the
    # original brittle regex (which broke on any attribute reordering).
    captcha_input = soup.find('input', attrs={'name': 'captcha-id'})
    if captcha_input is not None:
        captchaID = captcha_input['value']
    else:
        # Fallback: original regex, kept in case the markup differs.
        found = re.findall(r'name="captcha-id" value="(.*?)"', response_login)
        captchaID = found[0] if found else ''
    # NOTE(review): urlretrieve bypasses `opener`, so the image request
    # carries no cookies — Douban served it anyway at the time.
    request.urlretrieve(captchaAddr, "captcha.jpg")
    captcha = input('please input the captcha:')
    formdata['captcha-solution'] = captcha
    formdata['captcha-id'] = captchaID
    logingpostdata = parse.urlencode(formdata).encode('utf-8')
    req_ligin = request.Request(url=main_url, data=logingpostdata, headers=headers)
    response_login = opener.open(req_ligin).read().decode('utf-8')

# Global socket timeout so a stalled request raises socket.timeout,
# which the scrape loop below catches to retry.
timeout = 3
socket.setdefaulttimeout(timeout)
-
# --- Read reviewer profile URLs from an Excel sheet and print each username ---
import random
import pandas as pd

# Source workbook; sheet '好评' (positive reviews) has a column
# 用户主页 (user profile URL) — column name is data, do not translate.
xls_file = pd.ExcelFile('F:\\anaconda\\Scripts\\素媛.xlsx')
table1 = xls_file.parse('好评')
mes = table1.用户主页

for i in mes:
    url = i.strip()
    req_comment = request.Request(url=url, headers=headers)
    # Retry on timeout, but bounded: the original `while not state` loop
    # would spin forever if a host never responded.
    html = None
    for _attempt in range(5):
        try:
            html = opener.open(req_comment).read().decode('utf-8')
            break
        except socket.timeout:
            continue
    if html is None:
        # All retries timed out — same fallback output as a missing user.
        print(url + " " + "无")
        continue
    res = BeautifulSoup(html, "html.parser")
    try:
        a = res.find('div', class_='user-info')
        b = a.find('a').get_text()
        print(url + " " + b)
    except AttributeError:
        # `find` returned None (deleted/blocked account) — the original
        # bare `except:` also swallowed KeyboardInterrupt; narrowed here.
        print(url + " " + "无")
    # Random pause to avoid hammering the site.
    time.sleep(random.uniform(0, 3))

print("抓取完成")
|