|
# -*- coding: utf-8 -*-
import scrapy


class TestSpider(scrapy.Spider):
    """Minimal spider that fetches one Lagou company page with a plain GET."""

    name = 'test'

    def start_requests(self):
        """Yield the single GET request for the company page.

        The request carries a browser-like User-Agent and a hand-written
        Cookie header; the response is handed to ``parse_page``.
        """
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
            'Cookie': '_putrc=4BDD5841CFB6CF89; login=true;',
        }
        company_page = scrapy.Request(
            url='https://www.lagou.com/gongsi/8523.html',
            callback=self.parse_page,
            headers=request_headers,
        )
        yield company_page

    def parse_page(self, response):
        """Print the body text of the downloaded page."""
        page_text = response.text
        print(page_text)
上面这段代码使用 GET 方式请求，可以正常获取网页内容。
# -*- coding: utf-8 -*-
import json

import scrapy


class LagSpider(scrapy.Spider):
    """Query Lagou's position-search endpoint, then fetch a company page.

    The spider POSTs a search form, reads the first position from the JSON
    reply, builds that position's company URL and requests it with GET.
    """

    name = 'lag'
    allowed_domains = ['www.lagou.com']
    # Class-level page counter; only referenced by the pagination TODO in
    # ``parse_page`` and not otherwise read by this class.
    pn = 1

    def start_requests(self, pn=1):
        """Yield the POST request that queries the position-search endpoint.

        Args:
            pn: Page number sent in the form data (defaults to the first page).
        """
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.2; Win64; x64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729)",
            'Referer': "https://www.lagou.com/jobs/list_%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8?labelWords=&fromSearch=true&suginput=",
        }
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
        yield scrapy.FormRequest(
            url=url,
            formdata={
                'first': 'true',
                'pn': str(pn),
                'kd': '网络安全',
            },
            callback=self.parse_page,
            headers=headers,
        )

    def parse_page(self, response):
        """Parse the search reply and request the first result's company page.

        Logs a warning and yields nothing when the reply is not JSON, has no
        ``content`` section, or contains no positions, instead of crashing
        with ``ValueError`` / ``KeyError`` / ``IndexError``.
        """
        print('11111111111111')
        try:
            payload = json.loads(response.text)
        except ValueError:
            self.logger.warning('Search reply from %s is not valid JSON', response.url)
            return

        content = payload.get('content') if isinstance(payload, dict) else None
        if not content:
            # Presumably an error/throttling payload from the site; verify
            # against a captured response.
            self.logger.warning('Search reply has no "content" section: %r', payload)
            return

        # TODO: pagination is not wired up yet. The intended condition was
        # pageNo * pageSize < positionResult.totalCount, bumping self.pn.
        positions = content.get('positionResult', {}).get('result') or []
        if not positions:
            self.logger.warning('Search reply contains no positions')
            return

        first_position = positions[0]
        lagou_company_url = 'https://www.lagou.com/gongsi/' + \
            str(first_position['companyId']) + '.html'
        print(lagou_company_url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
            'Cookie': '_putrc=4BDD5841CFB6CF89; login=true;',
        }
        yield scrapy.Request(
            lagou_company_url,
            callback=self.parse_company,
            headers=headers,
            method='GET',
            # Scrapy's cookie middleware otherwise merges the cookies stored
            # from the search reply into this request and does not honour the
            # hand-written Cookie header above. Opting out keeps this request
            # equivalent to a standalone GET with the same headers.
            # NOTE(review): likely cause of the "wrong content" response;
            # confirm by comparing the request headers actually sent.
            meta={'dont_merge_cookies': True},
        )
        print('3333333333333333333')

    def parse_company(self, response):
        """Print the company page body followed by a debug marker."""
        print(response.text)
        print('22222222222222222')
这段代码先通过 POST 请求获取数据，再利用返回的内容拼接 URL，然后用 Request 请求该网页，但得到的响应内容却不正确。
不知道这是什么问题，望解答。
|
|