|
- from selenium import webdriver
- from selenium.common.excepti** import TimeoutException
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditi** as EC
- from selenium.webdriver.support.wait import WebDriverWait
- from pyquery import PyQuery as pq
- from pymongo import MongoClient
- #创建mogodb数据对象
- client=MongoClient()
- db=client['taobao']
- collection=db['taobao']
- browser=webdriver.Chrome()
- wait=WebDriverWait(browser,10)
- max_page=100
- def index_page():
- try:
- browser.get('https://www.taobao.com')
- input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
- submit = wait.until(
- EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
- input.send_keys('ipad')
- submit.click()
- print('正在爬取第', page, '页')
- get_products()
- num = browser.find_element_by_link_text('下一页')
- num.click()
- except TimeoutException:
- print('time out!')
- return index_page()
- # 提取商品数据
- def get_products():
- wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'mainsrp-itemlist.items.item')))
- html=browser.page_source
- doc=pq(html)
- items=doc('#mainsrp-itemlist.items.item').items()
- for item in items:
- product={
- 'image': item.find('.pic a img').attr('data-src'),
- 'price': item.find('.price').text(),
- 'deal': item.finc('.deal-cnt').text(),
- 'title': item.find('.title').text(),
- 'shop': item.find('.shop').text(),
- 'location': item.find('.location').text()
- }
- print(product)
- save_to_mogo(product)
- #保存到mogodb
- def save_to_mogo(result):
- try:
- if collection.insert(result):
- print('保存成功',result)
- except Exception:
- print('保存失败',result)
- if __name__=='__main__':
- for page in range(2, max_page + 1):
- index_page(page)
复制代码 自己想爬取淘宝上有关“ipad"商品的信息,这是自己根据别人的代码修改的,但是自己修改后,为何总是显示time out呢,求助。
|
|