I had already removed duplicate values from the earlier collection of urls by turning it into a set.
test1.item_info5.ensure_index('url', unique=True)  # creates a unique index on the url field of the item_info5 collection in the test1 database
Checking the indexes with list(test1.item_info5.index_information()) now returns [u'_id_', u'url_1'].
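For reference, here is the same setup in the non-deprecated spelling (ensure_index was deprecated in favor of create_index in PyMongo 3.x); a minimal sketch using the database and collection names above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
test1 = client['test1']

# create_index is a no-op if an identical index already exists
test1.item_info5.create_index('url', unique=True)

# index_information() maps index names to their details,
# e.g. [u'_id_', u'url_1'] once the url index exists
print list(test1.item_info5.index_information())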
Yet when I run the program I still hit the original error: pymongo.errors.DuplicateKeyError: E11000 duplicate key error collection: test1.item_info5 index: _id_ dup key: { : ObjectId('586dc164cc47800760677492') }. The code I am using is below:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
test1 = client['test1']
item_info = test1['item_info5']

def get_item_info(url):
    if url == 'http://jump.zhineng.58.com/jump':
        pass
    else:
        wb_data = requests.get(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        global dict  # note: this shadows the built-in dict
        if soup.find_all(attrs={'class': "soldout_btn"}):
            print 'soldout'
        else:
            titles = soup.select('h1.info_titile')
            prices = soup.select('div.price_li > span.price_now > i')
            # date = soup.select('.time')[0].text
            areas = soup.select('div.palce_li > span > i')
            for title, price, area in zip(titles, prices, areas):
                dict = {
                    'title': title.get_text(),
                    'price': int(price.text),
                    'area': area.text,
                    'url': url
                }
                item_info.insert_one(dict)
                print dict
# The code above is saved in pages_parsing.py
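One detail worth noting: the traceback complains about the _id_ index, not url_1. PyMongo's insert_one adds an _id field to the dict it is handed, so re-inserting the same (module-global) dict object collides on _id_ regardless of the url index; that may be what is happening here. A sketch of the insert step that avoids this, using a fresh local dict and catching the duplicate-url case instead of crashing (titles, prices, areas, url, and item_info are the names from the code above):

from pymongo.errors import DuplicateKeyError

for title, price, area in zip(titles, prices, areas):
    data = {  # a fresh local dict for every insert
        'title': title.get_text(),
        'price': int(price.text),
        'area': area.text,
        'url': url,
    }
    try:
        item_info.insert_one(data)
    except DuplicateKeyError:
        # the unique index on 'url' rejected a repeat page; skip it
        print 'duplicate url skipped: ' + url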
----------------------------------------------------------------------
# -*- coding: utf-8 -*-
from multiprocessing import Pool
from pages_parsing import get_item_info, item_info, get_links_from
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']
links = ceshi['link1']
test1 = client['test1']
item_info = test1['item_info5']
test1.item_info5.ensure_index('url', unique=True)
url_db1 = [item['url'] for item in links.find()]  # filtered urls, no duplicates

if __name__ == '__main__':
    pool = Pool(processes=2)  # create the pool once, with two worker processes
    pool.map(get_item_info, url_db1)
    pool.close()
    pool.join()
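If the goal is simply to keep one document per url, an upsert sidesteps DuplicateKeyError altogether; a minimal sketch under that assumption, where data is the scraped dict built inside get_item_info:

item_info.update_one(
    {'url': url},      # match on the uniquely indexed field
    {'$set': data},    # update the scraped fields, or insert them
    upsert=True        # insert when no document with this url exists
)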