这是我爬取国务院领导动态网页链接、标题、日期并录入MongoDB的源码:想问一下如何加入判断爬取的数据是否已录入数据库的语句,使得该程序可以重复运行爬取最新的数据,并且不会将之前爬取的数据录入MongoDB。
from bs4 import BeautifulSoup
import requests
import pymongo
import time
# One listing URL per result page (pages 0-9) of the State Council news index.
urls = ['http://sousuo.gov.cn/column/31250/{}.htm'.format(i) for i in range(10)]
client = pymongo.MongoClient('localhost', 27017)
guowuyuan = client['Guowuyuan']
news_info = guowuyuan['news_info']
# Unique index on 'title' so the same article can never be stored twice.
# create_index replaces ensure_index, which was deprecated and removed in PyMongo 3.x.
news_info.create_index('title', unique=True)
def get_news(url, data=None):
    """Scrape title/date/link entries from one listing page and upsert them
    into MongoDB so that re-running the crawler never stores duplicates.

    Parameters
    ----------
    url : str
        Listing-page URL to fetch.
    data : unused; kept only for backward compatibility with existing callers.

    Returns
    -------
    list[dict]
        One record dict per news item found on the page.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Title text and href come from the same <a> elements; the original code
    # selected them twice. Dates live in the sibling <span> elements.
    anchors = soup.select('div.content > div > div.news_box > div.list.list_1.list_2 > ul > li > h4 > a')
    dates = soup.select('div.content > div > div.news_box > div.list.list_1.list_2 > ul > li > h4 > span')
    time.sleep(2)  # be polite to the server between page fetches
    results = []
    for anchor, date in zip(anchors, dates):
        data = {
            'title': anchor.get_text(),
            'date': date.get_text(),
            'link': anchor.get("href"),
            "location": 'National'
        }
        # Upsert keyed on title: if the article is already in the collection
        # it is refreshed in place, otherwise inserted — so repeated runs only
        # add genuinely new items. (The original `return data` here exited the
        # loop after the FIRST item and never wrote anything to MongoDB.)
        news_info.update_one({'title': data['title']}, {'$set': data}, upsert=True)
        results.append(data)
    return results
# Crawl every listing page; get_news handles storing each article in MongoDB.
for single_url in urls:
data = get_news(single_url)