|
现在的代码如下，再往下不知道怎么写了。
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
import csv
# Listing-page URLs: the offsets 30, 60, ..., 900 are substituted into the
# ``oa{}`` segment of the address.
url = [
    'http://www.tripadvisor.cn/Restaurants-g654326-oa{}-Sakai_Osaka_Prefecture_Kinki.html'.format(offset)
    for offset in range(30, 930, 30)
]
def meishizhuaqu(lianjie):
    """Scrape restaurant records reachable from the given listing pages.

    Every URL in ``lianjie`` is downloaded and parsed; each anchor matched by
    ``div.shortSellDetails > h3 > a`` is followed to a detail page, and the
    fields selected from that page are combined into record dicts.

    Args:
        lianjie: Iterable of listing-page URLs.

    Returns:
        A list of dicts with the keys '标题', '地址', '电话', '坐标' and
        '分类', in the order the pages were visited. The list is empty when
        ``lianjie`` is empty or nothing matched.
    """
    # Compiled once instead of once per detail page. Captures whatever sits
    # between "CurrentCenter" and "signature" in the page text (presumably
    # the map coordinates -- confirm against a live page).
    center_pattern = re.compile('CurrentCenter(.*?)signature', re.S)

    records = []
    for singleurl in lianjie:
        listing_page = requests.get(singleurl)
        listing_soup = BeautifulSoup(listing_page.text, 'lxml')

        # hrefs are site-relative, so prefix the host to get a fetchable URL.
        detail_urls = [
            "http://www.tripadvisor.cn" + anchor["href"].strip()
            for anchor in listing_soup.select('div.shortSellDetails > h3 > a ')
        ]

        for detail_url in detail_urls:
            detail_page = requests.get(detail_url)
            detail_soup = BeautifulSoup(detail_page.text, 'lxml')

            titles = detail_soup.select('#HEADING')
            addresses = detail_soup.select('div > address')
            phones = detail_soup.select(
                'div.contact_info > div > div:nth-of-type(1) > div')
            categories = detail_soup.select(
                '#HEADING_GROUP > div > div.heading_ratings > div:nth-of-type(2) > span > div > a')
            page_text = detail_soup.get_text().replace('\n', '')
            centers = center_pattern.findall(page_text)

            # zip stops at the shortest sequence, so a page on which any one
            # of the five lookups finds nothing contributes no record.
            for title, address, phone, center, category in zip(
                    titles, addresses, phones, centers, categories):
                records.append({
                    '标题': title.get_text().replace('\n', ''),
                    '地址': address.get_text().replace('\n', ''),
                    '电话': phone.get_text().replace('\n', '').replace('+', '('),
                    '坐标': center.replace('.png|', '').replace('&', ''),
                    '分类': category.get_text().replace('\n', ''),
                })
    return records
|
|