|
想用bs4抓取网页上的表格，抓出来却是图里的样子，好多都是空值。
[img]file:///C:\Users\yangtongle\Documents\Tencent Files\1002495039\Image\C2C\%4XP8A4W[PDXCEL6U5$]%~U.png[/img]
目标网址是 http://bj.zhue.com.cn/list.php?sort=2&s_id=0&c_id=0&cou_id=0&city_id=0&mid=&lx=&page=2
以下是源码，哪位大神能给指点指点？
import time
import re
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import requests
# Address of the listing page that the script downloads.
url = (
    'http://bj.zhue.com.cn/list.php'
    '?sort=2&s_id=0&c_id=0&cou_id=0&city_id=0&mid=&lx=&page=2'
)

# HTTP headers passed to requests.get by get_html; only a desktop Chrome
# User-Agent string is set.
headers_data = {
    'User-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    ),
}
def get_html(url, encoding='gbk', timeout=10):
    """Download *url* and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding used to decode the body. Defaults to
            'gbk', the value this script has always forced on the response.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests can block indefinitely.

    Returns:
        The decoded page text, or None when the request fails or the server
        answers with an HTTP error status (the error is printed).
    """
    try:
        response = requests.get(url=url, headers=headers_data, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of handing an error page
        # to the parser as if it were the listing.
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return None
    # Override the encoding before .text is read so the body is decoded
    # with the requested charset.
    response.encoding = encoding
    return response.text
html = get_html(url)
# get_html returns None when the download fails; fall back to an empty
# document so the string replacement below cannot raise AttributeError.
if html is None:
    html = ''
# Remove bare <a> / </a> markers from the page text.
# NOTE(review): the call to down_show further down passes `html`, not
# `new_html` - confirm which of the two is meant to be parsed.
new_html = html.replace('<a>', '').replace('</a>', '')
def down_show(ulist, html):
    """Extract the rows of the 't_f' table from *html* and append them to *ulist*.

    Each <tr bgcolor="#ffffff"> inside <table class="t_f"> becomes one list
    of cell texts, appended to *ulist* in document order.

    Args:
        ulist: List that receives one list of strings per table row; it is
            modified in place.
        html: Page source to parse. Empty or None input adds nothing.

    Returns:
        None. Results are delivered through *ulist*.
    """
    if not html:
        print('down_show: no HTML to parse')
        return
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='t_f')
    if table is None:
        print("down_show: table with class 't_f' not found")
        return
    for tr in table.find_all('tr', attrs={'bgcolor': '#ffffff'}):
        # Iterate the cell tags only: looping over `tr` directly also yields
        # the whitespace text nodes that sit between the cells.
        cells = tr.find_all(['td', 'th'])
        # get_text() joins all text inside a cell. `.string` is None whenever
        # a cell has more than one child node, which is what produced the
        # empty values.
        ulist.append([cell.get_text(strip=True) for cell in cells])
urll = []
down_show(urll, html)
# Print rows 1..30, as before, but slice instead of indexing so a page with
# fewer than 31 rows no longer raises IndexError.
# NOTE(review): the original indentation was lost; the extra newline print
# is assumed to belong inside the loop - confirm.
for row in urll[1:31]:
    print(row)
    print('\n')
|
-
|