|
我用 BeautifulSoup 来获取某个英文单词在 thesaurus.com 网站上的同义词,代码如下:
import sys
from bs4 import BeautifulSoup
import urllib
import urllib.request
import urllib.parse
import gzip
def open_url(url, data=None, proxy=False, method='GET', cookie='', browser_args=None):
    """Fetch *url* with browser-like headers and return the body as text.

    Args:
        url: Absolute URL to request.
        data: Optional mapping (or sequence of pairs) of form fields. It is
            URL-encoded and sent as the request body.
        proxy: When true, route the request through the hard-coded HTTP
            proxy at 192.168.8.108:1080.
        method: HTTP method to use.
        cookie: Raw cookie string; sent in the ``Cookie`` header when
            non-empty.
        browser_args: Optional iterable of extra ``(name, value)`` header
            pairs. They are added after the defaults, so a pair with the
            same name as a default header overrides it.

    Returns:
        The response body decoded as UTF-8. Gzip-compressed bodies are
        decompressed first.

    Raises:
        urllib.error.URLError: If the request itself fails.
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
    """
    netloc = urllib.parse.urlparse(url).netloc
    headers = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
        # Only advertise gzip: it is the only encoding decoded below, so
        # offering deflate/sdch/br would invite replies we cannot read.
        ('Accept-Encoding', 'gzip'),
        ('Referer', 'http://%s/' % netloc),
        ('authority', netloc),
    ]
    if cookie:
        # The request header is named "Cookie" (singular).
        headers.append(('Cookie', cookie))
    if browser_args:
        headers.extend(browser_args)

    # Attach a form body only when there is something to send, or when the
    # method is one that normally carries a body; a plain GET stays bodiless.
    if data or method.upper() not in ('GET', 'HEAD'):
        body = urllib.parse.urlencode(data or {}).encode('utf-8')
    else:
        body = None

    req = urllib.request.Request(url, data=body, method=method)
    for name, value in headers:
        req.add_header(name, value)
    if proxy:
        req.set_proxy('192.168.8.108:1080', 'http')

    # Close the connection deterministically once the body has been read.
    with urllib.request.urlopen(req) as response:
        raw = response.read()

    # Gzip streams start with the magic bytes 1f 8b; anything else is
    # treated as an uncompressed body.
    if raw[:2] == b'\x1f\x8b':
        raw = gzip.decompress(raw)
    return raw.decode('utf-8')
def main(argv=None):
    """Print the thesaurus.com synonyms of the word given on the command line.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first item is the word to look up.

    Exits with an error message when no word is given, or when the page
    does not contain the ``<div class="relevancy-block">`` element this
    scraper relies on.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('usage: python <script> WORD')
    word = args[0]

    # Quote the word so spaces and non-ASCII characters still form a valid URL.
    html = open_url('http://www.thesaurus.com/browse/%s?s=t' % urllib.parse.quote(word))
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns None when no matching element exists, e.g. when the
    # word has no entry or the markup does not use this class name. Calling
    # find_all() on that None is what raised the AttributeError.
    synonyms = soup.find('div', {'class': 'relevancy-block'})
    if synonyms is None:
        sys.exit('no <div class="relevancy-block"> found for %r: the word may '
                 'have no entry, or the page markup no longer uses that class'
                 % word)

    for span in synonyms.find_all('span', {'class': 'text'}):
        print(span.text)


if __name__ == '__main__':
    main()
之前一直没问题。这段代码两个月没跑,现在突然报错:AttributeError: 'NoneType' object has no attribute 'find_all'。
不知道是什么原因,求助。
|
|