|
__author__ = '打错一只北风'
from bs4 import BeautifulSoup
import requests
#url的兄弟姐妹
# Collect the sibling/related article links from a listing page.
def get_urls(url):
    """Fetch *url* and return a list of absolute article URLs.

    Scrapes every ``<a target="_blank">`` anchor from the page and
    prefixes its ``href`` with the site root.

    :param url: page to scrape (e.g. ``'http://www.jianshu.com/'``)
    :return: list of absolute URL strings
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    anchors = soup.find_all(target="_blank")
    # hrefs are site-relative like '/p/adf0509309d9' (see sample at file end),
    # so each one is joined onto the site root to form a full URL.
    hrefs = [a.get('href') for a in anchors]
    urls = ['http://www.jianshu.com' + str(href) for href in hrefs]
    print(urls)
    return urls
def url_open(url):
    """Download every page linked from *url* and append the raw HTML
    to the local cache file ``jianshu.html``.

    :param url: listing page whose linked articles should be fetched
    """
    urls = get_urls(url)
    with open('jianshu.html', 'ab') as file:
        for page_url in urls:
            html = requests.get(page_url).text
            # The file is opened in binary append mode, so the text
            # must be encoded before writing (writing str would raise
            # TypeError).
            file.write(html.encode('utf-8'))
# Parse the cached HTML to extract the target elements.
def get_mywilling(url):
    """Download the pages linked from *url*, then parse the cached
    ``jianshu.html`` and return all ``<p>`` tags.

    :param url: listing page passed through to :func:`url_open`
    :return: list of ``<p>`` tag objects found in the cached HTML
    """
    url_open(url)
    # Use a context manager so the handle is always closed, and read as
    # UTF-8 explicitly — the cache was written as UTF-8 by url_open.
    with open('jianshu.html', 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    return soup.find_all('p')
# Format the extracted elements and persist them to disk.
def save_mywilling():
    """Extract the target paragraphs and append them to ``D:jianshu.txt``.

    NOTE(review): reads the module-level global ``url`` (set in the
    ``__main__`` guard) — confirm this script is only run as a script,
    not imported, or the name will be undefined.
    """
    mywilling = get_mywilling(url)
    with open('D:jianshu.txt', 'ab') as file:
        # Binary append mode requires bytes.
        file.write(str(mywilling).encode('utf-8'))
        # (redundant explicit close() removed — the with-block closes it)
    print('成功收集一次!')
# Program entry point.
def main():
    """Announce startup and run one scrape-and-save pass."""
    print('爬虫启动中')
    save_mywilling()
# Module self-test: run the crawler against the jianshu front page.
if __name__ == '__main__':
    url = 'http://www.jianshu.com/'
    main()
    # get_urls(url)
'''
<a target="_blank" href="/p/adf0509309d9">15个小方法,专治各种不开心</a>
http://www.jianshu.com/p/adf0509309d9
'''
|
|