|
本帖最后由 tangxiaomin1996 于 2022-7-19 21:07 编辑
代码如下，请高手指教。
def getHTMLlines(htmlpath):
    """Read a UTF-8 encoded text file and return its lines.

    Args:
        htmlpath: Path of the file to read.

    Returns:
        A list with one string per line, line endings included, exactly
        as ``readlines()`` produces them. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the file even when reading raises.
    with open(htmlpath, 'r', encoding='utf-8') as f:
        return f.readlines()
def extractImageUrls(htmllist):
    """Collect the absolute link targets of all ``<a>`` tags in an HTML page.

    Despite its name, this function gathers ``href`` values of anchor
    tags, not image sources.

    The previous string-splitting approach took the text *before* the
    first double quote after ``href=``, which is empty for a normally
    quoted attribute, so quoted links were never returned. It also only
    looked at the last ``href=`` on each line. A real HTML parser is used
    instead, which handles single/double quotes, several anchors on one
    line, and tags that span multiple lines.

    Args:
        htmllist: Iterable of strings holding the page content in order
            (for example the lines returned by ``readlines()``).

    Returns:
        A list of ``href`` values, in document order, restricted to those
        that contain the substring ``'http'``.
    """
    # Imported locally so the function is self-contained.
    from html.parser import HTMLParser

    class _AnchorHrefCollector(HTMLParser):
        """Accumulate the href attribute of every <a> start tag."""

        def __init__(self):
            super().__init__()
            self.hrefs = []

        def handle_starttag(self, tag, attrs):
            # HTMLParser passes tag and attribute names in lower case.
            if tag != 'a':
                return
            for name, value in attrs:
                # value is None for a bare attribute such as <a href>.
                if name == 'href' and value and 'http' in value:
                    self.hrefs.append(value)

    collector = _AnchorHrefCollector()
    # Feeding chunk by chunk lets the parser join tags split across lines.
    for line in htmllist:
        collector.feed(line)
    collector.close()
    return collector.hrefs
def showResults(urls):
    """Print each URL on its own line, prefixed with its position.

    Positions are zero-based and right-aligned in a field of width two.

    Args:
        urls: Iterable of URL strings to display.
    """
    template = '第{:2}个URL:{}'
    for index, link in enumerate(urls):
        print(template.format(index, link))
def saveResults(filepath, urls):
    """Write the URLs to a text file, one per line.

    The file is created or truncated and written as UTF-8, the same
    encoding ``getHTMLlines`` uses for reading, so URLs containing
    non-ASCII characters do not depend on the platform default encoding.

    Args:
        filepath: Path of the output file.
        urls: Iterable of URL strings to store.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # The context manager closes the file even if a write fails.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(url + '\n' for url in urls)
def main():
    """Run the pipeline: read the page, extract links, print and save them.

    Reads ``baidu3.html`` from the current directory and writes the
    extracted URLs to ``baidu3-urls.txt``.
    """
    source_path = 'baidu3.html'
    target_path = 'baidu3-urls.txt'
    links = extractImageUrls(getHTMLlines(source_path))
    showResults(links)
    saveResults(target_path, links)
# Script entry point: runs the whole pipeline unconditionally, so it also
# executes when this file is imported as a module.
main()
|
|