|
import requests
import queue
import time
import logging
import threading
# Module-level logger used by the download code; records are appended to
# ./log/main.log (the ./log directory has to exist before this runs, since
# FileHandler opens the file immediately).
loghandel = logging.getLogger("totest")

# One line per record: timestamp, logger name, level, module, line, message.
formator = logging.Formatter(
    fmt='%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(lineno)d - %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
)

fhand = logging.FileHandler(filename="./log/main.log", mode='a')
fhand.setLevel(logging.DEBUG)
fhand.setFormatter(formator)

loghandel.setLevel(logging.DEBUG)
loghandel.addHandler(fhand)
def test5(threadcount):
reqqueue = queue.Queue()
respqueue=queue.Queue()
for i in range(1,624):
reqqueue.put("http://club.autohome.com.cn/bbs/forum-c-3170-{0}.html?orderby=dateline&qaType=-1".format(i))
def downloadpage(url):
starttime = time.time()
startclock = time.clock()
rsp = requests.get(url)
endtime = time.time()
endclock = time.clock()
loghandel.info(rsp.url + " " + str(rsp.elapsed.microseconds) + " " + str(rsp.status_code) + " " + str(
endtime - starttime)+" "+ str(endclock - startclock))
respqueue.put(rsp)
while reqqueue.qsize()!=0:
threadpool=[]
for i in range(1, threadcount):
try:
page = reqqueue.get(block=False)
t = threading.Thread(target=downloadpage(page))
threadpool.append(t)
except queue.Empty as e:
loghandel.error("the pagequeue is empty")
break
for t in threadpool:
t.start()
for t in threadpool:
t.join()
threadpool.clear()
return respqueue
if __name__ == '__main__':
    # Benchmark test5 with increasing batch sizes. Each value is one more
    # than the number of threads started per batch, hence the "i - 1" in the
    # printed line.
    for i in [2, 6, 11, 31, 51]:
        starttime = time.time()
        # time.clock() was removed in Python 3.8; process_time() reports the
        # CPU time of the current process instead.
        startclock = time.process_time()
        test5(i)
        endtime = time.time()
        endclock = time.process_time()
        # Columns: threads per batch, wall-clock seconds, CPU seconds.
        print(str(i - 1), endtime - starttime, endclock - startclock)
---------------------------------------------------------------------------------------------------------------------------------------
1. 这是一个简单的抓取网页信息的脚本,六百多页要三分钟,于是就改成了多线程,测了一下,感觉没有效果:
1 244.80229902267456 7.2211739999999995
5 242.83057475090027 6.998869
10 244.70902132987976 6.464195000000002
30 242.68053889274597 6.5377849999999995
50 231.3118827342987 6.582313999999997
三列分别代表线程数、time.time() 的差值和 time.clock() 的差值。时间的变化几乎可以忽略不计。
2.观察了一下日志,截取一段日志内容如下:
Mon, 22 Aug 2016 20:22:04 - totest - INFO - totest - 85 - http://club.autohome.com.cn/bbs/ ... =dateline&qaType=-1 520455 200 0.5733683109283447 0.009866000000002373
Mon, 22 Aug 2016 20:22:04 - totest - INFO - totest - 85 - http://club.autohome.com.cn/bbs/ ... =dateline&qaType=-1 551292 200 0.6094365119934082 0.010680999999998164
Mon, 22 Aug 2016 20:22:05 - totest - INFO - totest - 85 - http://club.autohome.com.cn/bbs/ ... =dateline&qaType=-1 528814 200 0.5740671157836914 0.012908000000003028
Mon, 22 Aug 2016 20:22:06 - totest - INFO - totest - 85 - http://club.autohome.com.cn/bbs/ ... =dateline&qaType=-1 558045 200 0.6106369495391846 0.008852000000004523
Mon, 22 Aug 2016 20:22:06 - totest - INFO - totest - 85 - http://club.autohome.com.cn/bbs/ ... =dateline&qaType=-1 499849 200 0.5567328929901123 0.010531999999997765
Mon, 22 Aug 2016 20:22:07 - totest - INFO - totest - 85 - http://club.autohome.com.cn/bbs/ ... =dateline&qaType=-1 523191 200 0.5853497982025146 0.00894800000000373
既然是并发的,那么页号从616到621怎么是顺序的呢?不应当是哪一个请求先结束就先记录哪一个请求吗?
请大家指教
|
|