一段爬虫代码，有三大问题请高手指点

010land · 发表于 2018-4-15 22:29:29

写了一段爬取理想论坛“http://www.55188.com/forum-8-1.html”的爬虫代码，但是不尽人意，主要问题有：
1.运行不稳定，比如我要取五页以上帖子及其子贴，十有八九要中途崩溃。
2.运行慢，取两页时间都要等十多分钟。
3.间或有乱码现象，无论是将encoding调整成utf-8,GBK,GB2312都是这样。

代码如下，请高手指点，先谢谢了！
# 理想论坛爬虫1.04，用于爬取主贴再爬子贴
from bs4 import BeautifulSoup
import requests
import threading
import re
import pymysql
import time
import datetime

user_agent='Mozilla/4.0 (compatible;MEIE 5.5;windows NT)'
headers={'User-Agent':user_agent}

# 主帖数组
topics=[]

# 主帖总数，也是爬虫/线程总数
Total=0

# 已完成任务的爬虫/线程数
Completed=0

# 已经插入DB的记录数
Inserted=0

# sql数组
sqls=[]

# 帖子爬虫类
class topicCrawler(threading.Thread):
# 构造函数
def __init__(self,name,url,id):
threading.Thread.__init__(self,name=name)
# 以下是传进来的值
self.name=name # 线程名，等于帖子标题
self.initUrl=url # 输出用
self.url=url    # 爬行用当前地址及下一页地址
self.id=id    # 线程id

# 以下是自己生成的值
self.infos=[] # 找到的子贴地址
self.msgs=[]    # 要输出的消息数组

# 将信息添加进消息数组
def addmsg(self,msg):
currTime=time.strftime('%H:%M:%S',time.localtime(time.time()))
self.msgs.append(currTime+' '+msg);

# 线程要运行的函数
def run(self):
while(self.url!="none"):
self.addmsg('开始读取页面:'+self.url)

try:
rsp=requests.get(self.url,headers=headers)
self.url="none"                         # 用完之后置空，看下一页能否取到值
soup= BeautifulSoup(rsp.text,'html.parser',from_encoding='utf-8')# from_encoding='utf-8'有必要再加
self.addmsg('已经取得页面信息')

# 找出一页里每条发言
childTopicCount=0
for divs in soup.find_all('div',class_="postinfo"):
# 用正则表达式将多个空白字符替换成一个空格
RE = re.compile(r'(\s+)')
line=RE.sub(" ",divs.text)
arr=line.split(' ')
arrLength=len(arr)

if arrLength==7:
info={'楼层':arr[1],
   '作者':arr[2].replace('只看：',''),
   '日期':arr[4],
   '时间':arr[5]}
self.infos.append(info);
childTopicCount=childTopicCount+1
elif arrLength==8:
info={'楼层':arr[1],
   '作者':arr[2].replace('只看：',''),
   '日期':arr[5],
   '时间':arr[6]}
self.infos.append(info);
childTopicCount=childTopicCount+1

self.addmsg('获取到'+str(childTopicCount)+'个子贴')

#找下一页所在地址
for pagesDiv in soup.find_all('div',class_="pages"):
for strong in pagesDiv.find_all('strong'):
self.addmsg('当前为第'+strong.text+'页')

# 找右边的兄弟节点
nextNode=strong.next_sibling
if nextNode and nextNode.get("href"): # 如果右边的兄弟节点存在，且其有href属性
self.url='http://www.55188.com/'+nextNode.get("href")

if self.url!="none":
self.addmsg('找到下一页')
continue
else:
self.addmsg('没有下一页了,小爬读取工作结束')

countWillbeInsert=len(self.infos)
if countWillbeInsert==0:
self.addmsg('小爬居然没有读到任何子贴数据!')
else:
# 准备写DB的insert语句
self.addmsg('小爬准备向数据库插入'+str(countWillbeInsert)+'条记录')
pushSql(self.name,self.infos)
self.addmsg('小爬#'+self.id+'工作完成')

# 一次性输出全部信息，这种方式可以避免线程间抢控制台一行行输出看起来乱
allMsg="";
allMsg=allMsg+'\n小爬ID:'+self.id+'\n';
allMsg=allMsg+'小爬名:'+self.name+'\n';
allMsg=allMsg+'小爬战斗和生活的地方:'+self.initUrl+'\n';
for msg in self.msgs:
allMsg=allMsg+'#'+self.id+' '+msg+'\n'
print(allMsg)

# 输出比例
global Completed
Completed=Completed+1
percent=round(Completed*100/Total,2)
print('----已有'+str(percent)+'%的小爬完成任务')

# 如果全部完成再写数据库
if Completed==Total:
allWillBeInsert=len(sqls)
print('----共有'+str(allWillBeInsert)+'条数据将被插入DB')
sum=insertDB()
print('----共有'+str(sum)+'条数据已经被插入DB')

except Exception as e:
print("#"+self.id+" 线程"+self.name+"发生异常:"+str(e))
self.addmsg('小爬工作出现异常:'+str(e))

continue

# 将生成的Sql语句放到数组里
def pushSql(crawlName,infos):
for info in infos:
sql="insert into test.topic3(floor,author,tdate,ttime,crawlername,addtime) values ('"+info['楼层']+"','"+info['作者']+"','"+info['日期']+"','"+info['时间']+"','"+crawlName+"',now() )"
sqls.append(sql)
#if sql not in sqls:
# sqls.append(sql)

# 数据库插值
def insertDB():
conn=pymysql.connect(host='127.0.0.1',user='root',passwd='12345678',db='test',charset='utf8')

sum=0;# 插入成功总数

for sql in sqls:
count=conn.query(sql) #单条是否成功

if count==0:
print(sql+'插入记录失败');

sum+=count

conn.commit()
conn.close()

return sum

# 找出论坛页中的贴子
def getTopics(pageUrl):
print("开始读取页面"+pageUrl+"的帖子");

try:
rsp=requests.get(pageUrl,headers=headers)
soup= BeautifulSoup(rsp.text,'html.parser',from_encoding='gb2312')# ,from_encoding='utf-8'

# 找出span
for spans in soup.find_all('span',class_="forumdisplay"):
#找出link
for link in spans.find_all('a'):
if link and link.get("href"):
topicLink="http://www.55188.com/"+link.get("href")
topicText=link.text
topic={'link':topicLink,'text':topicText} # 做个小字典对象

if topic not in topics:
topics.append(topic)# 把字典对象塞入数组
print('读取到帖子:'+topic['text'])

print("读取页面"+pageUrl+"的帖子完毕");
except Exception as e:
print("线程"+self.name+"发生异常:"+str(e))

# 入口函数
def main():
# 找帖子
st=datetime.datetime.now();
for i in range(1,4):
print("\n第"+str(i)+"页")
pageUrl='http://www.55188.com/forum-8-'+str(i)+'.html' # 这个页是论坛页，即第1页，第2页等
getTopics(pageUrl);
et=datetime.datetime.now();

global Total
Total=len(topics);# 得到总数
print('共找到'+str(Total)+'个帖子地址,用时'+str((et-st).seconds)+'秒');

# 开始爬帖子的子贴
index=0;
for topic in topics:
index=index+1

title=topic['text']
topicUrl=topic['link']

tc=topicCrawler(name=title,url=topicUrl,id=str(index))
tc.start()

# 启动一个线程的测试函数
def test():
#http://www.55188.com/thread-8332680-1-3.html
tc=topicCrawler(name='test',url='http://www.55188.com/thread-8333398-1-3.html',id='007')
tc.start()

# 开始
main()

010land · 发表于 2018-4-16 17:56:01

关键问题已找到，if Completed==Total: 这句在多线程环境是不可靠的，Completed有可能超过Total而使程序终止。

		自动登录	找回密码
密码			立即注册

[求助] 一段爬虫代码，有三大问题请高手指点