The Queue module implements multi-producer, multi-consumer queues. It is especially useful in threaded programming when information must be exchanged safely between multiple threads.
class scrapy_threading(threading.Thread):
    """Worker thread that takes uids from the shared task queue and scrapes them.

    Relies on module-level state defined elsewhere in this file:
    ``task_queue`` (used through ``get``/``put``/``task_done``),
    ``visited_uids`` (membership test only), the ``scraped`` counter, and
    ``lock``, which guards updates to ``scraped``.
    """

    def __init__(self, task, wanted):
        """Remember the per-uid callable and the scraping target.

        @param task: callable invoked as ``task(uid)``; the value it returns
            is iterated and every item is queued as a new uid (``None`` is
            treated as "no new uids")
        @param wanted: the thread stops taking work once the global
            ``scraped`` counter reaches this value
        """
        threading.Thread.__init__(self)
        self.do_task = task
        self.wanted = wanted

    def run(self):
        """Process queued uids until ``scraped`` reaches ``self.wanted``.

        Every uid taken from the queue is marked done exactly once, whether
        it was skipped, scraped successfully, or the task raised; otherwise
        a caller blocked in ``task_queue.join()`` would never wake up.
        """
        # Only ``scraped`` is rebound here; the other shared names are read.
        global scraped

        # NOTE(review): when this loop ends because ``wanted`` was reached,
        # uids still sitting in the queue are never marked done, so a
        # ``task_queue.join()`` elsewhere keeps waiting -- confirm whether the
        # queue should be drained at that point.
        while scraped < self.wanted:
            # get() blocks until a uid is available. The former
            # ``if task_queue: ... else: time.sleep(30)`` test was always true
            # for a queue object, so the sleep branch could never run.
            uid = task_queue.get()
            try:
                # NOTE(review): nothing in this class adds to visited_uids;
                # presumably the task callable does -- confirm.
                if uid in visited_uids:
                    # already crawled
                    continue

                gains = self.do_task(uid)

                # per debug
                print('{0: <25}'.format('[' + time.asctime() + '] ')
                      + ' uid_' + '{0: <12}'.format(uid))

                # Queue the newly gained uids before this one is marked done,
                # so the queue's unfinished count cannot reach zero too early.
                for gained_uid in gains or ():
                    task_queue.put(gained_uid)

                # counting scraped number
                with lock:
                    scraped += 1
                    # per debug
                    print('scraped: ' + str(scraped))
            except Exception as e:
                # Best effort: report the failure and move on to the next uid.
                print(e)
            finally:
                # signals that queue job is done, on every path
                task_queue.task_done()
抓取的主进程
1234567891011121314151617181920212223242526
def scrapy(self):
    """Log in, seed the task queue, start the worker threads and wait on the queue.

    Returns without doing anything when ``login`` reports failure. Otherwise
    the queue is seeded from ``self.start_uid`` or, if that is not set, from
    the uids loaded via ``self.__load_uids__()``. ``self.thread_number``
    daemon workers are then started and the call blocks in
    ``task_queue.join()``.

    @raise Exception: when neither a start uid nor a uids file is configured
    """
    logged_in = login(self.login_username, self.login_password, self.cookies_file)
    if not logged_in:
        return

    # A start uid wins over a uids file when both are configured.
    if self.start_uid:
        seeds = [self.start_uid]
    elif self.uids_file:
        seeds = self.__load_uids__()
    else:
        # start uid or uids file is needed
        raise Exception('ERROR: Start uid or uids file is needed.')

    for seed in seeds:
        task_queue.put(seed)

    # spawn a pool of daemon worker threads, each bound to the task hook
    for _ in range(self.thread_number):
        worker = scrapy_threading(self.scrapy_do_task, self.wanted)
        worker.setDaemon(True)
        worker.start()

    # block until every queued uid has been marked done
    task_queue.join()
3 抓取任务接口
12345678
def scrapy_do_task(self, uid=None):
    '''
    Hook for the uid-based scrapy task; users are expected to override it.

    The default implementation does no work and reports no new uids.

    @param uid: weibo uid
    @return: a list of uids gained from this task; empty by default
    '''
    # Return an empty list rather than None so that callers which iterate
    # over the result keep working when the hook is not overridden.
    return []
[login_account_info]
#account info for logging in to weibo
login_username=ur_login_account_name_here
login_uid=weibo_uid_of_login_account_here
login_password=account_password_here
cookies_file=weibo_cookies.dat

[scrapy_settings]
thread_number=50
wanted=100000
#only one of the 2 properties below is required, and start_uid takes precedence over uids_file
#also note that arguments from the constructor will override these two properties
start_uid=1197161814
uids_file=
#!/usr/bin/env python
#coding=utf8
from weibo_scrapy import scrapy


class my_scrapy(scrapy):
    '''Example subclass showing how to plug a custom uid-based task into scrapy.'''

    # No __init__ override is needed: the base constructor is inherited, so
    # keyword arguments such as start_uid go straight to it.

    def scrapy_do_task(self, uid=None):
        '''
        User needs to overwrite this method to perform uid-based scrapy task.

        @param uid: weibo uid
        @return: a list of uids gained from this task, optional
        '''
        # The earlier super(...).__init__(**kwds) call here referenced an
        # undefined name and raised NameError on every task, so it is gone.

        # do what you want with uid here; note that this scrapy is uid based,
        # so make sure there are uids in the task queue, or gain new uids
        # from this function
        print('WOW...')

        # Replace this with the list of uids gained from this task. It must
        # be a list, not a string: a string would be iterated character by
        # character and each character queued as a uid.
        return []


if __name__ == '__main__':
    s = my_scrapy(start_uid='1197161814')
    s.scrapy()