python手機號前7位歸屬地爬蟲代碼實例
需求分析
項目上需要用到手機號前7位來判斷號碼是否合法,並做歸屬地查詢。舊的數據是幾年前的,太久了,打算用 Python 爬蟲重新爬一份。
單線程版本
# coding:utf-8
from datetime import datetime


class PhoneInfoSpider:
    """Query the Taobao phone-segment endpoint for a list of 3-digit prefixes.

    For every prefix in ``phoneSections`` a series of 11-digit numbers is
    generated (prefix + 4-digit counter + "0000"), each one is looked up and
    the parsed result is appended to ``./result.txt``.
    """

    def __init__(self, phoneSections):
        """Store the iterable of 3-digit prefix strings to crawl."""
        self.phoneSections = phoneSections

    def phoneInfoHandler(self, textData):
        """Parse one response body and append a CSV line to ``./result.txt``.

        The response is processed line by line: the value between the first
        pair of single quotes is taken from lines 1, 2, 3 and 5 (0-based).
        The local names below follow the original author's labels for those
        lines; the actual field meaning depends on the remote service.

        Responses with fewer than 9 lines are ignored.  A line without a
        quoted value raises ``IndexError``, which the caller handles.
        """
        text = textData.splitlines(True)
        if len(text) < 9:
            return

        number = text[1].split('\'')[1]
        province = text[2].split('\'')[1]
        mobile_area = text[3].split('\'')[1]
        postcode = text[5].split('\'')[1]
        line_text = ",".join((number, province, mobile_area, postcode))
        print(line_text)

        try:
            # ``with`` closes the handle after every append; the encoding is
            # explicit so the output does not depend on the platform default.
            with open('./result.txt', 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except OSError as e:
            print(type(e).__name__, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Look up ``phoneNum`` (a digit string) and hand the body to the parser.

        Any failure (network error, timeout, malformed response) is printed
        and swallowed so that one bad number does not abort the crawl.
        """
        # Third-party dependency, imported where it is used so the parsing and
        # file-writing code stays importable without it.
        import requests

        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=10)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)

    def requestAllSections(self, start=0, limit=10):
        """Crawl every prefix in ``self.phoneSections``.

        Args:
            start: counter value to resume from for the FIRST prefix only
                (useful after an abnormal exit); later prefixes start at 0.
            limit: exclusive upper bound of the 4-digit counter.  The default
                of 10 keeps the original quick-run behaviour; pass 10000 to
                cover a whole segment.
        """
        resume_from = start
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + " begin time:" + str(head_begin))

            # Generate the phone numbers: prefix + counter, padded with "0000".
            for i in range(resume_from, limit):
                middle = str(i).zfill(4)
                phoneNum = head + middle + "0000"
                self.requestPhoneInfo(phoneNum)
            resume_from = 0

            head_end = datetime.now()
            print(head + " end time:" + str(head_end))


if __name__ == '__main__':
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    # China Telecom, China Unicom, China Mobile, virtual operators
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' was listed twice, which crawled that segment twice.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159',
          '172', '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    all_num = dx + lt + yd + add
    print(len(all_num))

    # Segments to crawl
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))
發現爬取一個號段,共10000次查詢,單線程版大概要一個半小時以上,太慢了。
多線程版本
# coding:utf-8
from datetime import datetime
import queue
import threading

# Number of worker threads started per prefix.
threadNum = 32


class MyThread(threading.Thread):
    """Thread that simply runs the zero-argument callable it was given."""

    def __init__(self, func):
        threading.Thread.__init__(self)
        self.func = func

    def run(self):
        self.func()


def requestPhoneInfo():
    """Worker loop: take tasks from the shared queue until it is empty.

    Reads the module globals ``q`` (task queue holding 1..10000), ``lock``
    and ``phone_head`` (current 3-digit prefix), all of which are set by the
    ``__main__`` driver before the workers are started.
    """
    # Third-party dependency, imported where it is used so the parsing and
    # file-writing code stays importable without it.
    import requests

    while True:
        with lock:
            if q.qsize() == 0:
                break
            print("queue size:" + str(q.qsize()))
            p = q.get()  # fetch a task

        # Derive the counter from the task itself.  The queue holds 1..10000,
        # so ``p - 1`` is what ``9999 - q.qsize()`` evaluated to right after
        # the get.  Reading the queue size again outside the lock raced with
        # the other workers and produced duplicated and skipped numbers.
        middle = str(p - 1).zfill(4)
        phoneNum = phone_head + middle + "0000"
        print("phoneNum:" + phoneNum)
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Without a timeout one stalled connection would block join() forever.
            response = requests.get(url, timeout=10)
            phoneInfoHandler(response.text)
        except Exception as e:
            # Best effort: report the failure and continue with the next task.
            print(type(e).__name__, ":", e)


def phoneInfoHandler(textData):
    """Parse one response body and append a CSV line to ``./result.txt``.

    The value between the first pair of single quotes is taken from lines
    1, 2, 3 and 5 (0-based) of the response.  The local names follow the
    original author's labels; the actual field meaning depends on the remote
    service.  Responses with fewer than 9 lines are ignored.  A line without
    a quoted value raises ``IndexError``, which the worker loop handles.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
        return

    number = text[1].split('\'')[1]
    province = text[2].split('\'')[1]
    mobile_area = text[3].split('\'')[1]
    postcode = text[5].split('\'')[1]
    line_text = ",".join((number, province, mobile_area, postcode))
    print(line_text)

    try:
        # ``with`` closes the handle after every append; the encoding is
        # explicit so the output does not depend on the platform default.
        with open('./result.txt', 'a', encoding='utf-8') as f:
            f.write(line_text + '\n')
    except OSError as e:
        print(type(e).__name__, ":", e)


if __name__ == '__main__':
    task_begin = datetime.now()
    print("phone check begin time:" + str(task_begin))

    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' was listed twice, which crawled that segment twice.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159',
          '172', '178', '182', '183', '184', '187', '188', '198']
    all_num = dx + lt + yd
    print(len(all_num))

    for head in all_num:
        head_begin = datetime.now()
        print(head + " begin time:" + str(head_begin))

        q = queue.Queue()
        threads = []
        lock = threading.Lock()
        for p in range(10000):
            q.put(p + 1)
        print(q.qsize())

        # All workers of this round share the same prefix; they are joined
        # before the next prefix is assigned.
        phone_head = head
        for i in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()

        head_end = datetime.now()
        print(head + " end time:" + str(head_end))

    task_end = datetime.now()
    print("phone check end time:" + str(task_end))
多線程版的1個號碼段10000條數據,大概2、3分鐘就好,CPU使用率飆升,大概維持在70%左右。
總共40多個號段,爬完大概1、2個小時,總數據41萬左右。
以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支持腳本之家。
相關文章
Python 專題六 局部變量、全局變量global、導入模塊變量
本文主要講述python全局變量、局部變量和導入模塊變量的方法。具有很好的參考價值,下面跟著小編一起來看下吧 2017-03-03 python格式的Caffe圖片數據均值計算學習
這篇文章主要為大家介紹了python格式的Caffe圖片數據均值計算學習示例詳解,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進步,早日升職加薪 2022-06-06 Pytorch GPU顯存充足卻顯示out of memory的解決方式
今天小編就為大家分享一篇Pytorch GPU顯存充足卻顯示out of memory的解決方式,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧 2020-01-01 在PyCharm中高效使用遠程文件編輯功能的實現
PyCharm作為業界領先的集成開發環境(IDE),提供了強大的本地和遠程開發功能,本文詳細介紹了如何在PyCharm中使用遠程文件編輯功能,希望能夠幫助你提高遠程開發的效率和體驗 2024-08-08