欧美bbbwbbbw肥妇,免费乱码人妻系列日韩,一级黄片

Python實(shí)現(xiàn)大文件排序的方法

 更新時(shí)間:2015年07月10日 12:26:42   作者:Sephiroth  
這篇文章主要介紹了Python大文件排序的方法,涉及Python針對(duì)文件、緩存及日期等操作的相關(guān)技巧,具有一定參考借鑒價(jià)值,需要的朋友可以參考下

本文實(shí)例講述了Python實(shí)現(xiàn)大文件排序的方法。分享給大家供大家參考。具體實(shí)現(xiàn)方法如下:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
  lines = input.get().splitlines()
  element_set = {}
  for line in lines:
    if line.strip() == 'STOP':
      return
    try:
      element = line.split(' ')[0]
      if not element_set.get(element): element_set[element] = ''
    except:
      pass
  sorted_element = sorted(element_set)
  #print sorted_element
  output.put('\n'.join(sorted_element))
def write_worker(input, pre):
  os.system('mkdir %s'%pre)
  i = 0
  while True:
    content = input.get()
    if content.strip() == 'STOP':
      return
    write_sorted_bulk(content, '%s/%s'%(pre, i))
    i += 1
def write_sorted_bulk(content, filename):
  f = file(filename, 'w')
  f.write(content)
  f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
  t = datetime.now()
  pre, ext = os.path.splitext(filename)
  if ext == '.gz':
    file_file = gzip.open(filename, 'rb')
  else:
    file_file = open(filename)
  bulk_queue = Queue(10)
  sorted_queue = Queue(10)
  NUM_SORT = num_sort
  sort_worker_pool = []
  for i in range(NUM_SORT):
    sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
    sort_worker_pool[i].start()
  NUM_WRITE = 1
  write_worker_pool = []
  for i in range(NUM_WRITE):
    write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
    write_worker_pool[i].start()
  buf = file_file.read(buf_size)
  sorted_count = 0
  while len(buf):
    end_line = buf.rfind('\n')
    #print buf[:end_line+1]
    bulk_queue.put(buf[:end_line+1])
    sorted_count += 1
    if end_line != -1:
      buf = buf[end_line+1:] + file_file.read(buf_size)
    else:
      buf = file_file.read(buf_size)
  for i in range(NUM_SORT):
    bulk_queue.put('STOP')
  for i in range(NUM_SORT):
    sort_worker_pool[i].join()
   
  for i in range(NUM_WRITE):
    sorted_queue.put('STOP')
  for i in range(NUM_WRITE):
    write_worker_pool[i].join()
  print 'elasped ', datetime.now() - t
  return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
  def __init__(self, dir, idx = 0, count = 1):
    files = os.listdir(dir)
    self.heap = []
    self.files = {}
    self.bulks = {}
    self.pre_element = None
    for i in range(len(files)):
      file = files[i]
      if hash(file) % count != idx: continue
      input = open(os.path.join(dir, file))
      self.files[i] = input
      self.bulks[i] = ''
      heappush(self.heap, (self.get_next_element_buffered(i), i))
  def get_next_element_buffered(self, i):
    if len(self.bulks[i]) < 256:
      if self.files[i] is not None:
        buf = self.files[i].read(65536)
        if buf:
          self.bulks[i] += buf
        else:
          self.files[i].close()
          self.files[i] = None
    end_line = self.bulks[i].find('\n')
    if end_line == -1:
      end_line = len(self.bulks[i])
    element = self.bulks[i][:end_line]
    self.bulks[i] = self.bulks[i][end_line+1:]
    return element
  def poppush_uniq(self):
    while True:
      element = self.poppush()
      if element is None:
        return None
      if element != self.pre_element:
        self.pre_element = element
        return element
  def poppush(self):
    try:
      element, index = heappop(self.heap)
    except IndexError:
      return None
    new_element = self.get_next_element_buffered(index)
    if new_element:
      heappush(self.heap, (new_element, index))
    return element
def heappoppush(dir, queue, idx = 0, count = 1):
  heap = file_heap(dir, idx, count)
  while True:
    d = heap.poppush_uniq()
    queue.put(d)
    if d is None: return
def heappoppush2(dir, queue, count = 1):
  heap = []
  procs = []
  queues = []
  pre_element = None
  for i in range(count):
    q = Queue(1024)
    q_buf = queue_buffer(q)
    queues.append(q_buf)
    p = Process(target=heappoppush, args=(dir, q_buf, i, count))
    procs.append(p)
    p.start()
  queues = tuple(queues)
  for i in range(count):
    heappush(heap, (queues[i].get(), i))
  while True:
    try:
      d, i= heappop(heap)
    except IndexError:
      queue.put(None)
      for p in procs:
        p.join()
      return
    else:
      if d is not None:
        heappush(heap,(queues[i].get(), i))
        if d != pre_element:
          pre_element = d
          queue.put(d)
def merge_file(dir):
  heap = file_heap( dir )
  os.system('rm -f '+dir+'.merge')
  fmerge = open(dir+'.merge', 'a')
  element = heap.poppush_uniq()
  fmerge.write(element+'\n')
  while element is not None:
    element = heap.poppush_uniq()
    fmerge.write(element+'\n')
class queue_buffer:
  def __init__(self, queue):
    self.q = queue
    self.rbuf = []
    self.wbuf = []
  def get(self):
    if len(self.rbuf) == 0:
      self.rbuf = self.q.get()
    r = self.rbuf[0]
    del self.rbuf[0]
    return r
  def put(self, d):
    self.wbuf.append(d)
    if d is None or len(self.wbuf) > 1024:
      self.q.put(self.wbuf)
      self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
  print 'buffer size', buf
  from file_split import split_sort_file
  os.system('rm -rf '+ os.path.splitext(file_old)[0] )
  os.system('rm -rf '+ os.path.splitext(file_new)[0] )
  t = datetime.now()
  split_sort_file(file_old,5,buf)
  split_sort_file(file_new,5,buf)
  print 'split elasped ', datetime.now() - t
  os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
  os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
  os.system('rm -f '+file_diff)
  t = datetime.now()
  zdiff = open(file_diff, 'a')
  old_q = Queue(1024)
  new_q = Queue(1024)
  old_queue = queue_buffer(old_q)
  new_queue = queue_buffer(new_q)
  h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
  h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
  h1.start(), h2.start()
  old = old_queue.get()
  new = new_queue.get()
  old_count, new_count = 0, 0
  while old is not None or new is not None:
    if old > new or old is None:
      zdiff.write('< '+new+'\n')
      new = new_queue.get()
      new_count +=1
    elif old < new or new is None:
      zdiff.write('> '+old+'\n')
      old = old_queue.get()
      old_count +=1
    else:
      old = old_queue.get()
      new = new_queue.get()
  print 'new_count:', new_count
  print 'old_count:', old_count
  print 'diff elasped ', datetime.now() - t
  h1.join(), h2.join()

希望本文所述對(duì)大家的Python程序設(shè)計(jì)有所幫助。

相關(guān)文章

  • python學(xué)習(xí)實(shí)操案例(二)

    python學(xué)習(xí)實(shí)操案例(二)

    這篇文章主要介紹了python學(xué)習(xí)實(shí)操案例,主要實(shí)操內(nèi)容有二進(jìn)制轉(zhuǎn)換、為自己手機(jī)充值、、計(jì)算能量的消耗等,需要的小伙伴可以參考一下
    2022-02-02
  • python scrapy框架中Request對(duì)象和Response對(duì)象的介紹

    python scrapy框架中Request對(duì)象和Response對(duì)象的介紹

    本文介紹了python基礎(chǔ)之scrapy框架中Request對(duì)象和Response對(duì)象的介紹,Request對(duì)象主要是用來(lái)請(qǐng)求數(shù)據(jù),爬取一頁(yè)的數(shù)據(jù)重新發(fā)送一個(gè)請(qǐng)求的時(shí)候調(diào)用,Response對(duì)象一般是由scrapy給你自動(dòng)構(gòu)建的,因此開(kāi)發(fā)者不需要關(guān)心如何創(chuàng)建Response對(duì)象,下面來(lái)一起來(lái)了解更多內(nèi)容吧
    2022-02-02
  • 詳解基于python-django框架的支付寶支付案例

    詳解基于python-django框架的支付寶支付案例

    這篇文章主要介紹了基于python-django框架的支付寶支付案例,文中通過(guò)示例代碼介紹的非常詳細(xì),對(duì)大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友們下面隨著小編來(lái)一起學(xué)習(xí)學(xué)習(xí)吧
    2019-09-09
  • python scipy求解非線(xiàn)性方程的方法(fsolve/root)

    python scipy求解非線(xiàn)性方程的方法(fsolve/root)

    今天小編就為大家分享一篇python scipy求解非線(xiàn)性方程的方法(fsolve/root),具有很好的參考價(jià)值,希望對(duì)大家有所幫助。一起跟隨小編過(guò)來(lái)看看吧
    2018-11-11
  • 對(duì)python中UDP,socket的使用詳解

    對(duì)python中UDP,socket的使用詳解

    今天小編就為大家分享一篇對(duì)python中UDP,socket的使用詳解,具有很好的參考價(jià)值,希望對(duì)大家有所幫助。一起跟隨小編過(guò)來(lái)看看吧
    2019-08-08
  • pytorch + visdom CNN處理自建圖片數(shù)據(jù)集的方法

    pytorch + visdom CNN處理自建圖片數(shù)據(jù)集的方法

    這篇文章主要介紹了pytorch + visdom CNN處理自建圖片數(shù)據(jù)集的方法,小編覺(jué)得挺不錯(cuò)的,現(xiàn)在分享給大家,也給大家做個(gè)參考。一起跟隨小編過(guò)來(lái)看看吧
    2018-06-06
  • Anaconda3+tensorflow2.0.0+PyCharm安裝與環(huán)境搭建(圖文)

    Anaconda3+tensorflow2.0.0+PyCharm安裝與環(huán)境搭建(圖文)

    這篇文章主要介紹了Anaconda3+tensorflow2.0.0+PyCharm安裝與環(huán)境搭建(圖文),文中通過(guò)示例代碼介紹的非常詳細(xì),對(duì)大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友們下面隨著小編來(lái)一起學(xué)習(xí)學(xué)習(xí)吧
    2020-02-02
  • pycharm 批量修改變量名稱(chēng)的方法

    pycharm 批量修改變量名稱(chēng)的方法

    這篇文章主要介紹了pycharm 批量修改變量名稱(chēng)的方法,文中通過(guò)示例代碼介紹的非常詳細(xì),對(duì)大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友們下面隨著小編來(lái)一起學(xué)習(xí)學(xué)習(xí)吧
    2019-08-08
  • python標(biāo)準(zhǔn)庫(kù)turtle海龜繪圖實(shí)現(xiàn)簡(jiǎn)單奧運(yùn)五環(huán)

    python標(biāo)準(zhǔn)庫(kù)turtle海龜繪圖實(shí)現(xiàn)簡(jiǎn)單奧運(yùn)五環(huán)

    這篇文章主要為大家介紹了python使用turtle實(shí)現(xiàn)最簡(jiǎn)單簡(jiǎn)單奧運(yùn)五環(huán)繪圖,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進(jìn)步,早日升職加薪
    2022-05-05
  • python常用小腳本實(shí)例總結(jié)

    python常用小腳本實(shí)例總結(jié)

    在日常的工作中我們總會(huì)面臨到各式各樣的問(wèn)題,下面這篇文章主要給大家介紹了關(guān)于python常用小腳本的相關(guān)資料,文中通過(guò)示例代碼介紹的非常詳細(xì),需要的朋友可以參考下
    2022-07-07

最新評(píng)論