Created
April 11, 2021 15:14
-
-
Save dpricha89/7eb2bde5568648c07007bd2702ddf186 to your computer and use it in GitHub Desktop.
Get the 10 most frequent lines in a large file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import array
import heapq
import io
import sys
import tempfile
import unittest
from itertools import groupby, islice
# Number of lines read back from each temporary file per batch while merging.
NUM_LINES_IN_MEM_PER_FILE = 100000
# Default number of source lines sorted in memory and spilled to one temp file.
NUM_LINES_IN_EACH_FILE = 1000000


class SortLargeFile:
    """Report the most frequent lines of a file via an external merge sort.

    The source is read in chunks; each chunk is sorted in memory and written
    to its own temporary file.  The sorted temporary files are then merged
    lazily, so equal lines arrive next to each other and can be counted in a
    single streaming pass.
    """

    def intsfromfile(self, f):
        """Yield the lines of *f* in file order and close *f* afterwards.

        Lines are fetched in batches of ``NUM_LINES_IN_MEM_PER_FILE``.  Once
        iteration has started, *f* is closed when the generator is exhausted
        or closed early.
        """
        try:
            while True:
                batch = list(islice(f, NUM_LINES_IN_MEM_PER_FILE))
                if not batch:
                    break
                yield from batch
        finally:
            f.close()

    def saveToTemp(self, arr):
        """Write the lines in *arr*, sorted, to a temp file; return a reader.

        Every line is normalised to end with exactly the newline it needs
        *before* sorting: a final source line without a trailing newline
        would otherwise be fused with its successor when written out, and
        normalising first keeps the on-disk order identical to the order
        ``heapq.merge`` later compares with.

        Returns the generator produced by ``intsfromfile`` over the temp file.
        """
        print("size of the arr {}".format(len(arr)))
        ordered = sorted(
            line if line.endswith("\n") else line + "\n" for line in arr
        )
        f = tempfile.TemporaryFile(mode="r+")
        try:
            f.writelines(ordered)
            f.seek(0)
        except BaseException:
            # Do not leak the temp file if the write fails; re-raise as is.
            f.close()
            raise
        return self.intsfromfile(f)

    def sort(self, source, top=10, lines_per_chunk=None):
        """Print the *top* most frequent lines of the file at *source*.

        Args:
            source: Path of the text file to analyse.
            top: How many distinct lines to report (default 10).
            lines_per_chunk: Lines sorted in memory per temporary file;
                defaults to ``NUM_LINES_IN_EACH_FILE``.

        Output is one ``"<line> <count>"`` row per reported line, least
        frequent first; ties on count are ordered by line text.  Progress
        messages are printed before the result rows.

        Raises:
            ValueError: If *lines_per_chunk* is smaller than 1.
        """
        if lines_per_chunk is None:
            lines_per_chunk = NUM_LINES_IN_EACH_FILE
        if lines_per_chunk < 1:
            raise ValueError("lines_per_chunk must be at least 1")

        print('source {}'.format(source))
        iters = []
        try:
            with open(source, 'r') as sf:
                while True:
                    arr = list(islice(sf, lines_per_chunk))
                    if not arr:
                        break
                    # append, not +=: the reader itself must be stored, not
                    # drained into the list line by line.
                    iters.append(self.saveToTemp(arr))
            print('Number of tmp files', len(iters))

            # The merged stream is globally sorted, so each run of equal
            # lines is one distinct key and its length is the frequency.
            counted = (
                (sum(1 for _ in run), line.rstrip('\n'))
                for line, run in groupby(heapq.merge(*iters))
            )
            ranked = heapq.nlargest(top, counted)
        finally:
            # Release temp files of readers that were started but not drained.
            for reader in iters:
                reader.close()

        for count, line in reversed(ranked):
            print(line, count)
# Module-level driver: runs the sorter against a hard-coded local file as soon
# as this module is loaded.
source_path = '/Users/drichards/grabbag/test_mocking.py/hash_keys_small.txt'
slf = SortLargeFile()
slf.sort(source_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment