""" | |
Imagine you have a stream of data. | |
As long as the stream keeps streaming you must keep reading the data. | |
You can only read in x bytes of data at a time. | |
You must count all occurances of a search term y in the stream until | |
the stream stops. | |
Edge cases: | |
search term is empty | |
search term is larger than the number of bytes currently being read. | |
need to account for partial matches | |
multiple matches - stream: "aaa" term: "aa", how many times does aa appear? | |
do we need to hand "aa\na" as a match for "aa"? | |
do you want case and white space sensitivity? | |
Explanation: | |
I used a file as my stream of data, where consume_stream, reads in a file | |
'stream_source', we can specify x bytes by using byte_size, and the search | |
term y. | |
It will print out found n occurences of the term y. | |
and return the number of matches as an integer. | |
To run | |
>>> print consume_stream('mytest.txt',1,'jon') | |
I have purposesly not used 'str.count()' or 'in' or regex's | |
as i wanted to use arrays only. I have used slicing, however, | |
this i feel can quickly be achieved in other langues. | |
see: http://bit.ly/1KOKGh9 for a quick explanation of slicing | |
""" | |
# partial data carried over between buffers is kept in this global variable
previous_buffer_data = ""

def consume_stream(stream_source, byte_size, search_term):
    global previous_buffer_data
    # reset any leftover partial match from a previous run
    previous_buffer_data = ""
    occurrences = 0
    # we are using a file here just for testing rather
    # than using an actual 'stream' of data.
    f = open(stream_source, "rb")
    try:
        byte = f.read(byte_size)
        occurrences += read_stream_buffer(byte, search_term)
        while byte != "":
            byte = f.read(byte_size)
            occurrences += read_stream_buffer(byte, search_term)
    finally:
        f.close()
    print 'Found %d occurrences of the term \'%s\' in the stream' % (occurrences, search_term)
    return occurrences
def read_stream_buffer(buffer_data, search_term):
    occurrences = 0
    # if the stream has not ended,
    if len(buffer_data) != 0:
        # if there was a partial match left over from the previous buffers,
        # prepend it to the current buffer before searching
        if len(previous_buffer_data) != 0:
            occurrences = search_for_term(previous_buffer_data + buffer_data, search_term)
        else:
            occurrences = search_for_term(buffer_data, search_term)
        return occurrences
    # if the stream has ended, return the number of
    # occurrences of the term found in this (empty) buffer
    else:
        return occurrences
def search_for_term(content_to_search, search_term):
    # print "searching:%s" % content_to_search
    length_of_content = len(content_to_search)
    length_of_search_term = len(search_term)
    # we need access to previous_buffer_data in case we
    # have a partial match at the end of the buffer
    global previous_buffer_data
    # let's just make sure it's empty, as we've already
    # passed the previous buffer through via
    # the content_to_search variable
    previous_buffer_data = ""
    # how many matches did we find within this buffer
    occurrences = 0
    # if the search term is longer than the
    # content, stash the content in previous_buffer_data
    if length_of_search_term > length_of_content:
        previous_buffer_data = content_to_search
    # there should be enough data to search through
    else:
        # counter is like a pivot for content_to_search:
        # we've searched everything to the left of it; if
        # we find a match, we then move the pivot to the end
        # of the match. This stops overlapping sub-matches, e.g.
        # 'aa' should occur in 'aaa' once, not twice.
        counter = 0
        # loop through each character in the content
        while counter < length_of_content:
            # character we are trying to match
            # against the first character of the search term
            c = content_to_search[counter]
            # this is used for tracking the progress of
            # the search from the current pivot point
            matched_letters_from_pivot = 0
            # if the character matches the first character of the search term
            if c == search_term[0]:
                # now check each character of the search term
                for i in range(0, length_of_search_term):
                    # if the position is still within the content
                    if counter + i < length_of_content:
                        # if the characters match, bump the matched count
                        if content_to_search[counter + i] == search_term[i]:
                            matched_letters_from_pivot += 1
                            # print 'match so far'
                            # if the number of matched characters equals the
                            # length of the search term, it's a full match
                            if matched_letters_from_pivot == length_of_search_term:
                                occurrences += 1
                                # print 'match found in:', content_to_search
                                # now move the pivot to the end of the match
                                previous_buffer_data = ""
                                counter = counter + length_of_search_term - 1
                    # if the counter position + search term length exceeds
                    # the content, add the remaining content to previous_buffer_data
                    else:
                        previous_buffer_data = content_to_search[counter:]
                        return occurrences
            counter += 1
        return occurrences
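
# A minimal usage sketch, assuming we are free to create a small test file
# on disk; the file name 'mytest.txt' and its contents are illustrative only,
# not part of the original gist.
if __name__ == '__main__':
    # build a tiny 'stream' containing two full matches and one partial match
    with open('mytest.txt', 'wb') as test_file:
        test_file.write('jon met jon and jo')
    # reads one byte at a time; expected to print:
    #   Found 2 occurrences of the term 'jon' in the stream
    #   2
    print consume_stream('mytest.txt', 1, 'jon')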