""" | |
Imagine you have a stream of data. | |
As long as the stream keeps streaming you must keep reading the data. | |
You can only read in x bytes of data at a time. | |
You must count all occurances of a search term y in the stream until | |
the stream stops. | |
Edge cases: | |
search term is empty | |
search term is larger than the number of bytes currently being read. | |
need to account for partial matches | |
multiple matches - stream: "aaa" term: "aa", how many times does aa appear? | |
do we need to hand "aa\na" as a match for "aa"? | |
do you want case and white space sensitivity? | |
Explanation: | |
I used a file as my stream of data, where consume_stream, reads in a file | |
'stream_source', we can specify x bytes by using byte_size, and the search | |
term y. | |
It will print out found n occurences of the term y. | |
and return the number of matches as an integer. | |
To run | |
>>> print consume_stream('mytest.txt',1,'jon') | |
I have purposesly not used 'str.count()' or 'in' or regex's | |
as i wanted to use arrays only. I have used slicing, however, | |
this i feel can quickly be achieved in other langues. | |
see: http://bit.ly/1KOKGh9 for a quick explanation of slicing | |
""" | |
# partial data carried over between buffers is kept in this global variable
previous_buffer_data = ""

def consume_stream(stream_source, byte_size, search_term):
    global previous_buffer_data
    # reset any leftover partial match from a previous run
    previous_buffer_data = ""
    occurrences = 0
    # we are using a file here just for testing rather
    # than using an actual 'stream' of data.
    f = open(stream_source, "rb")
    try:
        byte = f.read(byte_size)
        occurrences += read_stream_buffer(byte, search_term)
        while byte != "":
            byte = f.read(byte_size)
            occurrences += read_stream_buffer(byte, search_term)
    finally:
        f.close()
    print 'Found %d occurrences of the term \'%s\' in the stream' % (occurrences, search_term)
    return occurrences
def read_stream_buffer(buffer_data, search_term):
    occurrences = 0
    # if the stream has not ended,
    if len(buffer_data) != 0:
        # if there was a partial match left over from the previous buffers,
        # prepend it to the current buffer before searching
        if len(previous_buffer_data) != 0:
            occurrences = search_for_term(previous_buffer_data + buffer_data, search_term)
        else:
            occurrences = search_for_term(buffer_data, search_term)
        return occurrences
    # if the stream has ended, return the number of
    # occurrences of the term found in this (empty) buffer
    else:
        return occurrences
def search_for_term(content_to_search, search_term):
    # print "searching:%s" % content_to_search
    length_of_content = len(content_to_search)
    length_of_search_term = len(search_term)
    # we need access to previous_buffer_data in case we
    # have a partial match at the end of the buffer
    global previous_buffer_data
    # let's just make sure it's empty, as we've already
    # passed the previous buffer through via
    # the content_to_search variable
    previous_buffer_data = ""
    # how many matches did we find within this buffer
    occurrences = 0
    # if the search term is longer than the
    # content, stash the content in previous_buffer_data
    if length_of_search_term > length_of_content:
        previous_buffer_data = content_to_search
    # there should be enough data to search through
    else:
        # counter is like a pivot for content_to_search:
        # we've searched everything to the left of it; if
        # we find a match, we then move the pivot to the end
        # of the match. This stops overlapping sub-matches, e.g.
        # 'aa' should occur in 'aaa' once, not twice.
        counter = 0
        # loop through each character in the content
        while counter < length_of_content:
            # character we are trying to match
            # against the first character of the search term
            c = content_to_search[counter]
            # this is used for tracking the progress of
            # the search from the current pivot point
            matched_letters_from_pivot = 0
            # if the character matches the first character of the search term
            if c == search_term[0]:
                # now check each character of the search term
                for i in range(0, length_of_search_term):
                    # if the position is still within the content
                    if counter + i < length_of_content:
                        # if the characters match, bump the matched count
                        if content_to_search[counter + i] == search_term[i]:
                            matched_letters_from_pivot += 1
                            # print 'match so far'
                            # if the number of matched characters equals the
                            # length of the search term, it's a full match
                            if matched_letters_from_pivot == length_of_search_term:
                                occurrences += 1
                                # print 'match found in:', content_to_search
                                # now move the pivot to the end of the match
                                previous_buffer_data = ""
                                counter = counter + length_of_search_term - 1
                    # if the counter position + search term length exceeds
                    # the content, add the remaining content to previous_buffer_data
                    else:
                        previous_buffer_data = content_to_search[counter:]
                        return occurrences
            counter += 1
        return occurrences
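
# A minimal usage sketch, assuming we are free to create a small test file
# on disk; the file name 'mytest.txt' and its contents are illustrative only,
# not part of the original gist.
if __name__ == '__main__':
    # build a tiny 'stream' containing two full matches and one partial match
    with open('mytest.txt', 'wb') as test_file:
        test_file.write('jon met jon and jo')
    # reads one byte at a time; expected to print:
    #   Found 2 occurrences of the term 'jon' in the stream
    #   2
    print consume_stream('mytest.txt', 1, 'jon')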