Last active
April 23, 2025 18:46
-
-
Save raxoft/74a046197b54fde51a81be3b2884fe51 to your computer and use it in GitHub Desktop.
Simple Aho-Corasick implementation I wrote when testing blacklisting algorithms.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env ruby | |
class Node | |
attr_reader :parent | |
attr_reader :children | |
attr_accessor :failure | |
attr_accessor :results | |
# Initialize new node. | |
def initialize(parent = nil) | |
@parent = parent | |
@children = {} | |
@failure = nil | |
@results = [] | |
end | |
# Add word to the trie. | |
def add_word(word) | |
node = self | |
word.each_char do |ch| | |
node = node.get_child(ch) | |
end | |
node.results << word | |
end | |
# Get child for given character. | |
def get_child(ch) | |
children[ch] ||= self.class.new(self) | |
end | |
# Link all missing children edges to the root node itself. | |
def link_root | |
children.default = self | |
end | |
# Create links for not found characters to the longest proper suffix which exists as a prefix in the trie. | |
def link_failures | |
queue = [] | |
children.each_value do |child| | |
child.failure = self | |
queue << child | |
end | |
while node = queue.shift | |
node.children.each do |ch, child| | |
failure = node.failure | |
until suffix = failure.children[ch] | |
failure = failure.failure | |
end | |
child.failure = suffix | |
child.results.concat(suffix.results) | |
queue << child | |
end | |
end | |
end | |
# Build the trie for given list of words. | |
def build_trie(words) | |
for word in words | |
add_word(word) | |
end | |
link_root | |
link_failures | |
self | |
end | |
# Find next node for given character. | |
def find_next_state(ch) | |
node = self | |
until child = node.children[ch] | |
node = node.failure | |
end | |
child | |
end | |
# Find all trie words in given text. | |
def find_words(text) | |
node = self | |
for i in 0...text.size | |
node = node.find_next_state(text[i]) | |
node.results.each do |word| | |
yield word, i | |
end | |
end | |
self | |
end | |
# Find given words in given text and yield matches to given block. | |
def self.find_words(words, text, &block) | |
new.build_trie(words).find_words(text, &block) | |
end | |
end | |
words = %w[he she hers his] | |
text = "ahishers" | |
Node.find_words(words, text) do |word, index| | |
puts "Word #{word} appears from #{index - word.size + 1} to #{index}" | |
end | |
# EOF # |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment