-
-
Save matklad/2626115 to your computer and use it in GitHub Desktop.
C# Suffix tree implementation based on Ukkonen's algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
namespace StringAlgorithms | |
{ | |
public class SuffixTree | |
{ | |
// http://stackoverflow.com/questions/9452701/can-someone-please-explain-ukkonens-suffix-tree-algorithm-in-plain-english | |
private ActivePoint _activePoint; | |
public string Word { get; set; } | |
public Node RootNode { get; set; } | |
public int InsertionPoint { get; set; } | |
public int NextNodeNumber { get; set; } | |
public SuffixTree(string word) | |
{ | |
Word = word; | |
RootNode = new Node { NodeNumber = NextNodeNumber++ }; | |
Build(); | |
} | |
private void Build() | |
{ | |
_activePoint = new ActivePoint | |
{ | |
ActiveNode = RootNode, | |
ActiveEdgeStartsWith = '\0', | |
ActiveLength = 0 | |
}; | |
var remainder = 1; | |
InsertionPoint = 0; | |
for(InsertionPoint = 0; InsertionPoint < Word.Length; InsertionPoint++) | |
{ | |
var c = Word[InsertionPoint]; | |
Console.WriteLine("Adding: '" + c.ToString().ToUpper() + "'"); | |
var suffixesToInsert = remainder; | |
Node previousNode = null; | |
var implicitComplete = false; | |
Console.WriteLine("Remainder is " + remainder); | |
for(var i = 0; i < suffixesToInsert && !implicitComplete; i++) | |
{ | |
Console.WriteLine("The active point is currently: " + _activePoint); | |
Edge edge; | |
if(_activePoint.ActiveEdgeStartsWith != '\0') | |
{ | |
Console.WriteLine("Looking for edge starting with '" + _activePoint.ActiveEdgeStartsWith.ToString().ToUpper() + "'"); | |
edge = _activePoint.ActiveNode.Edges[_activePoint.ActiveEdgeStartsWith]; | |
if(Word[edge.StartIndex + _activePoint.ActiveLength] == c) | |
{ | |
Console.WriteLine("Found implicit match in edge " + edge); | |
_activePoint.ActiveLength++; | |
remainder++; | |
if(_activePoint.ActiveLength == edge.Length) | |
{ | |
Console.WriteLine("Match is at end of edge; active point will be moved to the connecting node."); | |
_activePoint.ActiveNode = edge.Tail; | |
_activePoint.ActiveEdgeStartsWith = '\0'; | |
_activePoint.ActiveLength = 0; | |
} | |
implicitComplete = true; | |
} | |
else | |
{ | |
Console.Write("Splitting edge " + edge + " --> "); | |
edge.EndIndex = edge.StartIndex + _activePoint.ActiveLength - 1; | |
edge.Tail = new Node { Stem = edge, NodeNumber = NextNodeNumber++ }; | |
var subdivisionStartIndex = edge.EndIndex.Value + 1; | |
var subdividedStartChar = Word[subdivisionStartIndex]; | |
edge.Tail.Edges.Add(subdividedStartChar, new Edge(this) { StartIndex = subdivisionStartIndex, Head = edge.Tail }); | |
edge.Tail.Edges.Add(c, new Edge(this) { StartIndex = InsertionPoint, Head = edge.Tail }); | |
Console.WriteLine(edge + " => " + edge.Tail.NodeNumber + " => " + edge.Tail.Edges.First().Value + " / " + edge.Tail.Edges.Last().Value); | |
if(i > 0 && previousNode != null) | |
{ | |
Console.WriteLine("Connected node " + previousNode.NodeNumber + " ---> node " + edge.Tail.NodeNumber); | |
previousNode.LinkedNode = edge.Tail; | |
} | |
previousNode = edge.Tail; | |
remainder--; | |
if(_activePoint.ActiveNode.NodeNumber > 0) | |
{ | |
Console.Write("Changed the active node from " + _activePoint.ActiveNode.NodeNumber + " to "); | |
_activePoint.ActiveNode = _activePoint.ActiveNode.LinkedNode ?? RootNode; | |
Console.WriteLine(_activePoint.ActiveNode.NodeNumber); | |
} | |
else | |
{ | |
_activePoint.ActiveLength--; | |
_activePoint.ActiveEdgeStartsWith = _activePoint.ActiveLength == 0 ? '\0' : Word[InsertionPoint - _activePoint.ActiveLength]; | |
} | |
} | |
} | |
else if(_activePoint.ActiveNode.Edges.TryGetValue(c, out edge)) | |
{ | |
Console.WriteLine("Found '" + c.ToString().ToUpper() + "' on edge " + edge + "; updating active point."); | |
_activePoint.ActiveEdgeStartsWith = c; | |
_activePoint.ActiveLength = 1; | |
remainder++; | |
implicitComplete = true; | |
} | |
else | |
{ | |
Console.WriteLine("Inserting singular edge for '" + c.ToString().ToUpper() + "' from node " + _activePoint.ActiveNode.NodeNumber); | |
edge = new Edge(this) { StartIndex = InsertionPoint, Head = _activePoint.ActiveNode }; | |
_activePoint.ActiveNode.Edges.Add(c, edge); | |
} | |
Console.WriteLine(WriteText()); | |
} | |
} | |
} | |
public string WriteText() | |
{ | |
var writer = new StringWriter(); | |
RootNode.WriteText(writer, 0); | |
return writer.ToString(); | |
} | |
} | |
public class Node | |
{ | |
public Dictionary<char, Edge> Edges { get; private set; } | |
public Node LinkedNode { get; set; } | |
public Edge Stem { get; set; } | |
public int NodeNumber { get; set; } | |
public Node() | |
{ | |
Edges = new Dictionary<char, Edge>(); | |
} | |
public void WriteText(TextWriter writer, int indent) | |
{ | |
writer.Write(NodeNumber); | |
var i = 0; | |
var maxEdgeLength = Edges.Values.Max(e => (e.EndIndex ?? e.Tree.InsertionPoint) - e.StartIndex); | |
foreach(var edge in Edges.Values) | |
{ | |
if(i++ == 0) | |
writer.Write(" +"); | |
else | |
{ | |
writer.Write(" "); | |
if(indent > 0) | |
{ | |
writer.Write("|"); | |
writer.Write(new string(' ', indent - 1)); | |
} | |
writer.Write("+"); | |
} | |
edge.WriteText(writer, indent + 3, maxEdgeLength); | |
} | |
} | |
public int Depth | |
{ | |
get { return Stem == null ? 0 : Stem.Length + Stem.Head.Depth; } | |
} | |
public override string ToString() | |
{ | |
return "Node " + NodeNumber; | |
} | |
} | |
public class Edge | |
{ | |
public Edge(SuffixTree tree) | |
{ | |
Tree = tree; | |
} | |
public SuffixTree Tree { get; set; } | |
public Node Head { get; set; } | |
public Node Tail { get; set; } | |
public int StartIndex { get; set; } | |
public int? EndIndex { get; set; } | |
public override string ToString() | |
{ | |
return string.Concat("[", Head.NodeNumber, ":'", Tree.Word.Substring(StartIndex, Length).ToUpper(), "']"); | |
} | |
public int Length | |
{ | |
get { return (EndIndex ?? Math.Min(Tree.Word.Length - 1, Tree.InsertionPoint)) - StartIndex + 1; } | |
} | |
public void WriteText(TextWriter writer, int indent, int maxEdgeLength) | |
{ | |
writer.Write(" - "); | |
var str = Tree.Word.Substring(StartIndex, Length).ToUpper(); | |
writer.Write(str); | |
if(Tail == null) | |
writer.WriteLine(""); | |
else | |
{ | |
writer.Write(" "); | |
writer.Write(new string('-', 1 + maxEdgeLength - str.Length)); | |
writer.Write(" "); | |
Tail.WriteText(writer, indent + maxEdgeLength + 6); | |
} | |
} | |
} | |
public class ActivePoint | |
{ | |
public Node ActiveNode { get; set; } | |
public char ActiveEdgeStartsWith { get; set; } | |
public int ActiveLength { get; set; } | |
public override string ToString() | |
{ | |
return string.Concat("{", ActiveNode, ",'", ActiveEdgeStartsWith == '\0' ? "\\0" : ActiveEdgeStartsWith.ToString(), "',", ActiveLength, "}"); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static void Main() | |
{ | |
var tree = new SuffixTree("abcabxabcd"); | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Adding: 'A' | |
Remainder is 1 | |
The active point is currently: {Node 0,'\0',0} | |
Inserting singular edge for 'A' from node 0 | |
0 + - A | |
Adding: 'B' | |
Remainder is 1 | |
The active point is currently: {Node 0,'\0',0} | |
Inserting singular edge for 'B' from node 0 | |
0 + - AB | |
+ - B | |
Adding: 'C' | |
Remainder is 1 | |
The active point is currently: {Node 0,'\0',0} | |
Inserting singular edge for 'C' from node 0 | |
0 + - ABC | |
+ - BC | |
+ - C | |
Adding: 'A' | |
Remainder is 1 | |
The active point is currently: {Node 0,'\0',0} | |
Found 'A' on edge [0:'ABCA']; updating active point. | |
0 + - ABCA | |
+ - BCA | |
+ - CA | |
Adding: 'B' | |
Remainder is 2 | |
The active point is currently: {Node 0,'a',1} | |
Looking for edge starting with 'A' | |
Found implicit match in edge [0:'ABCAB'] | |
0 + - ABCAB | |
+ - BCAB | |
+ - CAB | |
Adding: 'X' | |
Remainder is 3 | |
The active point is currently: {Node 0,'a',2} | |
Looking for edge starting with 'A' | |
Splitting edge [0:'ABCABX'] --> [0:'AB'] => 1 => [1:'CABX'] / [1:'X'] | |
0 + - AB --- 1 + - CABX | |
| + - X | |
+ - BCABX | |
+ - CABX | |
The active point is currently: {Node 0,'b',1} | |
Looking for edge starting with 'B' | |
Splitting edge [0:'BCABX'] --> [0:'B'] => 2 => [2:'CABX'] / [2:'X'] | |
Connected node 1 ---> node 2 | |
0 + - AB -- 1 + - CABX | |
| + - X | |
+ - B --- 2 + - CABX | |
| + - X | |
+ - CABX | |
The active point is currently: {Node 0,'\0',0} | |
Inserting singular edge for 'X' from node 0 | |
0 + - AB -- 1 + - CABX | |
| + - X | |
+ - B --- 2 + - CABX | |
| + - X | |
+ - CABX | |
+ - X | |
Adding: 'A' | |
Remainder is 1 | |
The active point is currently: {Node 0,'\0',0} | |
Found 'A' on edge [0:'AB']; updating active point. | |
0 + - AB --- 1 + - CABXA | |
| + - XA | |
+ - B ---- 2 + - CABXA | |
| + - XA | |
+ - CABXA | |
+ - XA | |
Adding: 'B' | |
Remainder is 2 | |
The active point is currently: {Node 0,'a',1} | |
Looking for edge starting with 'A' | |
Found implicit match in edge [0:'AB'] | |
Match is at end of edge; active point will be moved to the connecting node. | |
0 + - AB ---- 1 + - CABXAB | |
| + - XAB | |
+ - B ----- 2 + - CABXAB | |
| + - XAB | |
+ - CABXAB | |
+ - XAB | |
Adding: 'C' | |
Remainder is 3 | |
The active point is currently: {Node 1,'\0',0} | |
Found 'C' on edge [1:'CABXABC']; updating active point. | |
0 + - AB ----- 1 + - CABXABC | |
| + - XABC | |
+ - B ------ 2 + - CABXABC | |
| + - XABC | |
+ - CABXABC | |
+ - XABC | |
Adding: 'D' | |
Remainder is 4 | |
The active point is currently: {Node 1,'c',1} | |
Looking for edge starting with 'C' | |
Splitting edge [1:'CABXABCD'] --> [1:'C'] => 3 => [3:'ABXABCD'] / [3:'D'] | |
Changed the active node from 1 to 2 | |
0 + - AB ------ 1 + - C ---- 3 + - ABXABCD | |
| + - D | |
| + - XABCD | |
+ - B ------- 2 + - CABXABCD | |
| + - XABCD | |
+ - CABXABCD | |
+ - XABCD | |
The active point is currently: {Node 2,'c',1} | |
Looking for edge starting with 'C' | |
Splitting edge [2:'CABXABCD'] --> [2:'C'] => 4 => [4:'ABXABCD'] / [4:'D'] | |
Connected node 3 ---> node 4 | |
Changed the active node from 2 to 0 | |
0 + - AB ------ 1 + - C ---- 3 + - ABXABCD | |
| + - D | |
| + - XABCD | |
+ - B ------- 2 + - C ---- 4 + - ABXABCD | |
| + - D | |
| + - XABCD | |
+ - CABXABCD | |
+ - XABCD | |
The active point is currently: {Node 0,'c',1} | |
Looking for edge starting with 'C' | |
Splitting edge [0:'CABXABCD'] --> [0:'C'] => 5 => [5:'ABXABCD'] / [5:'D'] | |
Connected node 4 ---> node 5 | |
0 + - AB --- 1 + - C ---- 3 + - ABXABCD | |
| + - D | |
| + - XABCD | |
+ - B ---- 2 + - C ---- 4 + - ABXABCD | |
| + - D | |
| + - XABCD | |
+ - C ---- 5 + - ABXABCD | |
| + - D | |
+ - XABCD | |
The active point is currently: {Node 0,'\0',0} | |
Inserting singular edge for 'D' from node 0 | |
0 + - AB --- 1 + - C ---- 3 + - ABXABCD | |
| + - D | |
| + - XABCD | |
+ - B ---- 2 + - C ---- 4 + - ABXABCD | |
| + - D | |
| + - XABCD | |
+ - C ---- 5 + - ABXABCD | |
| + - D | |
+ - XABCD | |
+ - D |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment