Skip to content

Instantly share code, notes, and snippets.

@matklad
Forked from axefrog/0.suffixtree.cs
Created May 7, 2012 05:34
Show Gist options
  • Save matklad/2626115 to your computer and use it in GitHub Desktop.
Save matklad/2626115 to your computer and use it in GitHub Desktop.
C# Suffix tree implementation based on Ukkonen's algorithm
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace StringAlgorithms
{
public class SuffixTree
{
// http://stackoverflow.com/questions/9452701/can-someone-please-explain-ukkonens-suffix-tree-algorithm-in-plain-english
private ActivePoint _activePoint;
public string Word { get; set; }
public Node RootNode { get; set; }
public int InsertionPoint { get; set; }
public int NextNodeNumber { get; set; }
public SuffixTree(string word)
{
Word = word;
RootNode = new Node { NodeNumber = NextNodeNumber++ };
Build();
}
private void Build()
{
_activePoint = new ActivePoint
{
ActiveNode = RootNode,
ActiveEdgeStartsWith = '\0',
ActiveLength = 0
};
var remainder = 1;
InsertionPoint = 0;
for(InsertionPoint = 0; InsertionPoint < Word.Length; InsertionPoint++)
{
var c = Word[InsertionPoint];
Console.WriteLine("Adding: '" + c.ToString().ToUpper() + "'");
var suffixesToInsert = remainder;
Node previousNode = null;
var implicitComplete = false;
Console.WriteLine("Remainder is " + remainder);
for(var i = 0; i < suffixesToInsert && !implicitComplete; i++)
{
Console.WriteLine("The active point is currently: " + _activePoint);
Edge edge;
if(_activePoint.ActiveEdgeStartsWith != '\0')
{
Console.WriteLine("Looking for edge starting with '" + _activePoint.ActiveEdgeStartsWith.ToString().ToUpper() + "'");
edge = _activePoint.ActiveNode.Edges[_activePoint.ActiveEdgeStartsWith];
if(Word[edge.StartIndex + _activePoint.ActiveLength] == c)
{
Console.WriteLine("Found implicit match in edge " + edge);
_activePoint.ActiveLength++;
remainder++;
if(_activePoint.ActiveLength == edge.Length)
{
Console.WriteLine("Match is at end of edge; active point will be moved to the connecting node.");
_activePoint.ActiveNode = edge.Tail;
_activePoint.ActiveEdgeStartsWith = '\0';
_activePoint.ActiveLength = 0;
}
implicitComplete = true;
}
else
{
Console.Write("Splitting edge " + edge + " --> ");
edge.EndIndex = edge.StartIndex + _activePoint.ActiveLength - 1;
edge.Tail = new Node { Stem = edge, NodeNumber = NextNodeNumber++ };
var subdivisionStartIndex = edge.EndIndex.Value + 1;
var subdividedStartChar = Word[subdivisionStartIndex];
edge.Tail.Edges.Add(subdividedStartChar, new Edge(this) { StartIndex = subdivisionStartIndex, Head = edge.Tail });
edge.Tail.Edges.Add(c, new Edge(this) { StartIndex = InsertionPoint, Head = edge.Tail });
Console.WriteLine(edge + " => " + edge.Tail.NodeNumber + " => " + edge.Tail.Edges.First().Value + " / " + edge.Tail.Edges.Last().Value);
if(i > 0 && previousNode != null)
{
Console.WriteLine("Connected node " + previousNode.NodeNumber + " ---> node " + edge.Tail.NodeNumber);
previousNode.LinkedNode = edge.Tail;
}
previousNode = edge.Tail;
remainder--;
if(_activePoint.ActiveNode.NodeNumber > 0)
{
Console.Write("Changed the active node from " + _activePoint.ActiveNode.NodeNumber + " to ");
_activePoint.ActiveNode = _activePoint.ActiveNode.LinkedNode ?? RootNode;
Console.WriteLine(_activePoint.ActiveNode.NodeNumber);
}
else
{
_activePoint.ActiveLength--;
_activePoint.ActiveEdgeStartsWith = _activePoint.ActiveLength == 0 ? '\0' : Word[InsertionPoint - _activePoint.ActiveLength];
}
}
}
else if(_activePoint.ActiveNode.Edges.TryGetValue(c, out edge))
{
Console.WriteLine("Found '" + c.ToString().ToUpper() + "' on edge " + edge + "; updating active point.");
_activePoint.ActiveEdgeStartsWith = c;
_activePoint.ActiveLength = 1;
remainder++;
implicitComplete = true;
}
else
{
Console.WriteLine("Inserting singular edge for '" + c.ToString().ToUpper() + "' from node " + _activePoint.ActiveNode.NodeNumber);
edge = new Edge(this) { StartIndex = InsertionPoint, Head = _activePoint.ActiveNode };
_activePoint.ActiveNode.Edges.Add(c, edge);
}
Console.WriteLine(WriteText());
}
}
}
public string WriteText()
{
var writer = new StringWriter();
RootNode.WriteText(writer, 0);
return writer.ToString();
}
}
public class Node
{
public Dictionary<char, Edge> Edges { get; private set; }
public Node LinkedNode { get; set; }
public Edge Stem { get; set; }
public int NodeNumber { get; set; }
public Node()
{
Edges = new Dictionary<char, Edge>();
}
public void WriteText(TextWriter writer, int indent)
{
writer.Write(NodeNumber);
var i = 0;
var maxEdgeLength = Edges.Values.Max(e => (e.EndIndex ?? e.Tree.InsertionPoint) - e.StartIndex);
foreach(var edge in Edges.Values)
{
if(i++ == 0)
writer.Write(" +");
else
{
writer.Write(" ");
if(indent > 0)
{
writer.Write("|");
writer.Write(new string(' ', indent - 1));
}
writer.Write("+");
}
edge.WriteText(writer, indent + 3, maxEdgeLength);
}
}
public int Depth
{
get { return Stem == null ? 0 : Stem.Length + Stem.Head.Depth; }
}
public override string ToString()
{
return "Node " + NodeNumber;
}
}
public class Edge
{
public Edge(SuffixTree tree)
{
Tree = tree;
}
public SuffixTree Tree { get; set; }
public Node Head { get; set; }
public Node Tail { get; set; }
public int StartIndex { get; set; }
public int? EndIndex { get; set; }
public override string ToString()
{
return string.Concat("[", Head.NodeNumber, ":'", Tree.Word.Substring(StartIndex, Length).ToUpper(), "']");
}
public int Length
{
get { return (EndIndex ?? Math.Min(Tree.Word.Length - 1, Tree.InsertionPoint)) - StartIndex + 1; }
}
public void WriteText(TextWriter writer, int indent, int maxEdgeLength)
{
writer.Write(" - ");
var str = Tree.Word.Substring(StartIndex, Length).ToUpper();
writer.Write(str);
if(Tail == null)
writer.WriteLine("");
else
{
writer.Write(" ");
writer.Write(new string('-', 1 + maxEdgeLength - str.Length));
writer.Write(" ");
Tail.WriteText(writer, indent + maxEdgeLength + 6);
}
}
}
public class ActivePoint
{
public Node ActiveNode { get; set; }
public char ActiveEdgeStartsWith { get; set; }
public int ActiveLength { get; set; }
public override string ToString()
{
return string.Concat("{", ActiveNode, ",'", ActiveEdgeStartsWith == '\0' ? "\\0" : ActiveEdgeStartsWith.ToString(), "',", ActiveLength, "}");
}
}
}
static void Main()
{
var tree = new SuffixTree("abcabxabcd");
}
Adding: 'A'
Remainder is 1
The active point is currently: {Node 0,'\0',0}
Inserting singular edge for 'A' from node 0
0 + - A
Adding: 'B'
Remainder is 1
The active point is currently: {Node 0,'\0',0}
Inserting singular edge for 'B' from node 0
0 + - AB
+ - B
Adding: 'C'
Remainder is 1
The active point is currently: {Node 0,'\0',0}
Inserting singular edge for 'C' from node 0
0 + - ABC
+ - BC
+ - C
Adding: 'A'
Remainder is 1
The active point is currently: {Node 0,'\0',0}
Found 'A' on edge [0:'ABCA']; updating active point.
0 + - ABCA
+ - BCA
+ - CA
Adding: 'B'
Remainder is 2
The active point is currently: {Node 0,'a',1}
Looking for edge starting with 'A'
Found implicit match in edge [0:'ABCAB']
0 + - ABCAB
+ - BCAB
+ - CAB
Adding: 'X'
Remainder is 3
The active point is currently: {Node 0,'a',2}
Looking for edge starting with 'A'
Splitting edge [0:'ABCABX'] --> [0:'AB'] => 1 => [1:'CABX'] / [1:'X']
0 + - AB --- 1 + - CABX
| + - X
+ - BCABX
+ - CABX
The active point is currently: {Node 0,'b',1}
Looking for edge starting with 'B'
Splitting edge [0:'BCABX'] --> [0:'B'] => 2 => [2:'CABX'] / [2:'X']
Connected node 1 ---> node 2
0 + - AB -- 1 + - CABX
| + - X
+ - B --- 2 + - CABX
| + - X
+ - CABX
The active point is currently: {Node 0,'\0',0}
Inserting singular edge for 'X' from node 0
0 + - AB -- 1 + - CABX
| + - X
+ - B --- 2 + - CABX
| + - X
+ - CABX
+ - X
Adding: 'A'
Remainder is 1
The active point is currently: {Node 0,'\0',0}
Found 'A' on edge [0:'AB']; updating active point.
0 + - AB --- 1 + - CABXA
| + - XA
+ - B ---- 2 + - CABXA
| + - XA
+ - CABXA
+ - XA
Adding: 'B'
Remainder is 2
The active point is currently: {Node 0,'a',1}
Looking for edge starting with 'A'
Found implicit match in edge [0:'AB']
Match is at end of edge; active point will be moved to the connecting node.
0 + - AB ---- 1 + - CABXAB
| + - XAB
+ - B ----- 2 + - CABXAB
| + - XAB
+ - CABXAB
+ - XAB
Adding: 'C'
Remainder is 3
The active point is currently: {Node 1,'\0',0}
Found 'C' on edge [1:'CABXABC']; updating active point.
0 + - AB ----- 1 + - CABXABC
| + - XABC
+ - B ------ 2 + - CABXABC
| + - XABC
+ - CABXABC
+ - XABC
Adding: 'D'
Remainder is 4
The active point is currently: {Node 1,'c',1}
Looking for edge starting with 'C'
Splitting edge [1:'CABXABCD'] --> [1:'C'] => 3 => [3:'ABXABCD'] / [3:'D']
Changed the active node from 1 to 2
0 + - AB ------ 1 + - C ---- 3 + - ABXABCD
| + - D
| + - XABCD
+ - B ------- 2 + - CABXABCD
| + - XABCD
+ - CABXABCD
+ - XABCD
The active point is currently: {Node 2,'c',1}
Looking for edge starting with 'C'
Splitting edge [2:'CABXABCD'] --> [2:'C'] => 4 => [4:'ABXABCD'] / [4:'D']
Connected node 3 ---> node 4
Changed the active node from 2 to 0
0 + - AB ------ 1 + - C ---- 3 + - ABXABCD
| + - D
| + - XABCD
+ - B ------- 2 + - C ---- 4 + - ABXABCD
| + - D
| + - XABCD
+ - CABXABCD
+ - XABCD
The active point is currently: {Node 0,'c',1}
Looking for edge starting with 'C'
Splitting edge [0:'CABXABCD'] --> [0:'C'] => 5 => [5:'ABXABCD'] / [5:'D']
Connected node 4 ---> node 5
0 + - AB --- 1 + - C ---- 3 + - ABXABCD
| + - D
| + - XABCD
+ - B ---- 2 + - C ---- 4 + - ABXABCD
| + - D
| + - XABCD
+ - C ---- 5 + - ABXABCD
| + - D
+ - XABCD
The active point is currently: {Node 0,'\0',0}
Inserting singular edge for 'D' from node 0
0 + - AB --- 1 + - C ---- 3 + - ABXABCD
| + - D
| + - XABCD
+ - B ---- 2 + - C ---- 4 + - ABXABCD
| + - D
| + - XABCD
+ - C ---- 5 + - ABXABCD
| + - D
+ - XABCD
+ - D
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment