Skip to content

Instantly share code, notes, and snippets.

@matklad
Forked from axefrog/0.suffixtree.cs
Created May 7, 2012 05:34
Show Gist options
  • Save matklad/2626115 to your computer and use it in GitHub Desktop.
Save matklad/2626115 to your computer and use it in GitHub Desktop.
C# Suffix tree implementation based on Ukkonen's algorithm
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace SuffixTreeAlgorithm
{
public class SuffixTree
{
public string Word { get; private set; }
private int CurrentSuffixStartIndex { get; set; }
private int CurrentSuffixEndIndex { get; set; }
private Node LastCreatedNodeInCurrentIteration { get; set; }
private int UnresolvedSuffixes { get; set; }
public Node RootNode { get; private set; }
private Node ActiveNode { get; set; }
private Edge ActiveEdge { get; set; }
private int DistanceIntoActiveEdge { get; set; }
private char LastCharacterOfCurrentSuffix { get; set; }
private int NextNodeNumber { get; set; }
private SuffixTree(string word)
{
Word = word;
RootNode = new Node(this);
ActiveNode = RootNode;
}
public static SuffixTree Create(string word, char canonizationChar = '$')
{
var tree = new SuffixTree(word);
tree.Build(canonizationChar);
return tree;
}
private void Build(char canonizationChar)
{
var n = Word.IndexOf(Word[Word.Length - 1]);
var mustCanonize = n < Word.Length - 1;
if(mustCanonize)
Word = string.Concat(Word, canonizationChar);
for(CurrentSuffixEndIndex = 0; CurrentSuffixEndIndex < Word.Length; CurrentSuffixEndIndex++)
{
Console.WriteLine("=== ITERATION {0} ===", CurrentSuffixEndIndex);
LastCreatedNodeInCurrentIteration = null;
LastCharacterOfCurrentSuffix = Word[CurrentSuffixEndIndex];
for(CurrentSuffixStartIndex = CurrentSuffixEndIndex - UnresolvedSuffixes; CurrentSuffixStartIndex <= CurrentSuffixEndIndex; CurrentSuffixStartIndex++)
{
var wasImplicitlyAdded = !AddNextSuffix();
Console.WriteLine();
Console.WriteLine(RenderTree());
if(wasImplicitlyAdded)
{
UnresolvedSuffixes++;
break;
}
if(UnresolvedSuffixes > 0)
UnresolvedSuffixes--;
}
}
}
private bool AddNextSuffix()
{
var suffix = string.Concat(Word.Substring(CurrentSuffixStartIndex, CurrentSuffixEndIndex - CurrentSuffixStartIndex), "{", Word[CurrentSuffixEndIndex], "}");
Console.WriteLine("The next suffix of '{0}' to add is '{1}' at indices {2},{3}", Word, suffix, CurrentSuffixStartIndex, CurrentSuffixEndIndex);
Console.WriteLine(" => ActiveNode: {0}", ActiveNode);
Console.WriteLine(" => ActiveEdge: {0}", ActiveEdge == null ? "none" : ActiveEdge.ToString());
Console.WriteLine(" => DistanceIntoActiveEdge: {0}", DistanceIntoActiveEdge);
Console.WriteLine(" => UnresolvedSuffixes: {0}", UnresolvedSuffixes);
if(ActiveEdge != null && DistanceIntoActiveEdge >= ActiveEdge.Length)
throw new Exception("BOUNDARY EXCEEDED");
if(ActiveEdge != null)
return AddCurrentSuffixToActiveEdge();
if(GetExistingEdgeAndSetAsActive())
return false;
ActiveNode.AddNewEdge();
UpdateActivePointAfterAddingNewEdge();
return true;
}
private bool GetExistingEdgeAndSetAsActive()
{
Edge edge;
if(ActiveNode.Edges.TryGetValue(LastCharacterOfCurrentSuffix, out edge))
{
Console.WriteLine("Existing edge for {0} starting with '{1}' found. Values adjusted to:", ActiveNode, LastCharacterOfCurrentSuffix);
ActiveEdge = edge;
DistanceIntoActiveEdge = 1;
NormalizeActivePointIfNowAtOrBeyondEdgeBoundary(ActiveEdge.StartIndex);
Console.WriteLine(" => ActiveEdge is now: {0}", ActiveEdge);
Console.WriteLine(" => DistanceIntoActiveEdge is now: {0}", DistanceIntoActiveEdge);
Console.WriteLine(" => UnresolvedSuffixes is now: {0}", UnresolvedSuffixes);
return true;
}
Console.WriteLine("Existing edge for {0} starting with '{1}' not found", ActiveNode, LastCharacterOfCurrentSuffix);
return false;
}
private bool AddCurrentSuffixToActiveEdge()
{
var nextCharacterOnEdge = Word[ActiveEdge.StartIndex + DistanceIntoActiveEdge];
if(nextCharacterOnEdge == LastCharacterOfCurrentSuffix)
{
Console.WriteLine("The next character on the current edge is '{0}' (suffix added implicitly)", LastCharacterOfCurrentSuffix);
DistanceIntoActiveEdge++;
Console.WriteLine(" => DistanceIntoActiveEdge is now: {0}", DistanceIntoActiveEdge);
NormalizeActivePointIfNowAtOrBeyondEdgeBoundary(ActiveEdge.StartIndex);
return false;
}
SplitActiveEdge();
ActiveEdge.Tail.AddNewEdge();
UpdateActivePointAfterAddingNewEdge();
return true;
}
private void UpdateActivePointAfterAddingNewEdge()
{
if(ReferenceEquals(ActiveNode, RootNode))
{
if(DistanceIntoActiveEdge > 0)
{
DistanceIntoActiveEdge--;
var firstIndex = ActiveEdge.StartIndex;
ActiveEdge = DistanceIntoActiveEdge == 0 ? null : ActiveNode.Edges[Word[CurrentSuffixStartIndex + 1]];
NormalizeActivePointIfNowAtOrBeyondEdgeBoundary(firstIndex);
}
}
else
UpdateActivePointToLinkedNodeOrRoot();
}
private void NormalizeActivePointIfNowAtOrBeyondEdgeBoundary(int firstIndexOfOriginalActiveEdge)
{
var walkDistance = 0;
while(ActiveEdge != null && DistanceIntoActiveEdge >= ActiveEdge.Length)
{
Console.WriteLine("Active point is now at or beyond edge boundary and will be moved until it falls inside an edge boundary");
DistanceIntoActiveEdge -= ActiveEdge.Length;
ActiveNode = ActiveEdge.Tail;
if(DistanceIntoActiveEdge == 0)
ActiveEdge = null;
else
{
walkDistance += ActiveEdge.Length;
var c = Word[firstIndexOfOriginalActiveEdge + walkDistance];
ActiveEdge = ActiveNode.Edges[c];
}
}
}
private void SplitActiveEdge()
{
ActiveEdge = ActiveEdge.SplitAtIndex(ActiveEdge.StartIndex + DistanceIntoActiveEdge);
Console.WriteLine(" => ActiveEdge is now: {0}", ActiveEdge);
if(LastCreatedNodeInCurrentIteration != null)
{
LastCreatedNodeInCurrentIteration.LinkedNode = ActiveEdge.Tail;
Console.WriteLine(" => Connected {0} to {1}", LastCreatedNodeInCurrentIteration, ActiveEdge.Tail);
}
LastCreatedNodeInCurrentIteration = ActiveEdge.Tail;
}
private void UpdateActivePointToLinkedNodeOrRoot()
{
Console.WriteLine("The linked node for active node {0} is {1}", ActiveNode, ActiveNode.LinkedNode == null ? "[null]" : ActiveNode.LinkedNode.ToString());
if(ActiveNode.LinkedNode != null)
{
ActiveNode = ActiveNode.LinkedNode;
Console.WriteLine(" => ActiveNode is now: {0}", ActiveNode);
}
else
{
ActiveNode = RootNode;
}
if(ActiveEdge != null)
{
var firstIndexOfOriginalActiveEdge = ActiveEdge.StartIndex;
ActiveEdge = ActiveNode.Edges[Word[ActiveEdge.StartIndex]];
NormalizeActivePointIfNowAtOrBeyondEdgeBoundary(firstIndexOfOriginalActiveEdge);
}
}
public string RenderTree()
{
var writer = new StringWriter();
RootNode.RenderTree(writer, "");
return writer.ToString();
}
public class Edge
{
private readonly SuffixTree _tree;
public Edge(SuffixTree tree, Node head)
{
_tree = tree;
Head = head;
StartIndex = tree.CurrentSuffixEndIndex;
}
public Node Head { get; private set; }
public Node Tail { get; private set; }
public int StartIndex { get; private set; }
public int? EndIndex { get; set; }
public int Length { get { return (EndIndex ?? _tree.Word.Length - 1) - StartIndex + 1; } }
public Edge SplitAtIndex(int index)
{
Console.WriteLine("Splitting edge {0} at index {1} ('{2}')", this, index, _tree.Word[index]);
var newEdge = new Edge(_tree, Head);
var newNode = new Node(_tree);
newEdge.Tail = newNode;
newEdge.StartIndex = StartIndex;
newEdge.EndIndex = index - 1;
Head = newNode;
StartIndex = index;
newNode.Edges.Add(_tree.Word[StartIndex], this);
newEdge.Head.Edges[_tree.Word[newEdge.StartIndex]] = newEdge;
Console.WriteLine(" => Hierarchy is now: {0} --> {1} --> {2} --> {3}", newEdge.Head, newEdge, newNode, this);
return newEdge;
}
public override string ToString()
{
return string.Concat(_tree.Word.Substring(StartIndex, (EndIndex ?? _tree.CurrentSuffixEndIndex) - StartIndex + 1), "(",
StartIndex, ",", EndIndex.HasValue ? EndIndex.ToString() : "#", ")");
}
public string String
{
get { return _tree.Word.Substring(StartIndex, (EndIndex ?? _tree.Word.Length - 1) - StartIndex + 1); }
}
public void RenderTree(TextWriter writer, string prefix, int maxEdgeLength)
{
var strEdge = _tree.Word.Substring(StartIndex, (EndIndex ?? _tree.CurrentSuffixEndIndex) - StartIndex + 1);
writer.Write(strEdge);
if(Tail == null)
writer.WriteLine();
else
{
var line = new string(RenderChars.HorizontalLine, maxEdgeLength - strEdge.Length + 1);
writer.Write(line);
Tail.RenderTree(writer, string.Concat(prefix, new string(' ', strEdge.Length + line.Length)));
}
}
}
public class Node
{
private readonly SuffixTree _tree;
public Node(SuffixTree tree)
{
_tree = tree;
Edges = new Dictionary<char, Edge>();
NodeNumber = _tree.NextNodeNumber++;
}
public Dictionary<char, Edge> Edges { get; private set; }
public Node LinkedNode { get; set; }
public int NodeNumber { get; private set; }
public void AddNewEdge()
{
Console.WriteLine("Adding new edge to {0}", this);
var edge = new Edge(_tree, this);
Edges.Add(_tree.Word[_tree.CurrentSuffixEndIndex], edge);
Console.WriteLine(" => {0} --> {1}", this, edge);
}
public void RenderTree(TextWriter writer, string prefix)
{
var strNode = string.Concat("(", NodeNumber.ToString(new string('0', _tree.NextNodeNumber.ToString().Length)), ")");
writer.Write(strNode);
var edges = Edges.Select(kvp => kvp.Value).OrderBy(e => _tree.Word[e.StartIndex]).ToArray();
if(edges.Any())
{
var prefixWithNodePadding = prefix + new string(' ', strNode.Length);
var maxEdgeLength = edges.Max(e => (e.EndIndex ?? _tree.CurrentSuffixEndIndex) - e.StartIndex + 1);
for(var i = 0; i < edges.Length; i++)
{
char connector, extender = ' ';
if(i == 0)
{
if(edges.Length > 1)
{
connector = RenderChars.TJunctionDown;
extender = RenderChars.VerticalLine;
}
else
connector = RenderChars.HorizontalLine;
}
else
{
writer.Write(prefixWithNodePadding);
if(i == edges.Length - 1)
connector = RenderChars.CornerRight;
else
{
connector = RenderChars.TJunctionRight;
extender = RenderChars.VerticalLine;
}
}
writer.Write(string.Concat(connector, RenderChars.HorizontalLine));
var newPrefix = string.Concat(prefixWithNodePadding, extender, ' ');
edges[i].RenderTree(writer, newPrefix, maxEdgeLength);
}
}
}
public override string ToString()
{
return string.Concat("node #", NodeNumber);
}
}
public static class RenderChars
{
public const char TJunctionDown = '┬';
public const char HorizontalLine = '─';
public const char VerticalLine = '│';
public const char TJunctionRight = '├';
public const char CornerRight = '└';
}
}
}
static void Main()
{
var tree = SuffixTree.Create("abcabxabcd");
}
=== ITERATION 0 ===
The next suffix of 'abcabxabcd' to add is '{a}' at indices 0,0
=> ActiveNode: node #0
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 0
Existing edge for node #0 starting with 'a' not found
Adding new edge to node #0
=> node #0 --> a(0,#)
(0)──a
=== ITERATION 1 ===
The next suffix of 'abcabxabcd' to add is '{b}' at indices 1,1
=> ActiveNode: node #0
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 0
Existing edge for node #0 starting with 'b' not found
Adding new edge to node #0
=> node #0 --> b(1,#)
(0)┬─ab
└─b
=== ITERATION 2 ===
The next suffix of 'abcabxabcd' to add is '{c}' at indices 2,2
=> ActiveNode: node #0
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 0
Existing edge for node #0 starting with 'c' not found
Adding new edge to node #0
=> node #0 --> c(2,#)
(0)┬─abc
├─bc
└─c
=== ITERATION 3 ===
The next suffix of 'abcabxabcd' to add is '{a}' at indices 3,3
=> ActiveNode: node #0
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 0
Existing edge for node #0 starting with 'a' found. Values adjusted to:
=> ActiveEdge is now: abca(0,#)
=> DistanceIntoActiveEdge is now: 1
=> UnresolvedSuffixes is now: 0
(0)┬─abca
├─bca
└─ca
=== ITERATION 4 ===
The next suffix of 'abcabxabcd' to add is 'a{b}' at indices 3,4
=> ActiveNode: node #0
=> ActiveEdge: abcab(0,#)
=> DistanceIntoActiveEdge: 1
=> UnresolvedSuffixes: 1
The next character on the current edge is 'b' (suffix added implicitly)
=> DistanceIntoActiveEdge is now: 2
(0)┬─abcab
├─bcab
└─cab
=== ITERATION 5 ===
The next suffix of 'abcabxabcd' to add is 'ab{x}' at indices 3,5
=> ActiveNode: node #0
=> ActiveEdge: abcabx(0,#)
=> DistanceIntoActiveEdge: 2
=> UnresolvedSuffixes: 2
Splitting edge abcabx(0,#) at index 2 ('c')
=> Hierarchy is now: node #0 --> ab(0,1) --> node #1 --> cabx(2,#)
=> ActiveEdge is now: ab(0,1)
Adding new edge to node #1
=> node #1 --> x(5,#)
(0)┬─ab────(1)┬─cabx
│ └─x
├─bcabx
└─cabx
The next suffix of 'abcabxabcd' to add is 'b{x}' at indices 4,5
=> ActiveNode: node #0
=> ActiveEdge: bcabx(1,#)
=> DistanceIntoActiveEdge: 1
=> UnresolvedSuffixes: 1
Splitting edge bcabx(1,#) at index 2 ('c')
=> Hierarchy is now: node #0 --> b(1,1) --> node #2 --> cabx(2,#)
=> ActiveEdge is now: b(1,1)
=> Connected node #1 to node #2
Adding new edge to node #2
=> node #2 --> x(5,#)
(0)┬─ab───(1)┬─cabx
│ └─x
├─b────(2)┬─cabx
│ └─x
└─cabx
The next suffix of 'abcabxabcd' to add is '{x}' at indices 5,5
=> ActiveNode: node #0
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 0
Existing edge for node #0 starting with 'x' not found
Adding new edge to node #0
=> node #0 --> x(5,#)
(0)┬─ab───(1)┬─cabx
│ └─x
├─b────(2)┬─cabx
│ └─x
├─cabx
└─x
=== ITERATION 6 ===
The next suffix of 'abcabxabcd' to add is '{a}' at indices 6,6
=> ActiveNode: node #0
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 0
Existing edge for node #0 starting with 'a' found. Values adjusted to:
=> ActiveEdge is now: ab(0,1)
=> DistanceIntoActiveEdge is now: 1
=> UnresolvedSuffixes is now: 0
(0)┬─ab────(1)┬─cabxa
│ └─xa
├─b─────(2)┬─cabxa
│ └─xa
├─cabxa
└─xa
=== ITERATION 7 ===
The next suffix of 'abcabxabcd' to add is 'a{b}' at indices 6,7
=> ActiveNode: node #0
=> ActiveEdge: ab(0,1)
=> DistanceIntoActiveEdge: 1
=> UnresolvedSuffixes: 1
The next character on the current edge is 'b' (suffix added implicitly)
=> DistanceIntoActiveEdge is now: 2
Active point is now at or beyond edge boundary and will be moved until it falls inside an edge boundary
(0)┬─ab─────(1)┬─cabxab
│ └─xab
├─b──────(2)┬─cabxab
│ └─xab
├─cabxab
└─xab
=== ITERATION 8 ===
The next suffix of 'abcabxabcd' to add is 'ab{c}' at indices 6,8
=> ActiveNode: node #1
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 2
Existing edge for node #1 starting with 'c' found. Values adjusted to:
=> ActiveEdge is now: cabxabc(2,#)
=> DistanceIntoActiveEdge is now: 1
=> UnresolvedSuffixes is now: 2
(0)┬─ab──────(1)┬─cabxabc
│ └─xabc
├─b───────(2)┬─cabxabc
│ └─xabc
├─cabxabc
└─xabc
=== ITERATION 9 ===
The next suffix of 'abcabxabcd' to add is 'abc{d}' at indices 6,9
=> ActiveNode: node #1
=> ActiveEdge: cabxabcd(2,#)
=> DistanceIntoActiveEdge: 1
=> UnresolvedSuffixes: 3
Splitting edge cabxabcd(2,#) at index 3 ('a')
=> Hierarchy is now: node #1 --> c(2,2) --> node #3 --> abxabcd(3,#)
=> ActiveEdge is now: c(2,2)
Adding new edge to node #3
=> node #3 --> d(9,#)
The linked node for active node node #1 is node #2
=> ActiveNode is now: node #2
(0)┬─ab───────(1)┬─c─────(3)┬─abxabcd
│ │ └─d
│ └─xabcd
├─b────────(2)┬─cabxabcd
│ └─xabcd
├─cabxabcd
└─xabcd
The next suffix of 'abcabxabcd' to add is 'bc{d}' at indices 7,9
=> ActiveNode: node #2
=> ActiveEdge: cabxabcd(2,#)
=> DistanceIntoActiveEdge: 1
=> UnresolvedSuffixes: 2
Splitting edge cabxabcd(2,#) at index 3 ('a')
=> Hierarchy is now: node #2 --> c(2,2) --> node #4 --> abxabcd(3,#)
=> ActiveEdge is now: c(2,2)
=> Connected node #3 to node #4
Adding new edge to node #4
=> node #4 --> d(9,#)
The linked node for active node node #2 is [null]
(0)┬─ab───────(1)┬─c─────(3)┬─abxabcd
│ │ └─d
│ └─xabcd
├─b────────(2)┬─c─────(4)┬─abxabcd
│ │ └─d
│ └─xabcd
├─cabxabcd
└─xabcd
The next suffix of 'abcabxabcd' to add is 'c{d}' at indices 8,9
=> ActiveNode: node #0
=> ActiveEdge: cabxabcd(2,#)
=> DistanceIntoActiveEdge: 1
=> UnresolvedSuffixes: 1
Splitting edge cabxabcd(2,#) at index 3 ('a')
=> Hierarchy is now: node #0 --> c(2,2) --> node #5 --> abxabcd(3,#)
=> ActiveEdge is now: c(2,2)
=> Connected node #4 to node #5
Adding new edge to node #5
=> node #5 --> d(9,#)
(0)┬─ab────(1)┬─c─────(3)┬─abxabcd
│ │ └─d
│ └─xabcd
├─b─────(2)┬─c─────(4)┬─abxabcd
│ │ └─d
│ └─xabcd
├─c─────(5)┬─abxabcd
│ └─d
└─xabcd
The next suffix of 'abcabxabcd' to add is '{d}' at indices 9,9
=> ActiveNode: node #0
=> ActiveEdge: none
=> DistanceIntoActiveEdge: 0
=> UnresolvedSuffixes: 0
Existing edge for node #0 starting with 'd' not found
Adding new edge to node #0
=> node #0 --> d(9,#)
(0)┬─ab────(1)┬─c─────(3)┬─abxabcd
│ │ └─d
│ └─xabcd
├─b─────(2)┬─c─────(4)┬─abxabcd
│ │ └─d
│ └─xabcd
├─c─────(5)┬─abxabcd
│ └─d
├─d
└─xabcd
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment