Skip to content

Instantly share code, notes, and snippets.

@cs
Created June 20, 2012 08:34
Show Gist options
  • Save cs/2958812 to your computer and use it in GitHub Desktop.
Save cs/2958812 to your computer and use it in GitHub Desktop.
Implementation of the Hierachical Clustering Algroithm (as presented in the lecture "Knowledge Discovery in Databases" at LMU Munich) in Ruby.
require "rubygems"
gem "activesupport"
require "active_support/all"
def manhattenDistance objectA, objectB
return (objectA.first - objectB.first).abs + (objectA.second - objectB.second).abs
end
def allInterClusterObjectDistances clusterA, clusterB
distances = []
for a in clusterA
for b in clusterB
distances << manhattenDistance(a, b).to_f
end
end
return distances
end
def singleLinkDistance clusterA, clusterB
return allInterClusterObjectDistances(clusterA, clusterB).min
end
def averageLinkDistance clusterA, clusterB
return allInterClusterObjectDistances(clusterA, clusterB).sum / clusterA.size / clusterB.size
end
def distanceMatrix clusters
matrix = []
for a in clusters.values
matrix << []
for b in clusters.values
matrix.last << (yield a, b)
end
end
return matrix
end
def singleLinkDistanceMatrix clusters
return distanceMatrix clusters do |a, b|
singleLinkDistance(a, b)
end
end
def averageLinkDistanceMatrix clusters
return distanceMatrix clusters do |a, b|
averageLinkDistance(a, b)
end
end
def findClustersWithSmallestDistance distMatrix
group = []
minDist = 1.0 / 0 # aka Infintiy
distMatrix.each_with_index do |row, i|
row.each_with_index do |dist, j|
if i != j
if dist < minDist
group = [i,j]
minDist = dist
elsif dist == minDist
group += [i,j] if group.include?(i) || group.include?(j)
end
end
end
end
return group.uniq
end
def mergeClusters(clusters, labels)
newLabel = labels.join.chars.sort.join
clusters[newLabel] = []
labels.each do |l|
clusters[newLabel] += clusters[l]
clusters.delete(l)
end
return clusters
end
# Input Data:
allClusters = {
"A" => [[1,1]],
"B" => [[2,1]],
"C" => [[1,2]],
"D" => [[2,2]],
"E" => [[3,5]],
"F" => [[3,9]],
"G" => [[3,10]],
"H" => [[4,10]],
"I" => [[4,11]],
"J" => [[5,10]],
"K" => [[7,10]],
"L" => [[10,9]],
"M" => [[10,6]],
"N" => [[9,5]],
"O" => [[10,5]],
"P" => [[11,5]],
"Q" => [[9,4]],
"R" => [[10,4]],
"S" => [[11,4]],
"T" => [[10,3]]
}
# Core Algorithm (Agglomerative Clustering):
until allClusters.size == 1
puts allClusters.keys.inspect + " => "
# With Single Link Distance Matrix:
# matrix = singleLinkDistanceMatrix allClusters
# With Average Link Distance Matrix:
matrix = averageLinkDistanceMatrix allClusters
toMerge = findClustersWithSmallestDistance matrix
toMergeLabels = toMerge.map { |i| allClusters.keys[i] }
# aLabel, bLabel = allClusters.keys[aIndex], allClusters.keys[bIndex]
puts "merging #{toMergeLabels.to_sentence} at dist = #{matrix[toMerge.first][toMerge.second]} => "
allClusters = mergeClusters(allClusters, toMergeLabels)
end
puts allClusters.keys.inspect
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment