Created
June 20, 2012 08:34
-
-
Save cs/2958812 to your computer and use it in GitHub Desktop.
Implementation of the Hierachical Clustering Algroithm (as presented in the lecture "Knowledge Discovery in Databases" at LMU Munich) in Ruby.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "rubygems" | |
gem "activesupport" | |
require "active_support/all" | |
def manhattenDistance objectA, objectB | |
return (objectA.first - objectB.first).abs + (objectA.second - objectB.second).abs | |
end | |
def allInterClusterObjectDistances clusterA, clusterB | |
distances = [] | |
for a in clusterA | |
for b in clusterB | |
distances << manhattenDistance(a, b).to_f | |
end | |
end | |
return distances | |
end | |
def singleLinkDistance clusterA, clusterB | |
return allInterClusterObjectDistances(clusterA, clusterB).min | |
end | |
def averageLinkDistance clusterA, clusterB | |
return allInterClusterObjectDistances(clusterA, clusterB).sum / clusterA.size / clusterB.size | |
end | |
def distanceMatrix clusters | |
matrix = [] | |
for a in clusters.values | |
matrix << [] | |
for b in clusters.values | |
matrix.last << (yield a, b) | |
end | |
end | |
return matrix | |
end | |
def singleLinkDistanceMatrix clusters | |
return distanceMatrix clusters do |a, b| | |
singleLinkDistance(a, b) | |
end | |
end | |
def averageLinkDistanceMatrix clusters | |
return distanceMatrix clusters do |a, b| | |
averageLinkDistance(a, b) | |
end | |
end | |
def findClustersWithSmallestDistance distMatrix | |
group = [] | |
minDist = 1.0 / 0 # aka Infintiy | |
distMatrix.each_with_index do |row, i| | |
row.each_with_index do |dist, j| | |
if i != j | |
if dist < minDist | |
group = [i,j] | |
minDist = dist | |
elsif dist == minDist | |
group += [i,j] if group.include?(i) || group.include?(j) | |
end | |
end | |
end | |
end | |
return group.uniq | |
end | |
def mergeClusters(clusters, labels) | |
newLabel = labels.join.chars.sort.join | |
clusters[newLabel] = [] | |
labels.each do |l| | |
clusters[newLabel] += clusters[l] | |
clusters.delete(l) | |
end | |
return clusters | |
end | |
# Input Data: | |
allClusters = { | |
"A" => [[1,1]], | |
"B" => [[2,1]], | |
"C" => [[1,2]], | |
"D" => [[2,2]], | |
"E" => [[3,5]], | |
"F" => [[3,9]], | |
"G" => [[3,10]], | |
"H" => [[4,10]], | |
"I" => [[4,11]], | |
"J" => [[5,10]], | |
"K" => [[7,10]], | |
"L" => [[10,9]], | |
"M" => [[10,6]], | |
"N" => [[9,5]], | |
"O" => [[10,5]], | |
"P" => [[11,5]], | |
"Q" => [[9,4]], | |
"R" => [[10,4]], | |
"S" => [[11,4]], | |
"T" => [[10,3]] | |
} | |
# Core Algorithm (Agglomerative Clustering): | |
until allClusters.size == 1 | |
puts allClusters.keys.inspect + " => " | |
# With Single Link Distance Matrix: | |
# matrix = singleLinkDistanceMatrix allClusters | |
# With Average Link Distance Matrix: | |
matrix = averageLinkDistanceMatrix allClusters | |
toMerge = findClustersWithSmallestDistance matrix | |
toMergeLabels = toMerge.map { |i| allClusters.keys[i] } | |
# aLabel, bLabel = allClusters.keys[aIndex], allClusters.keys[bIndex] | |
puts "merging #{toMergeLabels.to_sentence} at dist = #{matrix[toMerge.first][toMerge.second]} => " | |
allClusters = mergeClusters(allClusters, toMergeLabels) | |
end | |
puts allClusters.keys.inspect |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment