Skip to content

Instantly share code, notes, and snippets.

@cs
Created June 20, 2012 08:34

Revisions

  1. Christoph Schiessl created this gist Jun 20, 2012.
    118 changes: 118 additions & 0 deletions hierachical_clustering.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,118 @@
    require "rubygems"
    gem "activesupport"
    require "active_support/all"

    def manhattenDistance objectA, objectB
    return (objectA.first - objectB.first).abs + (objectA.second - objectB.second).abs
    end

    def allInterClusterObjectDistances clusterA, clusterB
    distances = []
    for a in clusterA
    for b in clusterB
    distances << manhattenDistance(a, b).to_f
    end
    end
    return distances
    end

    def singleLinkDistance clusterA, clusterB
    return allInterClusterObjectDistances(clusterA, clusterB).min
    end

    def averageLinkDistance clusterA, clusterB
    return allInterClusterObjectDistances(clusterA, clusterB).sum / clusterA.size / clusterB.size
    end

    def distanceMatrix clusters
    matrix = []
    for a in clusters.values
    matrix << []
    for b in clusters.values
    matrix.last << (yield a, b)
    end
    end
    return matrix
    end

    def singleLinkDistanceMatrix clusters
    return distanceMatrix clusters do |a, b|
    singleLinkDistance(a, b)
    end
    end

    def averageLinkDistanceMatrix clusters
    return distanceMatrix clusters do |a, b|
    averageLinkDistance(a, b)
    end
    end

    def findClustersWithSmallestDistance distMatrix
    group = []
    minDist = 1.0 / 0 # aka Infintiy

    distMatrix.each_with_index do |row, i|
    row.each_with_index do |dist, j|
    if i != j
    if dist < minDist
    group = [i,j]
    minDist = dist
    elsif dist == minDist
    group += [i,j] if group.include?(i) || group.include?(j)
    end
    end
    end
    end
    return group.uniq
    end

    def mergeClusters(clusters, labels)
    newLabel = labels.join.chars.sort.join
    clusters[newLabel] = []
    labels.each do |l|
    clusters[newLabel] += clusters[l]
    clusters.delete(l)
    end
    return clusters
    end

    # Input Data:
    allClusters = {
    "A" => [[1,1]],
    "B" => [[2,1]],
    "C" => [[1,2]],
    "D" => [[2,2]],
    "E" => [[3,5]],
    "F" => [[3,9]],
    "G" => [[3,10]],
    "H" => [[4,10]],
    "I" => [[4,11]],
    "J" => [[5,10]],
    "K" => [[7,10]],
    "L" => [[10,9]],
    "M" => [[10,6]],
    "N" => [[9,5]],
    "O" => [[10,5]],
    "P" => [[11,5]],
    "Q" => [[9,4]],
    "R" => [[10,4]],
    "S" => [[11,4]],
    "T" => [[10,3]]
    }

    # Core Algorithm (Agglomerative Clustering):
    until allClusters.size == 1
    puts allClusters.keys.inspect + " => "

    # With Single Link Distance Matrix:
    # matrix = singleLinkDistanceMatrix allClusters
    # With Average Link Distance Matrix:
    matrix = averageLinkDistanceMatrix allClusters

    toMerge = findClustersWithSmallestDistance matrix
    toMergeLabels = toMerge.map { |i| allClusters.keys[i] }
    # aLabel, bLabel = allClusters.keys[aIndex], allClusters.keys[bIndex]
    puts "merging #{toMergeLabels.to_sentence} at dist = #{matrix[toMerge.first][toMerge.second]} => "
    allClusters = mergeClusters(allClusters, toMergeLabels)
    end
    puts allClusters.keys.inspect