jurgenvinju · January 28, 2023 13:24
diff --git a/KMeans.rsc b/KMeans.rsc
 module analysis::learning::KMeans

 import util::Math;
 import List;
 import Set;
 import IO;

 data Point = point(list[real] vec);

 Point arbPoint(int d) = point([arbReal() | _ <- [0..d]]);

 private real sqr(real r) = r * r;

 @doc{
 .Synopsis A measure of distance (the square of the Euclidean distance in fact)
 .Description

 We compute pairwise the square of the absolute distance of each dimension to arrive
 at the square of the Euclidian distance between two n-dimensional points.

 The dist function is robust against points with different dimensionalities and will
 fill the shorter point with .0 values until the dimensions are the same.
 }
 real dist(Point p, Point q)
  = (.0 | it + sqr(abs((p.vec[i]?.0) - (q.vec[i]?.0))) | i <- [0..max(size(p.vec),size(q.vec))]);
     
 @doc{
 .Synopsis: computes the average of all dimensions of all points
 .Description:
 A centroid is the "average" point of a set of points, composed as the
 pairwise mean value of all points for each dimenions.
 }
 @memo  
 private Point centroid(set[Point] cl:{Point h, *_}) =
  point([ sum([p.vec[i] | p <- cl]) / size(cl) | i <- index(h.vec)]); 

 alias Clusters = set[set[Point]];

 @doc{
 .Synopsis: simple partition simply breaks the input up in almost even parts, in order of appearance
 .Description:
 This partioning algorithm serves as a demonstration of what an initial partioning in clusters
 should do before the kmeans algorithm is applied.

 The current algorithm is not very smart and will lead to longer running times if kmeans
 is applied to it, because the chances are that many points have to be moved to new clusters
 in many iterations before the optimum is reached.

 The advice is to create your own partioning function. Such a function would first guess
 a set of initial centroids using any distribution (using knowledge on the input data), 
 and then sample efficiently sample around these centroids using another 
 distribution function.
 }
 Clusters simplePartition([], 0)             = {};
 Clusters simplePartition(list[Point] cl, 1) = {{*cl}};
 default Clusters simplePartition(list[Point] cl, int k) 
  = {{*cl[..s/k]}, *simplePartition(cl[s/k..], k - 1)} 
  when s := size(cl); 


 @doc{
 .Synopsis Kmeans optimizes the initial arbitrary clustering by reassigning points to a nearest averaged middle point for each cluster (centroid).
 .Description

 The efficiency of kmeans is greatly influenced by the initial clustering provided. The closer
 the initial cluster is to the optimum which kmeans computes, the fewer steps it takes to reach it.

 Kmeans will return no more clusters than the initial clustering, but it may return fewer
 clusters. This may happen when the final elements in a cluster are closer to another centroid
 than they were to the previous centroid of that cluster: then one cluster completely eats up another.
 It's an indication that the initial amount of clusters was not appropriate for the input data, 
 but it can also happen accidentally in weird cases when the initial clustering is completely 
 unbalanced.
 }
 Clusters kmeans(Clusters prop) {
  // we assign each point to a cluster, identified by its current centroid
  assignment = {<c, p> | cl <- prop, c := centroid(cl), p <- cl};
  
  solve (assignment) {
    // the current centroids are used to index each cluster for each iteration
    centroids  = assignment<0>;
    
    // we find which other cluster is currently closer for each point in each cluster:
    movement   = {<(c | dist(d, p) < dist(it, p) ? d : it | d <- centroids), p> | c <- centroids, p <- assignment[c]};
    
    // then we compute for each new cluster what the new centroid should be
    assignment = {<nc, p>  | c <- centroids, cl := movement[c], cl != {}, nc := centroid(cl), p <- cl};
  }
  
  // drop the centroids and simply return a set of clusters
  return { assignment[cl] | cl <- assignment<0>};
 }
	module analysis::learning::KMeans

	import util::Math;
	import List;
	import Set;
	import IO;

	data Point = point(list[real] vec);

	Point arbPoint(int d) = point([arbReal() \| _ <- [0..d]]);

	private real sqr(real r) = r * r;

	@doc{
	.Synopsis A measure of distance (the square of the Euclidean distance in fact)
	.Description

	We compute pairwise the square of the absolute distance of each dimension to arrive
	at the square of the Euclidian distance between two n-dimensional points.

	The dist function is robust against points with different dimensionalities and will
	fill the shorter point with .0 values until the dimensions are the same.
	}
	real dist(Point p, Point q)
	= (.0 \| it + sqr(abs((p.vec[i]?.0) - (q.vec[i]?.0))) \| i <- [0..max(size(p.vec),size(q.vec))]);

	@doc{
	.Synopsis: computes the average of all dimensions of all points
	.Description:
	A centroid is the "average" point of a set of points, composed as the
	pairwise mean value of all points for each dimenions.
	}
	@memo
	private Point centroid(set[Point] cl:{Point h, *_}) =
	point([ sum([p.vec[i] \| p <- cl]) / size(cl) \| i <- index(h.vec)]);

	alias Clusters = set[set[Point]];

	@doc{
	.Synopsis: simple partition simply breaks the input up in almost even parts, in order of appearance
	.Description:
	This partioning algorithm serves as a demonstration of what an initial partioning in clusters
	should do before the kmeans algorithm is applied.

	The current algorithm is not very smart and will lead to longer running times if kmeans
	is applied to it, because the chances are that many points have to be moved to new clusters
	in many iterations before the optimum is reached.

	The advice is to create your own partioning function. Such a function would first guess
	a set of initial centroids using any distribution (using knowledge on the input data),
	and then sample efficiently sample around these centroids using another
	distribution function.
	}
	Clusters simplePartition([], 0) = {};
	Clusters simplePartition(list[Point] cl, 1) = {{*cl}};
	default Clusters simplePartition(list[Point] cl, int k)
	= {{cl[..s/k]}, simplePartition(cl[s/k..], k - 1)}
	when s := size(cl);


	@doc{
	.Synopsis Kmeans optimizes the initial arbitrary clustering by reassigning points to a nearest averaged middle point for each cluster (centroid).
	.Description

	The efficiency of kmeans is greatly influenced by the initial clustering provided. The closer
	the initial cluster is to the optimum which kmeans computes, the fewer steps it takes to reach it.

	Kmeans will return no more clusters than the initial clustering, but it may return fewer
	clusters. This may happen when the final elements in a cluster are closer to another centroid
	than they were to the previous centroid of that cluster: then one cluster completely eats up another.
	It's an indication that the initial amount of clusters was not appropriate for the input data,
	but it can also happen accidentally in weird cases when the initial clustering is completely
	unbalanced.
	}
	Clusters kmeans(Clusters prop) {
	// we assign each point to a cluster, identified by its current centroid
	assignment = {<c, p> \| cl <- prop, c := centroid(cl), p <- cl};

	solve (assignment) {
	// the current centroids are used to index each cluster for each iteration
	centroids = assignment<0>;

	// we find which other cluster is currently closer for each point in each cluster:
	movement = {<(c \| dist(d, p) < dist(it, p) ? d : it \| d <- centroids), p> \| c <- centroids, p <- assignment[c]};

	// then we compute for each new cluster what the new centroid should be
	assignment = {<nc, p> \| c <- centroids, cl := movement[c], cl != {}, nc := centroid(cl), p <- cl};
	}

	// drop the centroids and simply return a set of clusters
	return { assignment[cl] \| cl <- assignment<0>};
	}