Last Updated on October 3, 2014 by nghiaho12
Googling for K-Means for Octave brought me to this Matlab/Octave script http://www.christianherta.de/kmeans.html by Christian Herta. Works great, except that it ran very slow. Here is my improved version to run faster, most of the expensive for loops have been replaced with faster ones. The new version runs orders of magnitude faster.
 
function[centroid, pointsInCluster, assignment]= myKmeans(data, nbCluster)
% usage
% function[centroid, pointsInCluster, assignment]=
% myKmeans(data, nbCluster)
%
% Output:
% centroid: matrix in each row are the Coordinates of a centroid
% pointsInCluster: row vector with the nbDatapoints belonging to
% the centroid
% assignment: row Vector with clusterAssignment of the dataRows
%
% Input:
% data in rows
% nbCluster : nb of centroids to determine
%
% (c) by Christian Herta ( www.christianherta.de )
% Modified by Nghia Ho to improve speed
data_dim = length(data(1,:));
nbData   = length(data(:,1));
% init the centroids randomly
data_min = min(data);
data_max = max(data);
data_diff = data_max .- data_min ;
% every row is a centroid
centroid = rand(nbCluster, data_dim);
centroid = centroid .* repmat(data_diff, nbCluster, 1) + repmat(data_min, nbCluster, 1);
% no stopping at start
pos_diff = 1.;
% main loop until
while pos_diff > 0.0
  % E-Step
  assignment = [];
  % assign each datapoint to the closest centroid
  if(nbCluster == 1) % special case
	assignment = ones(size(data,1), 1);
  else
	  dists = [];
	  for c = 1: nbCluster
		d = data - repmat(centroid(c,:), size(data,1), 1);
		d = d .* d;
		d = sum(d, 2); % sum the row values
		dists = [dists d];
	  end
	  [a, assignment] = min(dists');
	  assignment = assignment';
  end
  % for the stoppingCriterion
  oldPositions = centroid;
  % M-Step
  % recalculate the positions of the centroids
  centroid = zeros(nbCluster, data_dim);
  pointsInCluster = zeros(nbCluster, 1);
  for c = 1: nbCluster
	indexes = find(assignment == c);
	d = data(indexes,:);
	centroid(c,:) = sum(d,1);
	pointsInCluster(c,1) = size(d,1);
    if( pointsInCluster(c, 1) != 0)
      centroid( c , : ) = centroid( c, : ) / pointsInCluster(c, 1);
    else
      % set cluster randomly to new position
      centroid( c , : ) = (rand( 1, data_dim) .* data_diff) + data_min;
    end
  end
  %stoppingCriterion
  pos_diff = sum (sum( (centroid .- oldPositions).^2 ) );
end
end