JWS C Library
C language utility library
kmeans.c
Go to the documentation of this file.
00001 /*
00002  * This work is licensed under a Creative Commons 
00003  * Attribution-Noncommercial-Share Alike 3.0 United States License.
00004  * 
00005  *    http://creativecommons.org/licenses/by-nc-sa/3.0/us/
00006  * 
00007  * You are free:
00008  * 
00009  *    to Share - to copy, distribute, display, and perform the work
00010  *    to Remix - to make derivative works
00011  * 
00012  * Under the following conditions:
00013  * 
00014  *    Attribution. You must attribute the work in the manner specified by the
00015  *    author or licensor (but not in any way that suggests that they endorse you
00016  *    or your use of the work).
00017  * 
00018  *    Noncommercial. You may not use this work for commercial purposes.
00019  * 
00020  *    Share Alike. If you alter, transform, or build upon this work, you may
00021  *    distribute the resulting work only under the same or similar license to
00022  *    this one.
00023  * 
00024  * For any reuse or distribution, you must make clear to others the license
00025  * terms of this work. The best way to do this is by including this header.
00026  * 
00027  * Any of the above conditions can be waived if you get permission from the
00028  * copyright holder.
00029  * 
00030  * Apart from the remix rights granted under this license, nothing in this
00031  * license impairs or restricts the author's moral rights.
00032  */
00033 
00034 
00046 #include <jwsc/config.h>
00047 
00048 #include <stdlib.h>
00049 #include <stdio.h>
00050 #include <math.h>
00051 #include <assert.h>
00052 #include <limits.h>
00053 #include <inttypes.h>
00054 
00055 #include "jwsc/vector/vector.h"
00056 #include "jwsc/matrix/matrix.h"
00057 #include "jwsc/prob/pdf.h"
00058 #include "jwsc/stat/kmeans.h"
00059 
00060 
00084 void kmeans_d
00085 (
00086     Matrix_d**      means_out, 
00087     Vector_i32**    members_out, 
00088     const Matrix_d* features,
00089     uint32_t        K,
00090     uint32_t        N
00091 )
00092 {
00093     uint32_t mean, mean2, feature;
00094     uint32_t num_features = features->num_cols;
00095     uint32_t num_records = features->num_rows;
00096     uint32_t n;
00097     int64_t record;
00098 
00099     double objective = INT_MAX;
00100     double delta_objective = 1.0;
00101     double d, sum, min;
00102 
00103     Matrix_d*   means;
00104     Vector_i32* members;
00105     Vector_i32* member_counts;
00106 
00107     assert(N <= floor(features->num_rows / (double)K));
00108 
00109     if (*means_out == NULL)
00110     {
00111         init_kmeans_randomly_d(means_out, features, K);
00112     }
00113     means = *means_out;
00114 
00115     create_zero_vector_i32(members_out, num_records);
00116     members = *members_out;
00117 
00118     member_counts = NULL;
00119 
00120     /* Until there is no change in the means, iterate. */
00121     while (delta_objective > 1.0e-10)
00122     {
00123         /* Assign membership of each record to its closest mean. */
00124         for (record = 0; record < num_records; record++)
00125         {
00126             min = INT_MAX;
00127 
00128             for (mean = 0; mean < K; mean++)
00129             {
00130                 sum = 0.0;
00131 
00132                 for (feature = 0; feature < num_features; feature++)
00133                 {
00134                     d = means->elts[ mean ][ feature ];
00135                     d -= features->elts[ record ][ feature ];
00136                     sum += d*d;
00137                 }
00138 
00139                 if  (sum < min)
00140                 {
00141                     min = sum;
00142                     members->elts[ record ] = mean;
00143                 }
00144             }
00145         }
00146 
00147         create_zero_vector_i32(&member_counts, K);
00148 
00149         /* Ensure each mean has at least N member records. */
00150         for (record = 0; record < num_records; record++) 
00151         {
00152             mean = members->elts[ record ];
00153             member_counts->elts[ mean ]++;
00154         }
00155         for (mean = 0; mean < K; mean++)
00156         {
00157             if (member_counts->elts[ mean ] < N)
00158             {
00159                 n = N - member_counts->elts[ mean ];
00160 
00161                 while (n > 0)
00162                 {
00163                     record = floor(sample_uniform_pdf_d(0, num_records*0.9999));
00164                     mean2 = members->elts[ record ];
00165 
00166                     if (member_counts->elts[ mean2 ] > N)
00167                     {
00168                         members->elts[ record ] = mean;
00169                         member_counts->elts[ mean2 ]--;
00170                         member_counts->elts[ mean ]++;
00171                         n--;
00172                     }
00173                 }
00174             }
00175 
00176             assert(member_counts->elts[ mean ] >= N);
00177         }
00178 
00179         create_zero_matrix_d(&means, means->num_rows, means->num_cols);
00180 
00181         /* Re-calculate means from their members. */
00182         for (record = 0; record < num_records; record++)
00183         {
00184             mean = members->elts[ record ];
00185 
00186             for (feature = 0; feature < num_features; feature++)
00187             {
00188                 means->elts[ mean ][ feature ] += 
00189                     features->elts[ record ][ feature ];
00190             }
00191         }
00192         for (mean = 0; mean < K; mean++)
00193         {
00194             for (feature = 0; feature < num_features; feature++)
00195             {
00196                 means->elts[ mean ][ feature ] /= member_counts->elts[ mean ];
00197             }
00198         }
00199 
00200         sum = 0.0;
00201 
00202         /* Re-caclculate value of the objective function. */
00203         for (record = 0; record < num_records; record++)
00204         {
00205             mean = members->elts[ record ];
00206 
00207             for (feature = 0; feature < num_features; feature++)
00208             {
00209                 d = means->elts[ mean ][ feature];
00210                 d -= features->elts[ record ][ feature ];
00211                 sum += d*d;
00212             }
00213         }
00214 
00215         /*assert((delta_objective = objective - sum) >= 0);*/
00216         delta_objective = fabs(objective - sum);
00217         objective = sum;
00218 
00219         /*fprintf(stderr, "Objective: %e\n", objective);*/
00220     }
00221 
00222     free_vector_i32(member_counts);
00223 }
00224 
00225 
00236 void init_kmeans_randomly_d
00237 (
00238     Matrix_d**      means_out, 
00239     const Matrix_d* features,
00240     uint32_t        K
00241 )
00242 {
00243     uint32_t mean, feature;
00244     uint32_t num_features = features->num_cols;
00245     uint32_t num_records = features->num_rows;
00246     int64_t  record;
00247     double r;
00248 
00249     Vector_i32* choosen_records = NULL;
00250     Matrix_d*   means;
00251 
00252     create_zero_vector_i32(&choosen_records, num_records);
00253 
00254     create_zero_matrix_d(means_out, K, num_features);
00255     means = *means_out;
00256 
00257     /* Randomly choose k records to initialize the means with. */
00258     for (mean = 0; mean < K; mean++)
00259     {
00260         record = -1;
00261 
00262         while (record < 0)
00263         {
00264             r = rand() / (double)RAND_MAX;
00265             record = floor((num_records-1)*r + 0.5);
00266 
00267             if (!choosen_records->elts[ record ])
00268             {
00269                 choosen_records->elts[ record ] = 1;
00270             }
00271             else
00272             {
00273                 record = -1;
00274             }
00275         }
00276 
00277         for (feature = 0; feature < num_features; feature++)
00278         {
00279             means->elts[ mean ][ feature ] = 
00280                 features->elts[ record ][ feature ];
00281         }
00282     }
00283 
00284     free_vector_i32(choosen_records);
00285 }