doc/html/kmeans_8c_source.html

00001 /*
00002  * This work is licensed under a Creative Commons
00003  * Attribution-Noncommercial-Share Alike 3.0 United States License.
00004  *
00005  *    http://creativecommons.org/licenses/by-nc-sa/3.0/us/
00006  *
00007  * You are free:
00008  *
00009  *    to Share - to copy, distribute, display, and perform the work
00010  *    to Remix - to make derivative works
00011  *
00012  * Under the following conditions:
00013  *
00014  *    Attribution. You must attribute the work in the manner specified by the
00015  *    author or licensor (but not in any way that suggests that they endorse you
00016  *    or your use of the work).
00017  *
00018  *    Noncommercial. You may not use this work for commercial purposes.
00019  *
00020  *    Share Alike. If you alter, transform, or build upon this work, you may
00021  *    distribute the resulting work only under the same or similar license to
00022  *    this one.
00023  *
00024  * For any reuse or distribution, you must make clear to others the license
00025  * terms of this work. The best way to do this is by including this header.
00026  *
00027  * Any of the above conditions can be waived if you get permission from the
00028  * copyright holder.
00029  *
00030  * Apart from the remix rights granted under this license, nothing in this
00031  * license impairs or restricts the author's moral rights.
00032  */
00033
00034
00046 #include <jwsc/config.h>
00047
00048 #include <stdlib.h>
00049 #include <stdio.h>
00050 #include <math.h>
00051 #include <assert.h>
00052 #include <limits.h>
00053 #include <inttypes.h>
00054
00055 #include "jwsc/vector/vector.h"
00056 #include "jwsc/matrix/matrix.h"
00057 #include "jwsc/prob/pdf.h"
00058 #include "jwsc/stat/kmeans.h"
00059
00060
00084 void kmeans_d
00085 (
00086     Matrix_d**      means_out,
00087     Vector_i32**    members_out,
00088     const Matrix_d* features,
00089     uint32_t        K,
00090     uint32_t        N
00091 )
00092 {
00093     uint32_t mean, mean2, feature;
00094     uint32_t num_features = features->num_cols;
00095     uint32_t num_records = features->num_rows;
00096     uint32_t n;
00097     int64_t record;
00098
00099     double objective = INT_MAX;
00100     double delta_objective = 1.0;
00101     double d, sum, min;
00102
00103     Matrix_d*   means;
00104     Vector_i32* members;
00105     Vector_i32* member_counts;
00106
00107     assert(N <= floor(features->num_rows / (double)K));
00108
00109     if (*means_out == NULL)
00110     {
00111         init_kmeans_randomly_d(means_out, features, K);
00112     }
00113     means = *means_out;
00114
00115     create_zero_vector_i32(members_out, num_records);
00116     members = *members_out;
00117
00118     member_counts = NULL;
00119
00120     /* Until there is no change in the means, iterate. */
00121     while (delta_objective > 1.0e-10)
00122     {
00123         /* Assign membership of each record to its closest mean. */
00124         for (record = 0; record < num_records; record++)
00125         {
00126             min = INT_MAX;
00127
00128             for (mean = 0; mean < K; mean++)
00129             {
00130                 sum = 0.0;
00131
00132                 for (feature = 0; feature < num_features; feature++)
00133                 {
00134                     d = means->elts[ mean ][ feature ];
00135                     d -= features->elts[ record ][ feature ];
00136                     sum += d*d;
00137                 }
00138
00139                 if  (sum < min)
00140                 {
00141                     min = sum;
00142                     members->elts[ record ] = mean;
00143                 }
00144             }
00145         }
00146
00147         create_zero_vector_i32(&member_counts, K);
00148
00149         /* Ensure each mean has at least N member records. */
00150         for (record = 0; record < num_records; record++)
00151         {
00152             mean = members->elts[ record ];
00153             member_counts->elts[ mean ]++;
00154         }
00155         for (mean = 0; mean < K; mean++)
00156         {
00157             if (member_counts->elts[ mean ] < N)
00158             {
00159                 n = N - member_counts->elts[ mean ];
00160
00161                 while (n > 0)
00162                 {
00163                     record = floor(sample_uniform_pdf_d(0, num_records*0.9999));
00164                     mean2 = members->elts[ record ];
00165
00166                     if (member_counts->elts[ mean2 ] > N)
00167                     {
00168                         members->elts[ record ] = mean;
00169                         member_counts->elts[ mean2 ]--;
00170                         member_counts->elts[ mean ]++;
00171                         n--;
00172                     }
00173                 }
00174             }
00175
00176             assert(member_counts->elts[ mean ] >= N);
00177         }
00178
00179         create_zero_matrix_d(&means, means->num_rows, means->num_cols);
00180
00181         /* Re-calculate means from their members. */
00182         for (record = 0; record < num_records; record++)
00183         {
00184             mean = members->elts[ record ];
00185
00186             for (feature = 0; feature < num_features; feature++)
00187             {
00188                 means->elts[ mean ][ feature ] +=
00189                     features->elts[ record ][ feature ];
00190             }
00191         }
00192         for (mean = 0; mean < K; mean++)
00193         {
00194             for (feature = 0; feature < num_features; feature++)
00195             {
00196                 means->elts[ mean ][ feature ] /= member_counts->elts[ mean ];
00197             }
00198         }
00199
00200         sum = 0.0;
00201
00202         /* Re-caclculate value of the objective function. */
00203         for (record = 0; record < num_records; record++)
00204         {
00205             mean = members->elts[ record ];
00206
00207             for (feature = 0; feature < num_features; feature++)
00208             {
00209                 d = means->elts[ mean ][ feature];
00210                 d -= features->elts[ record ][ feature ];
00211                 sum += d*d;
00212             }
00213         }
00214
00215         /*assert((delta_objective = objective - sum) >= 0);*/
00216         delta_objective = fabs(objective - sum);
00217         objective = sum;
00218
00219         /*fprintf(stderr, "Objective: %e\n", objective);*/
00220     }
00221
00222     free_vector_i32(member_counts);
00223 }
00224
00225
00236 void init_kmeans_randomly_d
00237 (
00238     Matrix_d**      means_out,
00239     const Matrix_d* features,
00240     uint32_t        K
00241 )
00242 {
00243     uint32_t mean, feature;
00244     uint32_t num_features = features->num_cols;
00245     uint32_t num_records = features->num_rows;
00246     int64_t  record;
00247     double r;
00248
00249     Vector_i32* choosen_records = NULL;
00250     Matrix_d*   means;
00251
00252     create_zero_vector_i32(&choosen_records, num_records);
00253
00254     create_zero_matrix_d(means_out, K, num_features);
00255     means = *means_out;
00256
00257     /* Randomly choose k records to initialize the means with. */
00258     for (mean = 0; mean < K; mean++)
00259     {
00260         record = -1;
00261
00262         while (record < 0)
00263         {
00264             r = rand() / (double)RAND_MAX;
00265             record = floor((num_records-1)*r + 0.5);
00266
00267             if (!choosen_records->elts[ record ])
00268             {
00269                 choosen_records->elts[ record ] = 1;
00270             }
00271             else
00272             {
00273                 record = -1;
00274             }
00275         }
00276
00277         for (feature = 0; feature < num_features; feature++)
00278         {
00279             means->elts[ mean ][ feature ] =
00280                 features->elts[ record ][ feature ];
00281         }
00282     }
00283
00284     free_vector_i32(choosen_records);
00285 }