JWS C Library
C language utility library
|
00001 /* 00002 * This work is licensed under a Creative Commons 00003 * Attribution-Noncommercial-Share Alike 3.0 United States License. 00004 * 00005 * http://creativecommons.org/licenses/by-nc-sa/3.0/us/ 00006 * 00007 * You are free: 00008 * 00009 * to Share - to copy, distribute, display, and perform the work 00010 * to Remix - to make derivative works 00011 * 00012 * Under the following conditions: 00013 * 00014 * Attribution. You must attribute the work in the manner specified by the 00015 * author or licensor (but not in any way that suggests that they endorse you 00016 * or your use of the work). 00017 * 00018 * Noncommercial. You may not use this work for commercial purposes. 00019 * 00020 * Share Alike. If you alter, transform, or build upon this work, you may 00021 * distribute the resulting work only under the same or similar license to 00022 * this one. 00023 * 00024 * For any reuse or distribution, you must make clear to others the license 00025 * terms of this work. The best way to do this is by including this header. 00026 * 00027 * Any of the above conditions can be waived if you get permission from the 00028 * copyright holder. 00029 * 00030 * Apart from the remix rights granted under this license, nothing in this 00031 * license impairs or restricts the author's moral rights. 00032 */ 00033 00034 00046 #include <jwsc/config.h> 00047 00048 #include <stdlib.h> 00049 #include <stdio.h> 00050 #include <math.h> 00051 #include <assert.h> 00052 #include <limits.h> 00053 #include <inttypes.h> 00054 00055 #include "jwsc/vector/vector.h" 00056 #include "jwsc/matrix/matrix.h" 00057 #include "jwsc/prob/pdf.h" 00058 #include "jwsc/stat/kmeans.h" 00059 00060 00084 void kmeans_d 00085 ( 00086 Matrix_d** means_out, 00087 Vector_i32** members_out, 00088 const Matrix_d* features, 00089 uint32_t K, 00090 uint32_t N 00091 ) 00092 { 00093 uint32_t mean, mean2, feature; 00094 uint32_t num_features = features->num_cols; 00095 uint32_t num_records = features->num_rows; 00096 uint32_t n; 00097 int64_t record; 00098 00099 double objective = INT_MAX; 00100 double delta_objective = 1.0; 00101 double d, sum, min; 00102 00103 Matrix_d* means; 00104 Vector_i32* members; 00105 Vector_i32* member_counts; 00106 00107 assert(N <= floor(features->num_rows / (double)K)); 00108 00109 if (*means_out == NULL) 00110 { 00111 init_kmeans_randomly_d(means_out, features, K); 00112 } 00113 means = *means_out; 00114 00115 create_zero_vector_i32(members_out, num_records); 00116 members = *members_out; 00117 00118 member_counts = NULL; 00119 00120 /* Until there is no change in the means, iterate. */ 00121 while (delta_objective > 1.0e-10) 00122 { 00123 /* Assign membership of each record to its closest mean. */ 00124 for (record = 0; record < num_records; record++) 00125 { 00126 min = INT_MAX; 00127 00128 for (mean = 0; mean < K; mean++) 00129 { 00130 sum = 0.0; 00131 00132 for (feature = 0; feature < num_features; feature++) 00133 { 00134 d = means->elts[ mean ][ feature ]; 00135 d -= features->elts[ record ][ feature ]; 00136 sum += d*d; 00137 } 00138 00139 if (sum < min) 00140 { 00141 min = sum; 00142 members->elts[ record ] = mean; 00143 } 00144 } 00145 } 00146 00147 create_zero_vector_i32(&member_counts, K); 00148 00149 /* Ensure each mean has at least N member records. */ 00150 for (record = 0; record < num_records; record++) 00151 { 00152 mean = members->elts[ record ]; 00153 member_counts->elts[ mean ]++; 00154 } 00155 for (mean = 0; mean < K; mean++) 00156 { 00157 if (member_counts->elts[ mean ] < N) 00158 { 00159 n = N - member_counts->elts[ mean ]; 00160 00161 while (n > 0) 00162 { 00163 record = floor(sample_uniform_pdf_d(0, num_records*0.9999)); 00164 mean2 = members->elts[ record ]; 00165 00166 if (member_counts->elts[ mean2 ] > N) 00167 { 00168 members->elts[ record ] = mean; 00169 member_counts->elts[ mean2 ]--; 00170 member_counts->elts[ mean ]++; 00171 n--; 00172 } 00173 } 00174 } 00175 00176 assert(member_counts->elts[ mean ] >= N); 00177 } 00178 00179 create_zero_matrix_d(&means, means->num_rows, means->num_cols); 00180 00181 /* Re-calculate means from their members. */ 00182 for (record = 0; record < num_records; record++) 00183 { 00184 mean = members->elts[ record ]; 00185 00186 for (feature = 0; feature < num_features; feature++) 00187 { 00188 means->elts[ mean ][ feature ] += 00189 features->elts[ record ][ feature ]; 00190 } 00191 } 00192 for (mean = 0; mean < K; mean++) 00193 { 00194 for (feature = 0; feature < num_features; feature++) 00195 { 00196 means->elts[ mean ][ feature ] /= member_counts->elts[ mean ]; 00197 } 00198 } 00199 00200 sum = 0.0; 00201 00202 /* Re-caclculate value of the objective function. */ 00203 for (record = 0; record < num_records; record++) 00204 { 00205 mean = members->elts[ record ]; 00206 00207 for (feature = 0; feature < num_features; feature++) 00208 { 00209 d = means->elts[ mean ][ feature]; 00210 d -= features->elts[ record ][ feature ]; 00211 sum += d*d; 00212 } 00213 } 00214 00215 /*assert((delta_objective = objective - sum) >= 0);*/ 00216 delta_objective = fabs(objective - sum); 00217 objective = sum; 00218 00219 /*fprintf(stderr, "Objective: %e\n", objective);*/ 00220 } 00221 00222 free_vector_i32(member_counts); 00223 } 00224 00225 00236 void init_kmeans_randomly_d 00237 ( 00238 Matrix_d** means_out, 00239 const Matrix_d* features, 00240 uint32_t K 00241 ) 00242 { 00243 uint32_t mean, feature; 00244 uint32_t num_features = features->num_cols; 00245 uint32_t num_records = features->num_rows; 00246 int64_t record; 00247 double r; 00248 00249 Vector_i32* choosen_records = NULL; 00250 Matrix_d* means; 00251 00252 create_zero_vector_i32(&choosen_records, num_records); 00253 00254 create_zero_matrix_d(means_out, K, num_features); 00255 means = *means_out; 00256 00257 /* Randomly choose k records to initialize the means with. */ 00258 for (mean = 0; mean < K; mean++) 00259 { 00260 record = -1; 00261 00262 while (record < 0) 00263 { 00264 r = rand() / (double)RAND_MAX; 00265 record = floor((num_records-1)*r + 0.5); 00266 00267 if (!choosen_records->elts[ record ]) 00268 { 00269 choosen_records->elts[ record ] = 1; 00270 } 00271 else 00272 { 00273 record = -1; 00274 } 00275 } 00276 00277 for (feature = 0; feature < num_features; feature++) 00278 { 00279 means->elts[ mean ][ feature ] = 00280 features->elts[ record ][ feature ]; 00281 } 00282 } 00283 00284 free_vector_i32(choosen_records); 00285 }