Haplo Prediction
predict haplogroups
|
00001 /* 00002 * This work is licensed under a Creative Commons 00003 * Attribution-Noncommercial-Share Alike 3.0 United States License. 00004 * 00005 * http://creativecommons.org/licenses/by-nc-sa/3.0/us/ 00006 * 00007 * You are free: 00008 * 00009 * to Share - to copy, distribute, display, and perform the work 00010 * to Remix - to make derivative works 00011 * 00012 * Under the following conditions: 00013 * 00014 * Attribution. You must attribute the work in the manner specified by the 00015 * author or licensor (but not in any way that suggests that they endorse you 00016 * or your use of the work). 00017 * 00018 * Noncommercial. You may not use this work for commercial purposes. 00019 * 00020 * Share Alike. If you alter, transform, or build upon this work, you may 00021 * distribute the resulting work only under the same or similar license to 00022 * this one. 00023 * 00024 * For any reuse or distribution, you must make clear to others the license 00025 * terms of this work. The best way to do this is by including this header. 00026 * 00027 * Any of the above conditions can be waived if you get permission from the 00028 * copyright holder. 00029 * 00030 * Apart from the remix rights granted under this license, nothing in this 00031 * license impairs or restricts the author's moral rights. 00032 */ 00033 00034 00049 #include <config.h> 00050 00051 #include <stdlib.h> 00052 #include <stdio.h> 00053 #include <assert.h> 00054 #include <inttypes.h> 00055 #include <string.h> 00056 #include <math.h> 00057 00058 #include <libxml/tree.h> 00059 00060 #ifdef HAPLO_HAVE_DMALLOC 00061 #include <dmalloc.h> 00062 #endif 00063 00064 #include <jwsc/base/error.h> 00065 #include <jwsc/base/option.h> 00066 #include <jwsc/base/limits.h> 00067 #include <jwsc/vector/vector.h> 00068 #include <jwsc/matrix/matrix.h> 00069 #include <jwsc/matrix/matrix_io.h> 00070 #include <jwsc/matblock/matblock.h> 00071 #include <jwsc/stat/kmeans.h> 00072 #include <jwsc/stat/gmm.h> 00073 #include <jwsc/stat/mmm.h> 00074 00075 #include "haplo_groups.h" 00076 #include "options.h" 00077 #include "output.h" 00078 #include "input.h" 00079 #include "xml.h" 00080 00081 00082 #define NUM_OPTS_NO_ARG 0 + NUM_SHARED_OPTS_NO_ARG 00083 #define NUM_OPTS_WITH_ARG 12 + NUM_SHARED_OPTS_WITH_ARG 00084 00085 00086 #define LABEL_COL 0 00087 #define NUM_CLUSTERS 4 00088 #define CLUSTER_TYPE HAPLO_CLUSTER_KMEANS 00089 #define MEANS_OUT_FNAME "/dev/null" 00090 #define WEIGHTS_OUT_FNAME "/dev/null" 00091 #define RESPONSES_OUT_FNAME "/dev/null" 00092 #define MEMBERS_OUT_FNAME "/dev/stdout" 00093 00094 00096 typedef enum 00097 { 00098 HAPLO_CLUSTER_KMEANS, 00099 HAPLO_CLUSTER_GMM, 00100 HAPLO_CLUSTER_MMM 00101 } 00102 Haplo_cluster_type; 00103 00104 00106 Option_no_arg opts_no_arg[NUM_OPTS_NO_ARG]; 00107 00109 Option_with_arg opts_with_arg[NUM_OPTS_WITH_ARG]; 00110 00112 static uint32_t num_clusters = NUM_CLUSTERS; 00113 00115 static Haplo_cluster_type cluster_type = CLUSTER_TYPE; 00116 00118 static const char* means_out_fname = MEANS_OUT_FNAME; 00119 00121 static const char* weights_out_fname = WEIGHTS_OUT_FNAME; 00122 00124 static const char* responses_out_fname = RESPONSES_OUT_FNAME; 00125 00127 static const char* members_out_fname = MEMBERS_OUT_FNAME; 00128 00129 00131 uint32_t get_num_opts_no_arg() 00132 { 00133 return NUM_OPTS_NO_ARG; 00134 } 00135 00137 uint32_t get_num_opts_with_arg() 00138 { 00139 return NUM_OPTS_WITH_ARG; 00140 } 00141 00143 void print_usage() 00144 { 00145 fprintf(stderr, "usage: haplo-cluster OPTIONS [data-fname | <stdin>]\n"); 00146 print_options(stderr, 27, NUM_OPTS_NO_ARG, opts_no_arg, NUM_OPTS_WITH_ARG, 00147 opts_with_arg); 00148 } 00149 00151 static Error* process_num_clusters_opt(Option_arg arg) 00152 { 00153 if (arg == NULL) 00154 { 00155 return JWSC_EARG("Option 'num-clusters' requires an argument"); 00156 } 00157 if (sscanf(arg, "%u", &num_clusters) != 1 || num_clusters < 1) 00158 { 00159 return JWSC_EARG("Option 'num-clusters' must be > 0"); 00160 } 00161 return NULL; 00162 } 00163 00165 static Error* process_cluster_type_opt(Option_arg arg) 00166 { 00167 if (arg == NULL) 00168 { 00169 return JWSC_EARG("Option 'cluster-type' requires an argument"); 00170 } 00171 if (strncmp(arg, "kmeans", 6) == 0) 00172 { 00173 cluster_type = HAPLO_CLUSTER_KMEANS; 00174 } 00175 else if (strncmp(arg, "gmm", 3) == 0) 00176 { 00177 cluster_type = HAPLO_CLUSTER_GMM; 00178 } 00179 else if (strncmp(arg, "mmm", 3) == 0) 00180 { 00181 cluster_type = HAPLO_CLUSTER_MMM; 00182 } 00183 else 00184 { 00185 return JWSC_EARG("Option 'cluster-type' must be one of {kmeans,gmm,mmm}"); 00186 } 00187 return NULL; 00188 } 00189 00191 Error* process_means_out_opt(Option_arg arg) 00192 { 00193 if (arg == NULL) 00194 { 00195 return JWSC_EARG("Option 'means-out' requires an argument"); 00196 } 00197 means_out_fname = arg; 00198 return NULL; 00199 } 00200 00202 Error* process_weights_out_opt(Option_arg arg) 00203 { 00204 if (arg == NULL) 00205 { 00206 return JWSC_EARG("Option 'weights-out' requires an argument"); 00207 } 00208 weights_out_fname = arg; 00209 return NULL; 00210 } 00211 00213 Error* process_responses_out_opt(Option_arg arg) 00214 { 00215 if (arg == NULL) 00216 { 00217 return JWSC_EARG("Option 'responses-out' requires an argument"); 00218 } 00219 responses_out_fname = arg; 00220 return NULL; 00221 } 00222 00224 Error* process_members_out_opt(Option_arg arg) 00225 { 00226 if (arg == NULL) 00227 { 00228 return JWSC_EARG("Option 'members-out' requires an argument"); 00229 } 00230 members_out_fname = arg; 00231 return NULL; 00232 } 00233 00235 static void init_cluster_options() 00236 { 00237 uint32_t i; 00238 00239 char s_name; 00240 const char* l_name; 00241 const char* desc; 00242 00243 Error* (*farg)(const char*); 00244 00245 init_options(opts_no_arg, opts_with_arg); 00246 00247 opts.label_col = LABEL_COL; 00248 00249 i = NUM_SHARED_OPTS_WITH_ARG; 00250 l_name = "aux-input"; 00251 s_name = 0; 00252 desc = "Auxiliary input file. Provides marker information used for imputing missing values."; 00253 farg = process_aux_input_opt; 00254 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00255 00256 l_name = "aux-id-cols"; 00257 s_name = 0; 00258 desc = "Auxiliary input id-cols. Comma separated ordered list of columns to use for sample identification. Prefixes the output of each sample. Count begins with 1 at the first column of the file. Set to zero to ignore the id column"; 00259 farg = process_aux_id_cols_opt; 00260 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00261 00262 l_name = "aux-label-col"; 00263 s_name = 0; 00264 desc = "Auxiliary input label-col. Column containing the haplo group labels. Count begins with 1 at the first column of the file. Set to zero to ignore the label column."; 00265 farg = process_aux_label_col_opt; 00266 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00267 00268 l_name = "aux-1st-marker-col"; 00269 s_name = 0; 00270 desc = "Auxiliary input 1st-marker-col. Column containing the first marker. Use in conjunction with num-markers to specify the markers for reading. All other markers are assumed to follow this one. Count begins with 1."; 00271 farg = process_aux_first_marker_col_opt; 00272 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00273 00274 l_name = "aux-num-markers"; 00275 s_name = 0; 00276 desc = "Auxiliary input num-markers. Number of markers to read. Use in conjunction with 1st-marker-col to specify the markers for reading."; 00277 farg = process_aux_num_markers_opt; 00278 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00279 00280 l_name = "aux-marker-cols"; 00281 s_name = 0; 00282 desc = "Auxiliary input marker-cols. Comma separated ordered list of markers to use for training. Use instead of 1st-marker-col and num-markers. Count begins with 1 at the first column of the CSV file."; 00283 farg = process_aux_marker_cols_opt; 00284 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00285 00286 l_name = "num-clusters"; 00287 s_name = 'k'; 00288 desc = "Number of clusters to use in the k-means algorithm."; 00289 farg = process_num_clusters_opt; 00290 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00291 00292 l_name = "cluster-type"; 00293 s_name = 0; 00294 desc = "Type of clustering to use. Must be one of {kmeans,gmm,mmm}."; 00295 farg = process_cluster_type_opt; 00296 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00297 00298 l_name = "means-out"; 00299 s_name = 0; 00300 desc = "Cluster means output file name. The default is no output."; 00301 farg = process_means_out_opt; 00302 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00303 00304 l_name = "weights-out"; 00305 s_name = 0; 00306 desc = "Cluster weights output file name. The default is no output."; 00307 farg = process_weights_out_opt; 00308 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00309 00310 l_name = "responses-out"; 00311 s_name = 0; 00312 desc = "Cluster responses output file name. The default is no output."; 00313 farg = process_responses_out_opt; 00314 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00315 00316 l_name = "members-out"; 00317 s_name = 0; 00318 desc = "Cluster members output file name. The default is stdout."; 00319 farg = process_members_out_opt; 00320 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 00321 assert(i == NUM_OPTS_WITH_ARG); 00322 } 00323 00325 static void write_means(const Matrix_d* means) 00326 { 00327 FILE* fp; 00328 uint32_t i, j; 00329 xmlDoc* xml_doc = NULL; 00330 xmlNode* xml_root = NULL; 00331 xmlNode* xml_node = NULL; 00332 xmlNode* xml_child = NULL; 00333 char xml_buf[256] = {0}; 00334 Error* err; 00335 00336 if ((err = open_output(&fp, &xml_doc, "haplo-cluster-means-out", 00337 "haplo-cluster-means-out.dtd", means_out_fname))) 00338 { 00339 print_error_msg_exit("haplo-cluster", err->msg); 00340 } 00341 00342 if (opts.output_format == HAPLO_OUTPUT_XML) 00343 { 00344 xml_root = xmlDocGetRootElement(xml_doc); 00345 } 00346 00347 for (i = 0; i < means->num_rows; i++) 00348 { 00349 switch (opts.output_format) 00350 { 00351 case HAPLO_OUTPUT_TXT: 00352 fprintf(fp, "%-4d", i+1); 00353 for (j = 0; j < means->num_cols; j++) 00354 { 00355 fprintf(fp, " %6.3f", means->elts[ i ][ j ]); 00356 } 00357 fprintf(fp, "\n"); 00358 break; 00359 case HAPLO_OUTPUT_CSV: 00360 fprintf(fp, "%d", i+1); 00361 for (j = 0; j < means->num_cols; j++) 00362 { 00363 fprintf(fp, ",%.3f", means->elts[ i ][ j ]); 00364 } 00365 fprintf(fp, "\n"); 00366 break; 00367 case HAPLO_OUTPUT_XML: 00368 xml_node = XMLNewChild(xml_root, "mean", NULL); 00369 snprintf(xml_buf, 256, "%d", i+1); 00370 XMLNewProp(xml_node, "number", xml_buf); 00371 for (j = 0; j < means->num_cols; j++) 00372 { 00373 snprintf(xml_buf, 256, "%.3f", means->elts[ i ][ j ]); 00374 xml_child = XMLNewChild(xml_node, "marker", xml_buf); 00375 snprintf(xml_buf, 256, "%d", j+1); 00376 XMLNewProp(xml_child, "number", xml_buf); 00377 } 00378 break; 00379 } 00380 } 00381 00382 if ((err = close_output(fp, xml_doc, means_out_fname))) 00383 { 00384 print_error_msg_exit("haplo-cluster", err->msg); 00385 } 00386 } 00387 00389 static void write_mmm_means(const Matblock_d* means) 00390 { 00391 FILE* fp; 00392 uint32_t i, j, k; 00393 uint32_t n; 00394 xmlDoc* xml_doc = NULL; 00395 xmlNode* xml_root = NULL; 00396 xmlNode* xml_node = NULL; 00397 xmlNode* xml_child = NULL; 00398 char xml_buf[256] = {0}; 00399 Error* err; 00400 00401 if ((err = open_output(&fp, &xml_doc, "haplo-cluster-means-out", 00402 "haplo-cluster-means-out.dtd", means_out_fname))) 00403 { 00404 print_error_msg_exit("haplo-cluster", err->msg); 00405 } 00406 00407 if (opts.output_format == HAPLO_OUTPUT_XML) 00408 { 00409 xml_root = xmlDocGetRootElement(xml_doc); 00410 } 00411 00412 for (i = 0; i < means->num_mats; i++) 00413 { 00414 switch (opts.output_format) 00415 { 00416 case HAPLO_OUTPUT_TXT: 00417 fprintf(fp, "%-4d\n", i+1); 00418 for (j = 0; j < means->num_rows; j++) 00419 { 00420 fprintf(fp, "%-4d", j+1); 00421 for (k = 0; k < means->num_cols; k++) 00422 { 00423 fprintf(fp, " %6.3f", means->elts[ i ][ j ][ k ]); 00424 } 00425 fprintf(fp, "\n"); 00426 } 00427 break; 00428 case HAPLO_OUTPUT_CSV: 00429 fprintf(fp, "%d\n", i+1); 00430 for (j = 0; j < means->num_rows; j++) 00431 { 00432 fprintf(fp, "%d", j+1); 00433 for (k = 0; k < means->num_cols; k++) 00434 { 00435 fprintf(fp, ",%.3f", means->elts[ i ][ j ][ k ]); 00436 } 00437 fprintf(fp, "\n"); 00438 } 00439 break; 00440 case HAPLO_OUTPUT_XML: 00441 xml_node = XMLNewChild(xml_root, "mean", NULL); 00442 snprintf(xml_buf, 256, "%d", i+1); 00443 XMLNewProp(xml_node, "number", xml_buf); 00444 for (j = 0; j < means->num_rows; j++) 00445 { 00446 n = 0; 00447 for (k = 0; k < means->num_cols; k++) 00448 { 00449 n += snprintf(xml_buf+n, 256, "%.3f ", 00450 means->elts[ i ][ j ][ k ]); 00451 assert(n < 256); 00452 } 00453 xml_child = XMLNewChild(xml_node, "marker", xml_buf); 00454 snprintf(xml_buf, 256, "%d", j+1); 00455 XMLNewProp(xml_child, "number", xml_buf); 00456 } 00457 break; 00458 } 00459 } 00460 00461 if ((err = close_output(fp, xml_doc, means_out_fname))) 00462 { 00463 print_error_msg_exit("haplo-cluster", err->msg); 00464 } 00465 } 00466 00468 static void write_members_header 00469 ( 00470 const Matblock_u8* ids, 00471 const Vector_u32* labels, 00472 FILE* fp 00473 ) 00474 { 00475 uint32_t id; 00476 00477 if (!opts.header_out || opts.output_format == HAPLO_OUTPUT_XML) 00478 return; 00479 00480 if (ids) 00481 { 00482 for (id = 0; id < ids->num_rows; id++) 00483 { 00484 switch (opts.output_format) 00485 { 00486 case HAPLO_OUTPUT_TXT: 00487 fprintf(fp, "ID %-7d ", id+1); 00488 break; 00489 case HAPLO_OUTPUT_CSV: 00490 fprintf(fp, "ID %d,", id+1); 00491 break; 00492 case HAPLO_OUTPUT_XML: 00493 break; 00494 } 00495 } 00496 } 00497 00498 if (labels) 00499 { 00500 switch (opts.output_format) 00501 { 00502 case HAPLO_OUTPUT_TXT: 00503 fprintf(fp, "%-10s ", "Label"); 00504 break; 00505 case HAPLO_OUTPUT_CSV: 00506 fprintf(fp, "%s,", "Label"); 00507 break; 00508 case HAPLO_OUTPUT_XML: 00509 break; 00510 } 00511 } 00512 00513 switch (opts.output_format) 00514 { 00515 case HAPLO_OUTPUT_TXT: 00516 fprintf(fp, "%6s\n", "Member"); 00517 break; 00518 case HAPLO_OUTPUT_CSV: 00519 fprintf(fp, "%s\n", "Member"); 00520 break; 00521 case HAPLO_OUTPUT_XML: 00522 break; 00523 } 00524 } 00525 00527 static void write_members 00528 ( 00529 const Matblock_u8* ids, 00530 const Vector_u32* labels, 00531 const Vector_i32* members 00532 ) 00533 { 00534 FILE* fp; 00535 uint32_t i; 00536 xmlDoc* xml_doc = NULL; 00537 xmlNode* xml_root = NULL; 00538 xmlNode* xml_node = NULL; 00539 char* xml_buf = NULL; 00540 uint32_t xml_len; 00541 Error* err; 00542 00543 if ((err = open_output(&fp, &xml_doc, "haplo-cluster-members-out", 00544 "haplo-cluster-members-out.dtd", members_out_fname))) 00545 { 00546 print_error_msg_exit("haplo-cluster", err->msg); 00547 } 00548 00549 write_members_header(ids, labels, fp); 00550 00551 if (opts.output_format == HAPLO_OUTPUT_XML) 00552 { 00553 xml_root = xmlDocGetRootElement(xml_doc); 00554 } 00555 00556 for (i = 0; i < members->num_elts; i++) 00557 { 00558 if (opts.output_format == HAPLO_OUTPUT_XML) 00559 { 00560 xml_node = XMLNewChild(xml_root, "sample", NULL); 00561 xml_len = 256; 00562 xml_buf = malloc(xml_len*sizeof(char)); 00563 snprintf(xml_buf, xml_len, "%d", i+1); 00564 XMLNewProp(xml_node, "number", xml_buf); 00565 free(xml_buf); 00566 } 00567 00568 write_ids(ids, i, HAPLO_SEP_SUFFIX, fp, xml_node); 00569 write_label(labels, i, HAPLO_SEP_SUFFIX, fp, xml_node); 00570 00571 switch (opts.output_format) 00572 { 00573 case HAPLO_OUTPUT_TXT: 00574 fprintf(fp, "%-6d\n", members->elts[ i ]+1); 00575 break; 00576 case HAPLO_OUTPUT_CSV: 00577 fprintf(fp, "%d\n", members->elts[ i ]+1); 00578 break; 00579 case HAPLO_OUTPUT_XML: 00580 xml_len = 256; 00581 xml_buf = malloc(xml_len*sizeof(char)); 00582 snprintf(xml_buf, xml_len, "%d", members->elts[ i ]+1); 00583 XMLNewChild(xml_node, "member", xml_buf); 00584 free(xml_buf); 00585 break; 00586 } 00587 } 00588 00589 if ((err = close_output(fp, xml_doc, members_out_fname))) 00590 { 00591 print_error_msg_exit("haplo-cluster", err->msg); 00592 } 00593 } 00594 00596 static void write_weights(const Vector_d* weights) 00597 { 00598 FILE* fp; 00599 uint32_t i; 00600 xmlDoc* xml_doc = NULL; 00601 xmlNode* xml_root = NULL; 00602 xmlNode* xml_node = NULL; 00603 char xml_buf[256] = {0}; 00604 Error* err; 00605 00606 if ((err = open_output(&fp, &xml_doc, "haplo-cluster-weights-out", 00607 "haplo-cluster-weights-out.dtd", weights_out_fname))) 00608 { 00609 print_error_msg_exit("haplo-cluster", err->msg); 00610 } 00611 00612 if (opts.output_format == HAPLO_OUTPUT_XML) 00613 { 00614 xml_root = xmlDocGetRootElement(xml_doc); 00615 } 00616 00617 for (i = 0; i < weights->num_elts; i++) 00618 { 00619 switch (opts.output_format) 00620 { 00621 case HAPLO_OUTPUT_TXT: 00622 fprintf(fp, "%-4d %5.3f\n", i+1, weights->elts[ i ]); 00623 break; 00624 case HAPLO_OUTPUT_CSV: 00625 fprintf(fp, "%d,%.3f\n", i+1, weights->elts[ i ]); 00626 break; 00627 case HAPLO_OUTPUT_XML: 00628 snprintf(xml_buf, 256, "%.3f", weights->elts[ i ]); 00629 xml_node = XMLNewChild(xml_root, "weight", xml_buf); 00630 snprintf(xml_buf, 256, "%d", i+1); 00631 XMLNewProp(xml_node, "number", xml_buf); 00632 break; 00633 } 00634 } 00635 00636 if ((err = close_output(fp, xml_doc, weights_out_fname))) 00637 { 00638 print_error_msg_exit("haplo-cluster", err->msg); 00639 } 00640 } 00641 00643 static void write_responses_header 00644 ( 00645 const Matblock_u8* ids, 00646 const Vector_u32* labels, 00647 const Matrix_d* responses, 00648 FILE* fp 00649 ) 00650 { 00651 uint32_t i, j; 00652 00653 if (!opts.header_out || opts.output_format == HAPLO_OUTPUT_XML) 00654 return; 00655 00656 if (ids) 00657 { 00658 for (i = 0; i < ids->num_rows; i++) 00659 { 00660 switch (opts.output_format) 00661 { 00662 case HAPLO_OUTPUT_TXT: 00663 fprintf(fp, "ID %-7d ", i+1); 00664 break; 00665 case HAPLO_OUTPUT_CSV: 00666 fprintf(fp, "ID %d,", i+1); 00667 break; 00668 case HAPLO_OUTPUT_XML: 00669 break; 00670 } 00671 } 00672 } 00673 00674 if (labels) 00675 { 00676 switch (opts.output_format) 00677 { 00678 case HAPLO_OUTPUT_TXT: 00679 fprintf(fp, "%-10s ", "Label"); 00680 break; 00681 case HAPLO_OUTPUT_CSV: 00682 fprintf(fp, "%s,", "Label"); 00683 break; 00684 case HAPLO_OUTPUT_XML: 00685 break; 00686 } 00687 } 00688 00689 switch (opts.output_format) 00690 { 00691 case HAPLO_OUTPUT_TXT: 00692 fprintf(fp, "%8s %2d", "Response", 1); 00693 for (j = 1; j < responses->num_cols; j++) 00694 { 00695 fprintf(fp, " %8s %2d", "Response", j+1); 00696 } 00697 break; 00698 case HAPLO_OUTPUT_CSV: 00699 fprintf(fp, "%s %d", "Response", 1); 00700 for (j = 1; j < responses->num_cols; j++) 00701 { 00702 fprintf(fp, ",%s %d", "Response", j+1); 00703 } 00704 break; 00705 case HAPLO_OUTPUT_XML: 00706 break; 00707 } 00708 fprintf(fp, "\n"); 00709 } 00710 00712 static void write_responses 00713 ( 00714 const Matblock_u8* ids, 00715 const Vector_u32* labels, 00716 const Matrix_d* responses 00717 ) 00718 { 00719 FILE* fp; 00720 uint32_t i, j; 00721 xmlDoc* xml_doc = NULL; 00722 xmlNode* xml_root = NULL; 00723 xmlNode* xml_node = NULL; 00724 xmlNode* xml_child = NULL; 00725 char* xml_buf = NULL; 00726 uint32_t xml_len; 00727 Error* err; 00728 00729 if ((err = open_output(&fp, &xml_doc, "haplo-cluster-responses-out", 00730 "haplo-cluster-responses-out.dtd", responses_out_fname))) 00731 { 00732 print_error_msg_exit("haplo-cluster", err->msg); 00733 } 00734 00735 write_responses_header(ids, labels, responses, fp); 00736 00737 if (opts.output_format == HAPLO_OUTPUT_XML) 00738 { 00739 xml_root = xmlDocGetRootElement(xml_doc); 00740 } 00741 00742 for (i = 0; i < responses->num_rows; i++) 00743 { 00744 if (opts.output_format == HAPLO_OUTPUT_XML) 00745 { 00746 xml_node = XMLNewChild(xml_root, "sample", NULL); 00747 xml_len = 256; 00748 xml_buf = malloc(xml_len*sizeof(char)); 00749 snprintf(xml_buf, xml_len, "%d", i+1); 00750 XMLNewProp(xml_node, "number", xml_buf); 00751 free(xml_buf); 00752 } 00753 00754 write_ids(ids, i, HAPLO_SEP_SUFFIX, fp, xml_node); 00755 write_label(labels, i, HAPLO_SEP_SUFFIX, fp, xml_node); 00756 00757 switch (opts.output_format) 00758 { 00759 case HAPLO_OUTPUT_TXT: 00760 fprintf(fp, "%-11.4f", responses->elts[ i ][ 0 ]); 00761 for (j = 1; j < responses->num_cols; j++) 00762 { 00763 fprintf(fp, " %-11.4f", responses->elts[ i ][ j ]); 00764 } 00765 fprintf(fp, "\n"); 00766 break; 00767 case HAPLO_OUTPUT_CSV: 00768 fprintf(fp, "%.3f", responses->elts[ i ][ 0 ]); 00769 for (j = 1; j < responses->num_cols; j++) 00770 { 00771 fprintf(fp, ",%.4f", responses->elts[ i ][ j ]); 00772 } 00773 fprintf(fp, "\n"); 00774 break; 00775 case HAPLO_OUTPUT_XML: 00776 xml_len = 256; 00777 xml_buf = malloc(xml_len*sizeof(char)); 00778 for (j = 0; j < responses->num_cols; j++) 00779 { 00780 snprintf(xml_buf, xml_len, "%.4f", responses->elts[i][j]); 00781 xml_child = XMLNewChild(xml_node, "response", xml_buf); 00782 snprintf(xml_buf, xml_len, "%d", j+1); 00783 XMLNewProp(xml_child, "number", xml_buf); 00784 } 00785 free(xml_buf); 00786 break; 00787 } 00788 } 00789 00790 if ((err = close_output(fp, xml_doc, responses_out_fname))) 00791 { 00792 print_error_msg_exit("haplo-cluster", err->msg); 00793 } 00794 } 00795 00797 static void write_kmeans_results 00798 ( 00799 const Matblock_u8* ids, 00800 const Vector_u32* labels, 00801 const Matrix_d* means, 00802 const Vector_i32* members 00803 ) 00804 { 00805 write_means(means); 00806 write_members(ids, labels, members); 00807 } 00808 00810 static void assign_gmm_members 00811 ( 00812 Vector_i32** members_out, 00813 const Matrix_d* responses 00814 ) 00815 { 00816 uint32_t i, j, k; 00817 00818 create_vector_i32(members_out, responses->num_rows); 00819 00820 for (i = 0; i < responses->num_rows; i++) 00821 { 00822 k = 0; 00823 for (j = 1; j < responses->num_cols; j++) 00824 { 00825 if (responses->elts[ i ][ j ] > responses->elts[ i ][ k ]) 00826 { 00827 k = j; 00828 } 00829 } 00830 00831 (*members_out)->elts[ i ] = k; 00832 } 00833 } 00834 00836 static void assign_mmm_members 00837 ( 00838 Vector_i32** members_out, 00839 const Matrix_d* responses 00840 ) 00841 { 00842 uint32_t i, j, k; 00843 00844 create_vector_i32(members_out, responses->num_rows); 00845 00846 for (i = 0; i < responses->num_rows; i++) 00847 { 00848 k = 0; 00849 for (j = 1; j < responses->num_cols; j++) 00850 { 00851 if (responses->elts[ i ][ j ] > responses->elts[ i ][ k ]) 00852 { 00853 k = j; 00854 } 00855 } 00856 00857 (*members_out)->elts[ i ] = k; 00858 } 00859 } 00860 00862 static void assign_mmm_aux_members 00863 ( 00864 Vector_i32** members_out, 00865 const Matblock_d* means, 00866 const Vector_d* weights, 00867 const Matblock_u32* markers 00868 ) 00869 { 00870 uint32_t k, K; 00871 uint32_t n, N; 00872 uint32_t d, D; 00873 uint32_t m, M; 00874 double p_min; 00875 double p, q; 00876 00877 Matrix_u32* x_n = NULL; 00878 Matrix_d* mu_k = NULL; 00879 00880 K = weights->num_elts; 00881 N = markers->num_mats; 00882 D = markers->num_rows; 00883 M = markers->num_cols; 00884 00885 create_vector_i32(members_out, N); 00886 00887 for (n = 0; n < N; n++) 00888 { 00889 p_min = 0; 00890 copy_matrix_from_matblock_u32(&x_n, markers, MATBLOCK_MAT_MATRIX, n); 00891 00892 for (k = 0; k < K; k++) 00893 { 00894 copy_matrix_from_matblock_d(&mu_k, means, MATBLOCK_MAT_MATRIX, k); 00895 00896 q = 0.0; 00897 for (d = 0; d < D; d++) 00898 { 00899 for (m = 0; m < M; m++) 00900 { 00901 if (x_n->elts[ d ][ m ]) 00902 { 00903 q += log((mu_k->elts[ d ][ m ] < JWSC_MIN_LOG_ARG) ? JWSC_MIN_LOG_ARG : mu_k->elts[ d ][ m ]); 00904 } 00905 } 00906 } 00907 00908 p = log((weights->elts[ k ] < JWSC_MIN_LOG_ARG) ? JWSC_MIN_LOG_ARG : weights->elts[ k ]) + q; 00909 #if defined(JWSC_HAVE_ISNAN) && defined(JWSC_HAVE_ISINF) 00910 assert(!isnan(p) && !isinf(p)); 00911 #endif 00912 00913 if (p < p_min) 00914 { 00915 p_min = p; 00916 (*members_out)->elts[ n ] = k; 00917 } 00918 } 00919 } 00920 00921 free_matrix_u32(x_n); 00922 free_matrix_d(mu_k); 00923 } 00924 00926 static void write_gmm_results 00927 ( 00928 const Matblock_u8* ids, 00929 const Vector_u32* labels, 00930 const Matrix_d* means, 00931 const Matblock_d* covars, 00932 const Vector_d* weights, 00933 const Matrix_d* responses, 00934 const Vector_i32* members 00935 ) 00936 { 00937 write_means(means); 00938 write_weights(weights); 00939 write_responses(ids, labels, responses); 00940 write_members(ids, labels, members); 00941 } 00942 00944 static void write_mmm_results 00945 ( 00946 const Matblock_u8* ids, 00947 const Vector_u32* labels, 00948 const Matblock_d* means, 00949 const Vector_d* weights, 00950 const Matrix_d* responses, 00951 const Vector_i32* members 00952 ) 00953 { 00954 write_mmm_means(means); 00955 write_weights(weights); 00956 write_responses(ids, labels, responses); 00957 write_members(ids, labels, members); 00958 } 00959 00961 static void write_mmm_aux_results 00962 ( 00963 const Matblock_u8* ids, 00964 const Vector_u32* labels, 00965 const Vector_i32* members 00966 ) 00967 { 00968 write_members(ids, labels, members); 00969 } 00970 00971 static void cluster_kmeans 00972 ( 00973 const Matblock_u8* ids, 00974 const Vector_u32* labels, 00975 const Matrix_i32* markers 00976 ) 00977 { 00978 uint32_t num_rows, num_cols; 00979 uint32_t row, col; 00980 00981 Matrix_d* markers_d = NULL; 00982 Matrix_d* means = NULL; 00983 Vector_i32* members = NULL; 00984 00985 num_rows = markers->num_rows; 00986 num_cols = markers->num_cols; 00987 create_matrix_d(&markers_d, num_rows, num_cols); 00988 00989 for (row = 0; row < num_rows; row++) 00990 { 00991 for (col = 0; col < num_cols; col++) 00992 { 00993 markers_d->elts[ row ][ col ] = (double)markers->elts[ row ][ col ]; 00994 } 00995 } 00996 00997 kmeans_d(&means, &members, markers_d, num_clusters, 1); 00998 write_kmeans_results(ids, labels, means, members); 00999 01000 free_matrix_d(markers_d); 01001 free_matrix_d(means); 01002 free_vector_i32(members); 01003 } 01004 01005 static void cluster_gmm 01006 ( 01007 const Matblock_u8* ids, 01008 const Vector_u32* labels, 01009 const Matrix_i32* markers 01010 ) 01011 { 01012 uint32_t num_rows, num_cols; 01013 uint32_t row, col; 01014 01015 Matrix_d* markers_d = NULL; 01016 Matrix_d* means = NULL; 01017 Matblock_d* covars = NULL; 01018 Vector_d* weights = NULL; 01019 Matrix_d* responses = NULL; 01020 Vector_i32* members = NULL; 01021 01022 num_rows = markers->num_rows; 01023 num_cols = markers->num_cols; 01024 create_matrix_d(&markers_d, num_rows, num_cols); 01025 01026 for (row = 0; row < num_rows; row++) 01027 { 01028 for (col = 0; col < num_cols; col++) 01029 { 01030 markers_d->elts[ row ][ col ] = (double)markers->elts[ row ][ col ]; 01031 } 01032 } 01033 01034 train_gmm_d(&means, &covars, &weights, &responses, markers_d, num_clusters); 01035 assign_gmm_members(&members, responses); 01036 write_gmm_results(ids, labels, means, covars, weights, responses, members); 01037 01038 free_matrix_d(markers_d); 01039 free_matrix_d(means); 01040 free_matblock_d(covars); 01041 free_vector_d(weights); 01042 free_matrix_d(responses); 01043 free_vector_i32(members); 01044 } 01045 01046 static void cluster_mmm 01047 ( 01048 const Matblock_u8* ids, 01049 const Vector_u32* labels, 01050 const Matrix_i32* markers, 01051 const Matblock_u8* aux_ids, 01052 const Vector_u32* aux_labels, 01053 const Matrix_i32* aux_markers 01054 ) 01055 { 01056 uint32_t s, num_samples; 01057 uint32_t m, num_markers; 01058 uint32_t num_marker_vals; 01059 01060 Matblock_u32* markers_d = NULL; 01061 Matblock_d* means = NULL; 01062 Vector_d* weights = NULL; 01063 Matrix_d* responses = NULL; 01064 Vector_i32* members = NULL; 01065 01066 num_marker_vals = 0; 01067 num_samples = markers->num_rows; 01068 num_markers = markers->num_cols; 01069 01070 for (s = 0; s < num_samples; s++) 01071 { 01072 for (m = 0; m < num_markers; m++) 01073 { 01074 if (markers->elts[ s ][ m ] >= num_marker_vals) 01075 { 01076 num_marker_vals = markers->elts[ s ][ m ]+1; 01077 } 01078 } 01079 } 01080 assert(num_marker_vals); 01081 01082 create_zero_matblock_u32(&markers_d, num_samples, num_markers, 01083 num_marker_vals); 01084 for (s = 0; s < num_samples; s++) 01085 { 01086 for (m = 0; m < num_markers; m++) 01087 { 01088 markers_d->elts[ s ][ m ][ markers->elts[s][m] ] = 1; 01089 } 01090 } 01091 01092 // DEBUG 01093 for (s = 0; s < num_samples; s++) 01094 { 01095 for (m = 0; m < num_markers; m++) 01096 { 01097 uint32_t v, sum = 0; 01098 for (v = 0; v < num_marker_vals; v++) 01099 { 01100 if (markers_d->elts[ s ][ m ][ v ]) 01101 { 01102 sum++; 01103 } 01104 } 01105 assert(sum == 1); 01106 } 01107 } 01108 01109 train_mmm_d(&means, &weights, &responses, markers_d, num_clusters); 01110 assign_mmm_members(&members, responses); 01111 write_mmm_results(ids, labels, means, weights, responses, members); 01112 01113 if (aux_markers) 01114 { 01115 num_samples = aux_markers->num_rows; 01116 assert(num_markers == aux_markers->num_cols); 01117 01118 create_zero_matblock_u32(&markers_d, num_samples, num_markers, 01119 num_marker_vals); 01120 for (s = 0; s < num_samples; s++) 01121 { 01122 for (m = 0; m < num_markers; m++) 01123 { 01124 markers_d->elts[ s ][ m ][ aux_markers->elts[s][m] ] = 1; 01125 } 01126 } 01127 01128 assign_mmm_aux_members(&members, means, weights, markers_d); 01129 write_mmm_aux_results(aux_ids, aux_labels, members); 01130 } 01131 01132 free_matblock_u32(markers_d); 01133 free_matblock_d(means); 01134 free_vector_d(weights); 01135 free_matrix_d(responses); 01136 free_vector_i32(members); 01137 } 01138 01140 int main(int argc, const char** argv) 01141 { 01142 int argi; 01143 const char* data_fname = "/dev/stdin"; 01144 Error* err; 01145 01146 Matblock_u8* ids = NULL; 01147 Vector_u32* labels = NULL; 01148 Matrix_i32* markers = NULL; 01149 01150 Matblock_u8* aux_ids = NULL; 01151 Vector_u32* aux_labels = NULL; 01152 Matrix_i32* aux_markers = NULL; 01153 01154 init_cluster_options(); 01155 01156 if ((err = process_options(argc, argv, &argi, NUM_OPTS_NO_ARG, opts_no_arg, 01157 NUM_OPTS_WITH_ARG, opts_with_arg)) != NULL) 01158 { 01159 print_error_msg_exit("haplo-cluster", err->msg); 01160 } 01161 01162 if ((argc - argi) == 1) 01163 { 01164 data_fname = argv[ argi ]; 01165 } 01166 01167 if ((err = read_haplo_groups(opts.labels_fname))) 01168 { 01169 print_error_msg_exit("haplo-cluster", err->msg); 01170 } 01171 01172 if ((err = read_input(&ids, &labels, &markers, data_fname))) 01173 { 01174 print_error_msg_exit("haplo-cluster", err->msg); 01175 } 01176 01177 if (opts.aux_input_fname) 01178 { 01179 if ((err = read_aux_input(&aux_ids, &aux_labels, &aux_markers, 01180 opts.aux_input_fname))) 01181 { 01182 print_error_msg_exit("haplo-impute", err->msg); 01183 } 01184 } 01185 01186 switch (cluster_type) 01187 { 01188 case HAPLO_CLUSTER_KMEANS: 01189 cluster_kmeans(ids, labels, markers); 01190 break; 01191 case HAPLO_CLUSTER_GMM: 01192 cluster_gmm(ids, labels, markers); 01193 break; 01194 case HAPLO_CLUSTER_MMM: 01195 cluster_mmm(ids, labels, markers, aux_ids, aux_labels, aux_markers); 01196 break; 01197 } 01198 01199 free_matblock_u8(ids); 01200 free_vector_u32(labels); 01201 free_matrix_i32(markers); 01202 01203 if (get_num_unhandled_errors() > 0) 01204 { 01205 print_error_msg_exit("haplo-cluster", "Unhandled errors"); 01206 } 01207 01208 return EXIT_SUCCESS; 01209 }