Haplo Prediction
predict haplogroups
haplo_cluster.c
Go to the documentation of this file.
00001 /*
00002  * This work is licensed under a Creative Commons 
00003  * Attribution-Noncommercial-Share Alike 3.0 United States License.
00004  * 
00005  *    http://creativecommons.org/licenses/by-nc-sa/3.0/us/
00006  * 
00007  * You are free:
00008  * 
00009  *    to Share - to copy, distribute, display, and perform the work
00010  *    to Remix - to make derivative works
00011  * 
00012  * Under the following conditions:
00013  * 
00014  *    Attribution. You must attribute the work in the manner specified by the
00015  *    author or licensor (but not in any way that suggests that they endorse you
00016  *    or your use of the work).
00017  * 
00018  *    Noncommercial. You may not use this work for commercial purposes.
00019  * 
00020  *    Share Alike. If you alter, transform, or build upon this work, you may
00021  *    distribute the resulting work only under the same or similar license to
00022  *    this one.
00023  * 
00024  * For any reuse or distribution, you must make clear to others the license
00025  * terms of this work. The best way to do this is by including this header.
00026  * 
00027  * Any of the above conditions can be waived if you get permission from the
00028  * copyright holder.
00029  * 
00030  * Apart from the remix rights granted under this license, nothing in this
00031  * license impairs or restricts the author's moral rights.
00032  */
00033 
00034 
00049 #include <config.h>
00050 
00051 #include <stdlib.h>
00052 #include <stdio.h>
00053 #include <assert.h>
00054 #include <inttypes.h>
00055 #include <string.h>
00056 #include <math.h>
00057 
00058 #include <libxml/tree.h>
00059 
00060 #ifdef HAPLO_HAVE_DMALLOC
00061 #include <dmalloc.h>
00062 #endif
00063 
00064 #include <jwsc/base/error.h>
00065 #include <jwsc/base/option.h>
00066 #include <jwsc/base/limits.h>
00067 #include <jwsc/vector/vector.h>
00068 #include <jwsc/matrix/matrix.h>
00069 #include <jwsc/matrix/matrix_io.h>
00070 #include <jwsc/matblock/matblock.h>
00071 #include <jwsc/stat/kmeans.h>
00072 #include <jwsc/stat/gmm.h>
00073 #include <jwsc/stat/mmm.h>
00074 
00075 #include "haplo_groups.h"
00076 #include "options.h"
00077 #include "output.h"
00078 #include "input.h"
00079 #include "xml.h"
00080 
00081 
00082 #define  NUM_OPTS_NO_ARG    0  + NUM_SHARED_OPTS_NO_ARG
00083 #define  NUM_OPTS_WITH_ARG  12 + NUM_SHARED_OPTS_WITH_ARG
00084 
00085 
00086 #define  LABEL_COL            0
00087 #define  NUM_CLUSTERS         4
00088 #define  CLUSTER_TYPE         HAPLO_CLUSTER_KMEANS
00089 #define  MEANS_OUT_FNAME      "/dev/null"
00090 #define  WEIGHTS_OUT_FNAME    "/dev/null"
00091 #define  RESPONSES_OUT_FNAME  "/dev/null"
00092 #define  MEMBERS_OUT_FNAME    "/dev/stdout"
00093 
00094 
00096 typedef enum
00097 {
00098     HAPLO_CLUSTER_KMEANS,
00099     HAPLO_CLUSTER_GMM,
00100     HAPLO_CLUSTER_MMM
00101 }
00102 Haplo_cluster_type;
00103 
00104 
00106 Option_no_arg opts_no_arg[NUM_OPTS_NO_ARG];
00107 
00109 Option_with_arg opts_with_arg[NUM_OPTS_WITH_ARG];
00110 
00112 static uint32_t num_clusters = NUM_CLUSTERS;
00113 
00115 static Haplo_cluster_type cluster_type = CLUSTER_TYPE;
00116 
00118 static const char* means_out_fname = MEANS_OUT_FNAME;
00119 
00121 static const char* weights_out_fname = WEIGHTS_OUT_FNAME;
00122 
00124 static const char* responses_out_fname = RESPONSES_OUT_FNAME;
00125 
00127 static const char* members_out_fname = MEMBERS_OUT_FNAME;
00128 
00129 
00131 uint32_t get_num_opts_no_arg()
00132 {
00133     return NUM_OPTS_NO_ARG;
00134 }
00135 
00137 uint32_t get_num_opts_with_arg()
00138 {
00139     return NUM_OPTS_WITH_ARG;
00140 }
00141 
00143 void print_usage()
00144 {
00145     fprintf(stderr, "usage: haplo-cluster OPTIONS [data-fname | <stdin>]\n");
00146     print_options(stderr, 27, NUM_OPTS_NO_ARG, opts_no_arg, NUM_OPTS_WITH_ARG,
00147             opts_with_arg);
00148 }
00149 
00151 static Error* process_num_clusters_opt(Option_arg arg)
00152 {
00153     if (arg == NULL)
00154     {
00155         return JWSC_EARG("Option 'num-clusters' requires an argument");
00156     }
00157     if (sscanf(arg, "%u", &num_clusters) != 1 || num_clusters < 1)
00158     {
00159         return JWSC_EARG("Option 'num-clusters' must be > 0");
00160     }
00161     return NULL;
00162 }
00163 
00165 static Error* process_cluster_type_opt(Option_arg arg)
00166 {
00167     if (arg == NULL)
00168     {
00169         return JWSC_EARG("Option 'cluster-type' requires an argument");
00170     }
00171     if (strncmp(arg, "kmeans", 6) == 0)
00172     {
00173         cluster_type = HAPLO_CLUSTER_KMEANS;
00174     }
00175     else if (strncmp(arg, "gmm", 3) == 0)
00176     {
00177         cluster_type = HAPLO_CLUSTER_GMM;
00178     }
00179     else if (strncmp(arg, "mmm", 3) == 0)
00180     {
00181         cluster_type = HAPLO_CLUSTER_MMM;
00182     }
00183     else
00184     {
00185         return JWSC_EARG("Option 'cluster-type' must be one of {kmeans,gmm,mmm}");
00186     }
00187     return NULL;
00188 }
00189 
00191 Error* process_means_out_opt(Option_arg arg)
00192 {
00193     if (arg == NULL)
00194     {
00195         return JWSC_EARG("Option 'means-out' requires an argument");
00196     }
00197     means_out_fname = arg;
00198     return NULL;
00199 }
00200 
00202 Error* process_weights_out_opt(Option_arg arg)
00203 {
00204     if (arg == NULL)
00205     {
00206         return JWSC_EARG("Option 'weights-out' requires an argument");
00207     }
00208     weights_out_fname = arg;
00209     return NULL;
00210 }
00211 
00213 Error* process_responses_out_opt(Option_arg arg)
00214 {
00215     if (arg == NULL)
00216     {
00217         return JWSC_EARG("Option 'responses-out' requires an argument");
00218     }
00219     responses_out_fname = arg;
00220     return NULL;
00221 }
00222 
00224 Error* process_members_out_opt(Option_arg arg)
00225 {
00226     if (arg == NULL)
00227     {
00228         return JWSC_EARG("Option 'members-out' requires an argument");
00229     }
00230     members_out_fname = arg;
00231     return NULL;
00232 }
00233 
00235 static void init_cluster_options()
00236 {
00237     uint32_t i;
00238 
00239     char s_name;
00240     const char* l_name;
00241     const char* desc;
00242 
00243     Error* (*farg)(const char*);
00244 
00245     init_options(opts_no_arg, opts_with_arg);
00246 
00247     opts.label_col = LABEL_COL;
00248 
00249     i = NUM_SHARED_OPTS_WITH_ARG;
00250     l_name = "aux-input";
00251     s_name = 0;
00252     desc   = "Auxiliary input file. Provides marker information used for imputing missing values.";
00253     farg   = process_aux_input_opt;
00254     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00255 
00256     l_name = "aux-id-cols";
00257     s_name = 0;
00258     desc   = "Auxiliary input id-cols. Comma separated ordered list of columns to use for sample identification. Prefixes the output of each sample. Count begins with 1 at the first column of the file. Set to zero to ignore the id column";
00259     farg   = process_aux_id_cols_opt;
00260     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00261 
00262     l_name = "aux-label-col";
00263     s_name = 0;
00264     desc   = "Auxiliary input label-col. Column containing the haplo group labels. Count begins with 1 at the first column of the file. Set to zero to ignore the label column.";
00265     farg   = process_aux_label_col_opt;
00266     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00267 
00268     l_name = "aux-1st-marker-col";
00269     s_name = 0;
00270     desc   = "Auxiliary input 1st-marker-col. Column containing the first marker. Use in conjunction with num-markers to specify the markers for reading. All other markers are assumed to follow this one. Count begins with 1.";
00271     farg   = process_aux_first_marker_col_opt;
00272     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00273 
00274     l_name = "aux-num-markers";
00275     s_name = 0;
00276     desc   = "Auxiliary input num-markers. Number of markers to read. Use in conjunction with 1st-marker-col to specify the markers for reading.";
00277     farg   = process_aux_num_markers_opt;
00278     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00279 
00280     l_name = "aux-marker-cols";
00281     s_name = 0;
00282     desc   = "Auxiliary input marker-cols. Comma separated ordered list of markers to use for training. Use instead of 1st-marker-col and num-markers. Count begins with 1 at the first column of the CSV file.";
00283     farg   = process_aux_marker_cols_opt;
00284     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00285 
00286     l_name = "num-clusters";
00287     s_name = 'k';
00288     desc   = "Number of clusters to use in the k-means algorithm.";
00289     farg   = process_num_clusters_opt;
00290     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00291 
00292     l_name = "cluster-type";
00293     s_name = 0;
00294     desc   = "Type of clustering to use. Must be one of {kmeans,gmm,mmm}.";
00295     farg   = process_cluster_type_opt;
00296     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00297 
00298     l_name = "means-out";
00299     s_name = 0;
00300     desc   = "Cluster means output file name. The default is no output.";
00301     farg   = process_means_out_opt;
00302     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00303 
00304     l_name = "weights-out";
00305     s_name = 0;
00306     desc   = "Cluster weights output file name. The default is no output.";
00307     farg   = process_weights_out_opt;
00308     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00309 
00310     l_name = "responses-out";
00311     s_name = 0;
00312     desc   = "Cluster responses output file name. The default is no output.";
00313     farg   = process_responses_out_opt;
00314     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00315 
00316     l_name = "members-out";
00317     s_name = 0;
00318     desc   = "Cluster members output file name. The default is stdout.";
00319     farg   = process_members_out_opt;
00320     init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg);
00321     assert(i == NUM_OPTS_WITH_ARG);
00322 }
00323 
00325 static void write_means(const Matrix_d* means)
00326 {
00327     FILE*       fp;
00328     uint32_t    i, j;
00329     xmlDoc*     xml_doc   = NULL;
00330     xmlNode*    xml_root  = NULL;
00331     xmlNode*    xml_node  = NULL;
00332     xmlNode*    xml_child = NULL;
00333     char        xml_buf[256] = {0};
00334     Error*      err;
00335 
00336     if ((err = open_output(&fp, &xml_doc, "haplo-cluster-means-out",
00337                     "haplo-cluster-means-out.dtd", means_out_fname)))
00338     {
00339         print_error_msg_exit("haplo-cluster", err->msg);
00340     }
00341 
00342     if (opts.output_format == HAPLO_OUTPUT_XML)
00343     {
00344         xml_root = xmlDocGetRootElement(xml_doc);
00345     }
00346 
00347     for (i = 0; i < means->num_rows; i++)
00348     {
00349         switch (opts.output_format)
00350         {
00351             case HAPLO_OUTPUT_TXT:
00352                 fprintf(fp, "%-4d", i+1);
00353                 for (j = 0; j < means->num_cols; j++)
00354                 {
00355                     fprintf(fp, "  %6.3f", means->elts[ i ][ j ]);
00356                 }
00357                 fprintf(fp, "\n");
00358                 break;
00359             case HAPLO_OUTPUT_CSV:
00360                 fprintf(fp, "%d", i+1);
00361                 for (j = 0; j < means->num_cols; j++)
00362                 {
00363                     fprintf(fp, ",%.3f", means->elts[ i ][ j ]);
00364                 }
00365                 fprintf(fp, "\n");
00366                 break;
00367             case HAPLO_OUTPUT_XML:
00368                 xml_node = XMLNewChild(xml_root, "mean", NULL);
00369                 snprintf(xml_buf, 256, "%d", i+1);
00370                 XMLNewProp(xml_node, "number", xml_buf);
00371                 for (j = 0; j < means->num_cols; j++)
00372                 {
00373                     snprintf(xml_buf, 256, "%.3f", means->elts[ i ][ j ]);
00374                     xml_child = XMLNewChild(xml_node, "marker", xml_buf);
00375                     snprintf(xml_buf, 256, "%d", j+1);
00376                     XMLNewProp(xml_child, "number", xml_buf);
00377                 }
00378                 break;
00379         }
00380     }
00381 
00382     if ((err = close_output(fp, xml_doc, means_out_fname)))
00383     {
00384         print_error_msg_exit("haplo-cluster", err->msg);
00385     }
00386 }
00387 
00389 static void write_mmm_means(const Matblock_d* means)
00390 {
00391     FILE*       fp;
00392     uint32_t    i, j, k;
00393     uint32_t    n;
00394     xmlDoc*     xml_doc   = NULL;
00395     xmlNode*    xml_root  = NULL;
00396     xmlNode*    xml_node  = NULL;
00397     xmlNode*    xml_child = NULL;
00398     char        xml_buf[256] = {0};
00399     Error*      err;
00400 
00401     if ((err = open_output(&fp, &xml_doc, "haplo-cluster-means-out",
00402                     "haplo-cluster-means-out.dtd", means_out_fname)))
00403     {
00404         print_error_msg_exit("haplo-cluster", err->msg);
00405     }
00406 
00407     if (opts.output_format == HAPLO_OUTPUT_XML)
00408     {
00409         xml_root = xmlDocGetRootElement(xml_doc);
00410     }
00411 
00412     for (i = 0; i < means->num_mats; i++)
00413     {
00414         switch (opts.output_format)
00415         {
00416             case HAPLO_OUTPUT_TXT:
00417                 fprintf(fp, "%-4d\n", i+1);
00418                 for (j = 0; j < means->num_rows; j++)
00419                 {
00420                     fprintf(fp, "%-4d", j+1);
00421                     for (k = 0; k < means->num_cols; k++)
00422                     {
00423                         fprintf(fp, "  %6.3f", means->elts[ i ][ j ][ k ]);
00424                     }
00425                     fprintf(fp, "\n");
00426                 }
00427                 break;
00428             case HAPLO_OUTPUT_CSV:
00429                 fprintf(fp, "%d\n", i+1);
00430                 for (j = 0; j < means->num_rows; j++)
00431                 {
00432                     fprintf(fp, "%d", j+1);
00433                     for (k = 0; k < means->num_cols; k++)
00434                     {
00435                         fprintf(fp, ",%.3f", means->elts[ i ][ j ][ k ]);
00436                     }
00437                     fprintf(fp, "\n");
00438                 }
00439                 break;
00440             case HAPLO_OUTPUT_XML:
00441                 xml_node = XMLNewChild(xml_root, "mean", NULL);
00442                 snprintf(xml_buf, 256, "%d", i+1);
00443                 XMLNewProp(xml_node, "number", xml_buf);
00444                 for (j = 0; j < means->num_rows; j++)
00445                 {
00446                     n = 0;
00447                     for (k = 0; k < means->num_cols; k++)
00448                     {
00449                         n += snprintf(xml_buf+n, 256, "%.3f ", 
00450                                 means->elts[ i ][ j ][ k ]);
00451                         assert(n < 256);
00452                     }
00453                     xml_child = XMLNewChild(xml_node, "marker", xml_buf);
00454                     snprintf(xml_buf, 256, "%d", j+1);
00455                     XMLNewProp(xml_child, "number", xml_buf);
00456                 }
00457                 break;
00458         }
00459     }
00460 
00461     if ((err = close_output(fp, xml_doc, means_out_fname)))
00462     {
00463         print_error_msg_exit("haplo-cluster", err->msg);
00464     }
00465 }
00466 
00468 static void write_members_header
00469 (
00470     const Matblock_u8* ids,
00471     const Vector_u32*  labels,
00472     FILE*              fp
00473 )
00474 {
00475     uint32_t id;
00476 
00477     if (!opts.header_out || opts.output_format == HAPLO_OUTPUT_XML)
00478         return;
00479 
00480     if (ids)
00481     {
00482         for (id = 0; id < ids->num_rows; id++)
00483         {
00484             switch (opts.output_format)
00485             {
00486                 case HAPLO_OUTPUT_TXT:
00487                     fprintf(fp, "ID %-7d  ", id+1);
00488                     break;
00489                 case HAPLO_OUTPUT_CSV:
00490                     fprintf(fp, "ID %d,", id+1);
00491                     break;
00492                 case HAPLO_OUTPUT_XML:
00493                     break;
00494             }
00495         }
00496     }
00497 
00498     if (labels)
00499     {
00500         switch (opts.output_format)
00501         {
00502             case HAPLO_OUTPUT_TXT:
00503                 fprintf(fp, "%-10s  ", "Label");
00504                 break;
00505             case HAPLO_OUTPUT_CSV:
00506                 fprintf(fp, "%s,", "Label");
00507                 break;
00508             case HAPLO_OUTPUT_XML:
00509                 break;
00510         }
00511     }
00512 
00513     switch (opts.output_format)
00514     {
00515         case HAPLO_OUTPUT_TXT:
00516             fprintf(fp, "%6s\n", "Member");
00517             break;
00518         case HAPLO_OUTPUT_CSV:
00519             fprintf(fp, "%s\n", "Member");
00520             break;
00521         case HAPLO_OUTPUT_XML:
00522             break;
00523     }
00524 }
00525 
00527 static void write_members
00528 (
00529     const Matblock_u8* ids,
00530     const Vector_u32*  labels,
00531     const Vector_i32*  members
00532 )
00533 {
00534     FILE*       fp;
00535     uint32_t    i;
00536     xmlDoc*     xml_doc   = NULL;
00537     xmlNode*    xml_root  = NULL;
00538     xmlNode*    xml_node  = NULL;
00539     char*       xml_buf   = NULL;
00540     uint32_t    xml_len;
00541     Error*      err;
00542 
00543     if ((err = open_output(&fp, &xml_doc, "haplo-cluster-members-out",
00544                     "haplo-cluster-members-out.dtd", members_out_fname)))
00545     {
00546         print_error_msg_exit("haplo-cluster", err->msg);
00547     }
00548 
00549     write_members_header(ids, labels, fp);
00550 
00551     if (opts.output_format == HAPLO_OUTPUT_XML)
00552     {
00553         xml_root = xmlDocGetRootElement(xml_doc);
00554     }
00555 
00556     for (i = 0; i < members->num_elts; i++)
00557     {
00558         if (opts.output_format == HAPLO_OUTPUT_XML)
00559         {
00560             xml_node = XMLNewChild(xml_root, "sample", NULL);
00561             xml_len = 256;
00562             xml_buf = malloc(xml_len*sizeof(char));
00563             snprintf(xml_buf, xml_len, "%d", i+1);
00564             XMLNewProp(xml_node, "number", xml_buf);
00565             free(xml_buf);
00566         }
00567 
00568         write_ids(ids, i, HAPLO_SEP_SUFFIX, fp, xml_node);
00569         write_label(labels, i, HAPLO_SEP_SUFFIX, fp, xml_node);
00570 
00571         switch (opts.output_format)
00572         {
00573             case HAPLO_OUTPUT_TXT:
00574                 fprintf(fp, "%-6d\n", members->elts[ i ]+1);
00575                 break;
00576             case HAPLO_OUTPUT_CSV:
00577                 fprintf(fp, "%d\n", members->elts[ i ]+1);
00578                 break;
00579             case HAPLO_OUTPUT_XML:
00580                 xml_len = 256;
00581                 xml_buf = malloc(xml_len*sizeof(char));
00582                 snprintf(xml_buf, xml_len, "%d", members->elts[ i ]+1);
00583                 XMLNewChild(xml_node, "member", xml_buf);
00584                 free(xml_buf);
00585                 break;
00586         }
00587     }
00588 
00589     if ((err = close_output(fp, xml_doc, members_out_fname)))
00590     {
00591         print_error_msg_exit("haplo-cluster", err->msg);
00592     }
00593 }
00594 
00596 static void write_weights(const Vector_d* weights)
00597 {
00598     FILE*       fp;
00599     uint32_t    i;
00600     xmlDoc*     xml_doc   = NULL;
00601     xmlNode*    xml_root  = NULL;
00602     xmlNode*    xml_node  = NULL;
00603     char        xml_buf[256] = {0};
00604     Error*      err;
00605 
00606     if ((err = open_output(&fp, &xml_doc, "haplo-cluster-weights-out",
00607                     "haplo-cluster-weights-out.dtd", weights_out_fname)))
00608     {
00609         print_error_msg_exit("haplo-cluster", err->msg);
00610     }
00611 
00612     if (opts.output_format == HAPLO_OUTPUT_XML)
00613     {
00614         xml_root = xmlDocGetRootElement(xml_doc);
00615     }
00616 
00617     for (i = 0; i < weights->num_elts; i++)
00618     {
00619         switch (opts.output_format)
00620         {
00621             case HAPLO_OUTPUT_TXT:
00622                 fprintf(fp, "%-4d  %5.3f\n", i+1, weights->elts[ i ]);
00623                 break;
00624             case HAPLO_OUTPUT_CSV:
00625                 fprintf(fp, "%d,%.3f\n", i+1, weights->elts[ i ]);
00626                 break;
00627             case HAPLO_OUTPUT_XML:
00628                 snprintf(xml_buf, 256, "%.3f", weights->elts[ i ]);
00629                 xml_node = XMLNewChild(xml_root, "weight", xml_buf);
00630                 snprintf(xml_buf, 256, "%d", i+1);
00631                 XMLNewProp(xml_node, "number", xml_buf);
00632                 break;
00633         }
00634     }
00635 
00636     if ((err = close_output(fp, xml_doc, weights_out_fname)))
00637     {
00638         print_error_msg_exit("haplo-cluster", err->msg);
00639     }
00640 }
00641 
00643 static void write_responses_header
00644 (
00645     const Matblock_u8* ids,
00646     const Vector_u32*  labels,
00647     const Matrix_d*    responses,
00648     FILE*              fp
00649 )
00650 {
00651     uint32_t i, j;
00652 
00653     if (!opts.header_out || opts.output_format == HAPLO_OUTPUT_XML)
00654         return;
00655 
00656     if (ids)
00657     {
00658         for (i = 0; i < ids->num_rows; i++)
00659         {
00660             switch (opts.output_format)
00661             {
00662                 case HAPLO_OUTPUT_TXT:
00663                     fprintf(fp, "ID %-7d  ", i+1);
00664                     break;
00665                 case HAPLO_OUTPUT_CSV:
00666                     fprintf(fp, "ID %d,", i+1);
00667                     break;
00668                 case HAPLO_OUTPUT_XML:
00669                     break;
00670             }
00671         }
00672     }
00673 
00674     if (labels)
00675     {
00676         switch (opts.output_format)
00677         {
00678             case HAPLO_OUTPUT_TXT:
00679                 fprintf(fp, "%-10s  ", "Label");
00680                 break;
00681             case HAPLO_OUTPUT_CSV:
00682                 fprintf(fp, "%s,", "Label");
00683                 break;
00684             case HAPLO_OUTPUT_XML:
00685                 break;
00686         }
00687     }
00688 
00689     switch (opts.output_format)
00690     {
00691         case HAPLO_OUTPUT_TXT:
00692             fprintf(fp, "%8s %2d", "Response", 1);
00693             for (j = 1; j < responses->num_cols; j++)
00694             {
00695                 fprintf(fp, "  %8s %2d", "Response", j+1);
00696             }
00697             break;
00698         case HAPLO_OUTPUT_CSV:
00699             fprintf(fp, "%s %d", "Response", 1);
00700             for (j = 1; j < responses->num_cols; j++)
00701             {
00702                 fprintf(fp, ",%s %d", "Response", j+1);
00703             }
00704             break;
00705         case HAPLO_OUTPUT_XML:
00706             break;
00707     }
00708     fprintf(fp, "\n");
00709 }
00710 
00712 static void write_responses
00713 (
00714     const Matblock_u8* ids,
00715     const Vector_u32*  labels,
00716     const Matrix_d*    responses
00717 )
00718 {
00719     FILE*       fp;
00720     uint32_t    i, j;
00721     xmlDoc*     xml_doc   = NULL;
00722     xmlNode*    xml_root  = NULL;
00723     xmlNode*    xml_node  = NULL;
00724     xmlNode*    xml_child = NULL;
00725     char*       xml_buf   = NULL;
00726     uint32_t    xml_len;
00727     Error*      err;
00728 
00729     if ((err = open_output(&fp, &xml_doc, "haplo-cluster-responses-out",
00730                     "haplo-cluster-responses-out.dtd", responses_out_fname)))
00731     {
00732         print_error_msg_exit("haplo-cluster", err->msg);
00733     }
00734 
00735     write_responses_header(ids, labels, responses, fp);
00736 
00737     if (opts.output_format == HAPLO_OUTPUT_XML)
00738     {
00739         xml_root = xmlDocGetRootElement(xml_doc);
00740     }
00741 
00742     for (i = 0; i < responses->num_rows; i++)
00743     {
00744         if (opts.output_format == HAPLO_OUTPUT_XML)
00745         {
00746             xml_node = XMLNewChild(xml_root, "sample", NULL);
00747             xml_len = 256;
00748             xml_buf = malloc(xml_len*sizeof(char));
00749             snprintf(xml_buf, xml_len, "%d", i+1);
00750             XMLNewProp(xml_node, "number", xml_buf);
00751             free(xml_buf);
00752         }
00753 
00754         write_ids(ids, i, HAPLO_SEP_SUFFIX, fp, xml_node);
00755         write_label(labels, i, HAPLO_SEP_SUFFIX, fp, xml_node);
00756 
00757         switch (opts.output_format)
00758         {
00759             case HAPLO_OUTPUT_TXT:
00760                 fprintf(fp, "%-11.4f", responses->elts[ i ][ 0 ]);
00761                 for (j = 1; j < responses->num_cols; j++)
00762                 {
00763                     fprintf(fp, "  %-11.4f", responses->elts[ i ][ j ]);
00764                 }
00765                 fprintf(fp, "\n");
00766                 break;
00767             case HAPLO_OUTPUT_CSV:
00768                 fprintf(fp, "%.3f", responses->elts[ i ][ 0 ]);
00769                 for (j = 1; j < responses->num_cols; j++)
00770                 {
00771                     fprintf(fp, ",%.4f", responses->elts[ i ][ j ]);
00772                 }
00773                 fprintf(fp, "\n");
00774                 break;
00775             case HAPLO_OUTPUT_XML:
00776                 xml_len = 256;
00777                 xml_buf = malloc(xml_len*sizeof(char));
00778                 for (j = 0; j < responses->num_cols; j++)
00779                 {
00780                     snprintf(xml_buf, xml_len, "%.4f", responses->elts[i][j]);
00781                     xml_child = XMLNewChild(xml_node, "response", xml_buf);
00782                     snprintf(xml_buf, xml_len, "%d", j+1);
00783                     XMLNewProp(xml_child, "number", xml_buf);
00784                 }
00785                 free(xml_buf);
00786                 break;
00787         }
00788     }
00789 
00790     if ((err = close_output(fp, xml_doc, responses_out_fname)))
00791     {
00792         print_error_msg_exit("haplo-cluster", err->msg);
00793     }
00794 }
00795 
00797 static void write_kmeans_results
00798 (
00799     const Matblock_u8* ids,
00800     const Vector_u32*  labels,
00801     const Matrix_d*    means, 
00802     const Vector_i32*  members
00803 )
00804 {
00805     write_means(means);
00806     write_members(ids, labels, members);
00807 }
00808 
00810 static void assign_gmm_members
00811 (
00812     Vector_i32**    members_out,
00813     const Matrix_d* responses
00814 )
00815 {
00816     uint32_t i, j, k;
00817 
00818     create_vector_i32(members_out, responses->num_rows);
00819 
00820     for (i = 0; i < responses->num_rows; i++)
00821     {
00822         k = 0;
00823         for (j = 1; j < responses->num_cols; j++)
00824         {
00825             if (responses->elts[ i ][ j ] > responses->elts[ i ][ k ])
00826             {
00827                 k = j;
00828             }
00829         }
00830 
00831         (*members_out)->elts[ i ] = k;
00832     }
00833 }
00834 
00836 static void assign_mmm_members
00837 (
00838     Vector_i32**    members_out,
00839     const Matrix_d* responses
00840 )
00841 {
00842     uint32_t i, j, k;
00843 
00844     create_vector_i32(members_out, responses->num_rows);
00845 
00846     for (i = 0; i < responses->num_rows; i++)
00847     {
00848         k = 0;
00849         for (j = 1; j < responses->num_cols; j++)
00850         {
00851             if (responses->elts[ i ][ j ] > responses->elts[ i ][ k ])
00852             {
00853                 k = j;
00854             }
00855         }
00856 
00857         (*members_out)->elts[ i ] = k;
00858     }
00859 }
00860 
00862 static void assign_mmm_aux_members
00863 (
00864     Vector_i32**        members_out,
00865     const Matblock_d*   means, 
00866     const Vector_d*     weights,
00867     const Matblock_u32* markers
00868 )
00869 {
00870     uint32_t k, K;
00871     uint32_t n, N;
00872     uint32_t d, D;
00873     uint32_t m, M;
00874     double   p_min;
00875     double   p, q;
00876 
00877     Matrix_u32* x_n  = NULL;
00878     Matrix_d*   mu_k = NULL;
00879 
00880     K = weights->num_elts;
00881     N = markers->num_mats;
00882     D = markers->num_rows;
00883     M = markers->num_cols;
00884 
00885     create_vector_i32(members_out, N);
00886 
00887     for (n = 0; n < N; n++)
00888     {
00889         p_min = 0;
00890         copy_matrix_from_matblock_u32(&x_n, markers, MATBLOCK_MAT_MATRIX, n);
00891 
00892         for (k = 0; k < K; k++)
00893         {
00894             copy_matrix_from_matblock_d(&mu_k, means, MATBLOCK_MAT_MATRIX, k);
00895 
00896             q = 0.0;
00897             for (d = 0; d < D; d++)
00898             {
00899                 for (m = 0; m < M; m++)
00900                 {
00901                     if (x_n->elts[ d ][ m ])
00902                     {
00903                         q += log((mu_k->elts[ d ][ m ] < JWSC_MIN_LOG_ARG) ? JWSC_MIN_LOG_ARG : mu_k->elts[ d ][ m ]);
00904                     }
00905                 }
00906             }
00907 
00908             p = log((weights->elts[ k ] < JWSC_MIN_LOG_ARG) ? JWSC_MIN_LOG_ARG : weights->elts[ k ]) + q;
00909 #if defined(JWSC_HAVE_ISNAN) && defined(JWSC_HAVE_ISINF)
00910             assert(!isnan(p) && !isinf(p));
00911 #endif
00912 
00913             if (p < p_min)
00914             {
00915                 p_min = p;
00916                 (*members_out)->elts[ n ] = k;
00917             }
00918         }
00919     }
00920 
00921     free_matrix_u32(x_n);
00922     free_matrix_d(mu_k);
00923 }
00924 
00926 static void write_gmm_results
00927 (
00928     const Matblock_u8* ids,
00929     const Vector_u32*  labels,
00930     const Matrix_d*    means, 
00931     const Matblock_d*  covars, 
00932     const Vector_d*    weights,
00933     const Matrix_d*    responses,
00934     const Vector_i32*  members
00935 )
00936 {
00937     write_means(means);
00938     write_weights(weights);
00939     write_responses(ids, labels, responses);
00940     write_members(ids, labels, members);
00941 }
00942 
00944 static void write_mmm_results
00945 (
00946     const Matblock_u8* ids,
00947     const Vector_u32*  labels,
00948     const Matblock_d*  means, 
00949     const Vector_d*    weights,
00950     const Matrix_d*    responses,
00951     const Vector_i32*  members
00952 )
00953 {
00954     write_mmm_means(means);
00955     write_weights(weights);
00956     write_responses(ids, labels, responses);
00957     write_members(ids, labels, members);
00958 }
00959 
00961 static void write_mmm_aux_results
00962 (
00963     const Matblock_u8* ids,
00964     const Vector_u32*  labels,
00965     const Vector_i32*  members
00966 )
00967 {
00968     write_members(ids, labels, members);
00969 }
00970 
00971 static void cluster_kmeans
00972 (
00973     const Matblock_u8* ids, 
00974     const Vector_u32*  labels,
00975     const Matrix_i32*  markers
00976 )
00977 {
00978     uint32_t num_rows, num_cols;
00979     uint32_t row, col;
00980 
00981     Matrix_d*    markers_d = NULL;
00982     Matrix_d*    means     = NULL;
00983     Vector_i32*  members   = NULL;
00984 
00985     num_rows = markers->num_rows;
00986     num_cols = markers->num_cols;
00987     create_matrix_d(&markers_d, num_rows, num_cols);
00988 
00989     for (row = 0; row < num_rows; row++)
00990     {
00991         for (col = 0; col < num_cols; col++)
00992         {
00993             markers_d->elts[ row ][ col ] = (double)markers->elts[ row ][ col ];
00994         }
00995     }
00996 
00997     kmeans_d(&means, &members, markers_d, num_clusters, 1);
00998     write_kmeans_results(ids, labels, means, members);
00999 
01000     free_matrix_d(markers_d);
01001     free_matrix_d(means);
01002     free_vector_i32(members);
01003 }
01004 
01005 static void cluster_gmm
01006 (
01007     const Matblock_u8* ids, 
01008     const Vector_u32*  labels,
01009     const Matrix_i32*  markers
01010 )
01011 {
01012     uint32_t num_rows, num_cols;
01013     uint32_t row, col;
01014 
01015     Matrix_d*    markers_d = NULL;
01016     Matrix_d*    means     = NULL;
01017     Matblock_d*  covars    = NULL;
01018     Vector_d*    weights   = NULL;
01019     Matrix_d*    responses = NULL;
01020     Vector_i32*  members   = NULL;
01021 
01022     num_rows = markers->num_rows;
01023     num_cols = markers->num_cols;
01024     create_matrix_d(&markers_d, num_rows, num_cols);
01025 
01026     for (row = 0; row < num_rows; row++)
01027     {
01028         for (col = 0; col < num_cols; col++)
01029         {
01030             markers_d->elts[ row ][ col ] = (double)markers->elts[ row ][ col ];
01031         }
01032     }
01033 
01034     train_gmm_d(&means, &covars, &weights, &responses, markers_d, num_clusters);
01035     assign_gmm_members(&members, responses);
01036     write_gmm_results(ids, labels, means, covars, weights, responses, members);
01037 
01038     free_matrix_d(markers_d);
01039     free_matrix_d(means);
01040     free_matblock_d(covars);
01041     free_vector_d(weights);
01042     free_matrix_d(responses);
01043     free_vector_i32(members);
01044 }
01045 
01046 static void cluster_mmm
01047 (
01048     const Matblock_u8* ids, 
01049     const Vector_u32*  labels,
01050     const Matrix_i32*  markers,
01051     const Matblock_u8* aux_ids, 
01052     const Vector_u32*  aux_labels,
01053     const Matrix_i32*  aux_markers
01054 )
01055 {
01056     uint32_t s, num_samples;
01057     uint32_t m, num_markers;
01058     uint32_t num_marker_vals;
01059 
01060     Matblock_u32* markers_d = NULL;
01061     Matblock_d*   means     = NULL;
01062     Vector_d*     weights   = NULL;
01063     Matrix_d*     responses = NULL;
01064     Vector_i32*   members   = NULL;
01065 
01066     num_marker_vals = 0; 
01067     num_samples = markers->num_rows;
01068     num_markers = markers->num_cols;
01069 
01070     for (s = 0; s < num_samples; s++)
01071     {
01072         for (m = 0; m < num_markers; m++)
01073         {
01074             if (markers->elts[ s ][ m ] >= num_marker_vals)
01075             {
01076                 num_marker_vals = markers->elts[ s ][ m ]+1;
01077             }
01078         }
01079     }
01080     assert(num_marker_vals);
01081 
01082     create_zero_matblock_u32(&markers_d, num_samples, num_markers,
01083             num_marker_vals);
01084     for (s = 0; s < num_samples; s++)
01085     {
01086         for (m = 0; m < num_markers; m++)
01087         {
01088             markers_d->elts[ s ][ m ][ markers->elts[s][m] ] = 1;
01089         }
01090     }
01091 
01092     // DEBUG
01093     for (s = 0; s < num_samples; s++)
01094     {
01095         for (m = 0; m < num_markers; m++)
01096         {
01097             uint32_t v, sum = 0;
01098             for (v = 0; v < num_marker_vals; v++)
01099             {
01100                 if (markers_d->elts[ s ][ m ][ v ])
01101                 {
01102                     sum++;
01103                 }
01104             }
01105             assert(sum == 1);
01106         }
01107     }
01108 
01109     train_mmm_d(&means, &weights, &responses, markers_d, num_clusters);
01110     assign_mmm_members(&members, responses);
01111     write_mmm_results(ids, labels, means, weights, responses, members);
01112 
01113     if (aux_markers)
01114     {
01115         num_samples = aux_markers->num_rows;
01116         assert(num_markers == aux_markers->num_cols);
01117 
01118         create_zero_matblock_u32(&markers_d, num_samples, num_markers,
01119                 num_marker_vals);
01120         for (s = 0; s < num_samples; s++)
01121         {
01122             for (m = 0; m < num_markers; m++)
01123             {
01124                 markers_d->elts[ s ][ m ][ aux_markers->elts[s][m] ] = 1;
01125             }
01126         }
01127 
01128         assign_mmm_aux_members(&members, means, weights, markers_d);
01129         write_mmm_aux_results(aux_ids, aux_labels, members);
01130     }
01131 
01132     free_matblock_u32(markers_d);
01133     free_matblock_d(means);
01134     free_vector_d(weights);
01135     free_matrix_d(responses);
01136     free_vector_i32(members);
01137 }
01138 
01140 int main(int argc, const char** argv)
01141 {
01142     int         argi;
01143     const char* data_fname = "/dev/stdin";
01144     Error*      err;
01145 
01146     Matblock_u8* ids       = NULL;
01147     Vector_u32*  labels    = NULL;
01148     Matrix_i32*  markers   = NULL;
01149 
01150     Matblock_u8* aux_ids       = NULL;
01151     Vector_u32*  aux_labels    = NULL;
01152     Matrix_i32*  aux_markers   = NULL;
01153 
01154     init_cluster_options();
01155 
01156     if ((err = process_options(argc, argv, &argi, NUM_OPTS_NO_ARG, opts_no_arg,
01157                     NUM_OPTS_WITH_ARG, opts_with_arg)) != NULL)
01158     {
01159         print_error_msg_exit("haplo-cluster", err->msg);
01160     }
01161 
01162     if ((argc - argi) == 1)
01163     {
01164         data_fname = argv[ argi ];
01165     }
01166 
01167     if ((err = read_haplo_groups(opts.labels_fname)))
01168     {
01169         print_error_msg_exit("haplo-cluster", err->msg);
01170     }
01171 
01172     if ((err = read_input(&ids, &labels, &markers, data_fname)))
01173     {
01174         print_error_msg_exit("haplo-cluster", err->msg);
01175     }
01176 
01177     if (opts.aux_input_fname)
01178     {
01179         if ((err = read_aux_input(&aux_ids, &aux_labels, &aux_markers,
01180                         opts.aux_input_fname)))
01181         {
01182             print_error_msg_exit("haplo-impute", err->msg);
01183         }
01184     }
01185 
01186     switch (cluster_type)
01187     {
01188         case HAPLO_CLUSTER_KMEANS:
01189             cluster_kmeans(ids, labels, markers);
01190             break;
01191         case HAPLO_CLUSTER_GMM:
01192             cluster_gmm(ids, labels, markers);
01193             break;
01194         case HAPLO_CLUSTER_MMM:
01195             cluster_mmm(ids, labels, markers, aux_ids, aux_labels, aux_markers);
01196             break;
01197     }
01198 
01199     free_matblock_u8(ids);
01200     free_vector_u32(labels);
01201     free_matrix_i32(markers);
01202 
01203     if (get_num_unhandled_errors() > 0)
01204     {
01205         print_error_msg_exit("haplo-cluster", "Unhandled errors");
01206     }
01207 
01208     return EXIT_SUCCESS;
01209 }