Haplo Prediction
predict haplogroups
|
00001 /* 00002 * This work is licensed under a Creative Commons 00003 * Attribution-Noncommercial-Share Alike 3.0 United States License. 00004 * 00005 * http://creativecommons.org/licenses/by-nc-sa/3.0/us/ 00006 * 00007 * You are free: 00008 * 00009 * to Share - to copy, distribute, display, and perform the work 00010 * to Remix - to make derivative works 00011 * 00012 * Under the following conditions: 00013 * 00014 * Attribution. You must attribute the work in the manner specified by the 00015 * author or licensor (but not in any way that suggests that they endorse you 00016 * or your use of the work). 00017 * 00018 * Noncommercial. You may not use this work for commercial purposes. 00019 * 00020 * Share Alike. If you alter, transform, or build upon this work, you may 00021 * distribute the resulting work only under the same or similar license to 00022 * this one. 00023 * 00024 * For any reuse or distribution, you must make clear to others the license 00025 * terms of this work. The best way to do this is by including this header. 00026 * 00027 * Any of the above conditions can be waived if you get permission from the 00028 * copyright holder. 00029 * 00030 * Apart from the remix rights granted under this license, nothing in this 00031 * license impairs or restricts the author's moral rights. 
00032 */ 00033 00034 00055 #include <config.h> 00056 00057 #include <stdlib.h> 00058 #include <stdio.h> 00059 #include <string.h> 00060 #include <assert.h> 00061 #include <inttypes.h> 00062 #include <math.h> 00063 #include <unistd.h> 00064 00065 #include <libxml/tree.h> 00066 00067 #ifdef HAPLO_HAVE_DMALLOC 00068 #include <dmalloc.h> 00069 #endif 00070 00071 #include <jwsc/base/error.h> 00072 #include <jwsc/base/option.h> 00073 #include <jwsc/base/file_io.h> 00074 #include <jwsc/vector/vector.h> 00075 #include <jwsc/vector/vector_math.h> 00076 #include <jwsc/matrix/matrix.h> 00077 #include <jwsc/matrix/matrix_math.h> 00078 #include <jwsc/matblock/matblock.h> 00079 #include <jwsc/stat/stat.h> 00080 00081 #include "haplo_groups.h" 00082 #include "options.h" 00083 #include "output.h" 00084 #include "input.h" 00085 #include "xml.h" 00086 #include "nb_freq.h" 00087 #include "nb_gauss.h" 00088 #include "nb_gmm.h" 00089 #include "mv_gmm.h" 00090 #ifdef HAPLO_ENABLE_SVM 00091 #include "svm_tree.h" 00092 #endif 00093 #ifdef HAPLO_ENABLE_WEKA 00094 #include "weka.h" 00095 #endif 00096 #include "nearest.h" 00097 00098 00099 #ifdef HAPLO_ENABLE_SVM 00100 #define NUM_SVM_OPTS 8 00101 #else 00102 #define NUM_SVM_OPTS 0 00103 #endif 00104 00105 #ifdef HAPLO_ENABLE_WEKA 00106 #define NUM_WEKA_OPTS 16 00107 #else 00108 #define NUM_WEKA_OPTS 0 00109 #endif 00110 00111 #define NUM_OPTS_NO_ARG 2 + NUM_SHARED_OPTS_NO_ARG 00112 #define NUM_OPTS_WITH_ARG 54 + NUM_SVM_OPTS + NUM_WEKA_OPTS + NUM_SHARED_OPTS_WITH_ARG 00113 00114 00115 #define LABEL_COL 1 00116 #define TANDEM 0 00117 #define TEST_TYPE HAPLO_TEST_LEAVE_ONE_OUT 00118 #define NUM_CV_FOLDS 5 00119 #define NUM_CV_ITERS 10 00120 #define TMP_DIRNAME "/tmp" 00121 #define NB_FREQ_SUMMARY_FNAME "/dev/stdout" 00122 #define NB_FREQ_DETAILS_CNT_FNAME "/dev/null" 00123 #define NB_FREQ_DETAILS_PCT_FNAME "/dev/null" 00124 #define NB_FREQ_CONFUSION_CNT_FNAME "/dev/null" 00125 #define NB_FREQ_CONFUSION_PCT_FNAME "/dev/null" 00126 #define 
NB_FREQ_PREDS_FNAME "/dev/null" 00127 #define NB_GAUSS_SUMMARY_FNAME "/dev/stdout" 00128 #define NB_GAUSS_DETAILS_CNT_FNAME "/dev/null" 00129 #define NB_GAUSS_DETAILS_PCT_FNAME "/dev/null" 00130 #define NB_GAUSS_CONFUSION_CNT_FNAME "/dev/null" 00131 #define NB_GAUSS_CONFUSION_PCT_FNAME "/dev/null" 00132 #define NB_GAUSS_PREDS_FNAME "/dev/null" 00133 #define NB_GMM_SUMMARY_FNAME "/dev/stdout" 00134 #define NB_GMM_DETAILS_CNT_FNAME "/dev/null" 00135 #define NB_GMM_DETAILS_PCT_FNAME "/dev/null" 00136 #define NB_GMM_CONFUSION_CNT_FNAME "/dev/null" 00137 #define NB_GMM_CONFUSION_PCT_FNAME "/dev/null" 00138 #define NB_GMM_PREDS_FNAME "/dev/null" 00139 #define MV_GMM_SUMMARY_FNAME "/dev/stdout" 00140 #define MV_GMM_DETAILS_CNT_FNAME "/dev/null" 00141 #define MV_GMM_DETAILS_PCT_FNAME "/dev/null" 00142 #define MV_GMM_CONFUSION_CNT_FNAME "/dev/null" 00143 #define MV_GMM_CONFUSION_PCT_FNAME "/dev/null" 00144 #define MV_GMM_PREDS_FNAME "/dev/null" 00145 #define SVM_SUMMARY_FNAME "/dev/stdout" 00146 #define SVM_DETAILS_CNT_FNAME "/dev/null" 00147 #define SVM_DETAILS_PCT_FNAME "/dev/null" 00148 #define SVM_CONFUSION_CNT_FNAME "/dev/null" 00149 #define SVM_CONFUSION_PCT_FNAME "/dev/null" 00150 #define SVM_PREDS_FNAME "/dev/null" 00151 #define WEKA_J48_SUMMARY_FNAME "/dev/stdout" 00152 #define WEKA_J48_DETAILS_CNT_FNAME "/dev/null" 00153 #define WEKA_J48_DETAILS_PCT_FNAME "/dev/null" 00154 #define WEKA_J48_CONFUSION_CNT_FNAME "/dev/null" 00155 #define WEKA_J48_CONFUSION_PCT_FNAME "/dev/null" 00156 #define WEKA_J48_PREDS_FNAME "/dev/null" 00157 #define WEKA_PART_SUMMARY_FNAME "/dev/stdout" 00158 #define WEKA_PART_DETAILS_CNT_FNAME "/dev/null" 00159 #define WEKA_PART_DETAILS_PCT_FNAME "/dev/null" 00160 #define WEKA_PART_CONFUSION_CNT_FNAME "/dev/null" 00161 #define WEKA_PART_CONFUSION_PCT_FNAME "/dev/null" 00162 #define WEKA_PART_PREDS_FNAME "/dev/null" 00163 #define NEAREST_SUMMARY_FNAME "/dev/stdout" 00164 #define NEAREST_DETAILS_CNT_FNAME "/dev/null" 00165 #define 
NEAREST_DETAILS_PCT_FNAME "/dev/null" 00166 #define NEAREST_CONFUSION_CNT_FNAME "/dev/null" 00167 #define NEAREST_CONFUSION_PCT_FNAME "/dev/null" 00168 #define NEAREST_PREDS_FNAME "/dev/null" 00169 #define TANDEM_AGREE_SUMMARY_FNAME "/dev/stdout" 00170 #define TANDEM_AGREE_DETAILS_CNT_FNAME "/dev/null" 00171 #define TANDEM_AGREE_DETAILS_PCT_FNAME "/dev/null" 00172 #define TANDEM_SUMMARY_FNAME "/dev/stdout" 00173 #define TANDEM_DETAILS_CNT_FNAME "/dev/null" 00174 #define TANDEM_DETAILS_PCT_FNAME "/dev/null" 00175 #define TANDEM_CONFUSION_CNT_FNAME "/dev/null" 00176 #define TANDEM_CONFUSION_PCT_FNAME "/dev/null" 00177 #define TANDEM_PREDS_FNAME "/dev/null" 00178 00179 00181 typedef enum 00182 { 00183 HAPLO_TEST_LEAVE_ONE_OUT, 00184 HAPLO_TEST_CROSS_VALIDATE 00185 } 00186 Haplo_test_type; 00187 00188 00190 Option_no_arg opts_no_arg[NUM_OPTS_NO_ARG]; 00191 00193 Option_with_arg opts_with_arg[NUM_OPTS_WITH_ARG]; 00194 00196 static uint8_t tandem = TANDEM; 00197 00199 static Haplo_test_type test_type = TEST_TYPE; 00200 00202 static uint32_t num_cv_folds = NUM_CV_FOLDS; 00203 00205 static uint32_t num_cv_iters = NUM_CV_ITERS; 00206 00208 static const char* tmp_dirname = TMP_DIRNAME; 00209 00214 static const char* nb_freq_summary_fname = NB_FREQ_SUMMARY_FNAME; 00215 00220 static const char* nb_freq_details_cnt_fname = NB_FREQ_DETAILS_CNT_FNAME; 00221 00226 static const char* nb_freq_details_pct_fname = NB_FREQ_DETAILS_PCT_FNAME; 00227 00229 static const char* nb_freq_confusion_cnt_fname = NB_FREQ_CONFUSION_CNT_FNAME; 00230 00232 static const char* nb_freq_confusion_pct_fname = NB_FREQ_CONFUSION_PCT_FNAME; 00233 00235 static const char* nb_freq_preds_fname = NB_FREQ_PREDS_FNAME; 00236 00241 static const char* nb_gauss_summary_fname = NB_GAUSS_SUMMARY_FNAME; 00242 00247 static const char* nb_gauss_details_cnt_fname = NB_GAUSS_DETAILS_CNT_FNAME; 00248 00253 static const char* nb_gauss_details_pct_fname = NB_GAUSS_DETAILS_PCT_FNAME; 00254 00256 static const char* 
nb_gauss_confusion_cnt_fname = NB_GAUSS_CONFUSION_CNT_FNAME; 00257 00259 static const char* nb_gauss_confusion_pct_fname = NB_GAUSS_CONFUSION_PCT_FNAME; 00260 00262 static const char* nb_gauss_preds_fname = NB_GAUSS_PREDS_FNAME; 00263 00268 static const char* nb_gmm_summary_fname = NB_GMM_SUMMARY_FNAME; 00269 00274 static const char* nb_gmm_details_cnt_fname = NB_GMM_DETAILS_CNT_FNAME; 00275 00280 static const char* nb_gmm_details_pct_fname = NB_GMM_DETAILS_PCT_FNAME; 00281 00286 static const char* nb_gmm_confusion_cnt_fname = NB_GMM_CONFUSION_CNT_FNAME; 00287 00292 static const char* nb_gmm_confusion_pct_fname = NB_GMM_CONFUSION_PCT_FNAME; 00293 00297 static const char* nb_gmm_preds_fname = NB_GMM_PREDS_FNAME; 00298 00303 static const char* mv_gmm_summary_fname = MV_GMM_SUMMARY_FNAME; 00304 00309 static const char* mv_gmm_details_cnt_fname = MV_GMM_DETAILS_CNT_FNAME; 00310 00315 static const char* mv_gmm_details_pct_fname = MV_GMM_DETAILS_PCT_FNAME; 00316 00321 static const char* mv_gmm_confusion_cnt_fname = MV_GMM_CONFUSION_CNT_FNAME; 00322 00327 static const char* mv_gmm_confusion_pct_fname = MV_GMM_CONFUSION_PCT_FNAME; 00328 00332 static const char* mv_gmm_preds_fname = MV_GMM_PREDS_FNAME; 00333 00335 static const char* svm_summary_fname = SVM_SUMMARY_FNAME; 00336 00338 static const char* svm_details_cnt_fname = SVM_DETAILS_CNT_FNAME; 00339 00341 static const char* svm_details_pct_fname = SVM_DETAILS_PCT_FNAME; 00342 00344 static const char* svm_confusion_cnt_fname = SVM_CONFUSION_CNT_FNAME; 00345 00347 static const char* svm_confusion_pct_fname = SVM_CONFUSION_PCT_FNAME; 00348 00350 static const char* svm_preds_fname = SVM_PREDS_FNAME; 00351 00353 static const char* weka_j48_summary_fname = WEKA_J48_SUMMARY_FNAME; 00354 00356 static const char* weka_j48_details_cnt_fname = WEKA_J48_DETAILS_CNT_FNAME; 00357 00362 static const char* weka_j48_details_pct_fname = WEKA_J48_DETAILS_PCT_FNAME; 00363 00365 static const char* weka_j48_confusion_cnt_fname = 
WEKA_J48_CONFUSION_CNT_FNAME; 00366 00368 static const char* weka_j48_confusion_pct_fname = WEKA_J48_CONFUSION_PCT_FNAME; 00369 00371 static const char* weka_j48_preds_fname = WEKA_J48_PREDS_FNAME; 00372 00374 static const char* weka_part_summary_fname = WEKA_PART_SUMMARY_FNAME; 00375 00379 static const char* weka_part_details_cnt_fname = WEKA_PART_DETAILS_CNT_FNAME; 00380 00385 static const char* weka_part_details_pct_fname = WEKA_PART_DETAILS_PCT_FNAME; 00386 00388 static const char* weka_part_confusion_cnt_fname = WEKA_PART_CONFUSION_CNT_FNAME; 00389 00391 static const char* weka_part_confusion_pct_fname = WEKA_PART_CONFUSION_PCT_FNAME; 00392 00394 static const char* weka_part_preds_fname = WEKA_PART_PREDS_FNAME; 00395 00397 static const char* nearest_summary_fname = NEAREST_SUMMARY_FNAME; 00398 00402 static const char* nearest_details_cnt_fname = NEAREST_DETAILS_CNT_FNAME; 00403 00408 static const char* nearest_details_pct_fname = NEAREST_DETAILS_PCT_FNAME; 00409 00411 static const char* nearest_confusion_cnt_fname = NEAREST_CONFUSION_CNT_FNAME; 00412 00414 static const char* nearest_confusion_pct_fname = NEAREST_CONFUSION_PCT_FNAME; 00415 00417 static const char* nearest_preds_fname = NEAREST_PREDS_FNAME; 00418 00420 static const char* tandem_agree_summary_fname = TANDEM_AGREE_SUMMARY_FNAME; 00421 00426 static const char* tandem_agree_details_cnt_fname = TANDEM_AGREE_DETAILS_CNT_FNAME; 00427 00432 static const char* tandem_agree_details_pct_fname = TANDEM_AGREE_DETAILS_PCT_FNAME; 00433 00435 static const char* tandem_summary_fname = TANDEM_SUMMARY_FNAME; 00436 00438 static const char* tandem_details_cnt_fname = TANDEM_DETAILS_CNT_FNAME; 00439 00441 static const char* tandem_details_pct_fname = TANDEM_DETAILS_PCT_FNAME; 00442 00444 static const char* tandem_confusion_cnt_fname = TANDEM_CONFUSION_CNT_FNAME; 00445 00447 static const char* tandem_confusion_pct_fname = TANDEM_CONFUSION_PCT_FNAME; 00448 00450 static const char* tandem_preds_fname = 
TANDEM_PREDS_FNAME; 00451 00452 00454 uint32_t get_num_opts_no_arg() 00455 { 00456 return NUM_OPTS_NO_ARG; 00457 } 00458 00460 uint32_t get_num_opts_with_arg() 00461 { 00462 return NUM_OPTS_WITH_ARG; 00463 } 00464 00466 void print_usage() 00467 { 00468 fprintf(stderr, "usage: haplo-test OPTIONS [data-fname | <stdin>]\n"); 00469 print_options(stderr, 27, NUM_OPTS_NO_ARG, opts_no_arg, NUM_OPTS_WITH_ARG, 00470 opts_with_arg); 00471 } 00472 00474 static Error* process_tandem_opt() 00475 { 00476 tandem = 1; 00477 return NULL; 00478 } 00479 00481 static Error* process_test_type_opt(Option_arg arg) 00482 { 00483 if (arg == NULL) 00484 { 00485 return JWSC_EARG("Option 'test-type' requires an argument"); 00486 } 00487 if (strncmp(arg, "loo", 13) == 0) 00488 { 00489 test_type = HAPLO_TEST_LEAVE_ONE_OUT; 00490 } 00491 else if (strncmp(arg, "cv", 2) == 0) 00492 { 00493 test_type = HAPLO_TEST_CROSS_VALIDATE; 00494 } 00495 else 00496 { 00497 return JWSC_EARG("Option 'test-type' must be one of {loo, cv}"); 00498 } 00499 return NULL; 00500 } 00501 00503 static Error* process_num_cv_folds_opt(Option_arg arg) 00504 { 00505 if (arg == NULL) 00506 { 00507 return JWSC_EARG("Option 'num-cv-folds' requires an argument"); 00508 } 00509 if (sscanf(arg, "%u", &num_cv_folds) != 1 || num_cv_folds < 1) 00510 { 00511 return JWSC_EARG("Option 'num-cv-folds' must be > 0"); 00512 } 00513 00514 return NULL; 00515 } 00516 00518 static Error* process_num_cv_iters_opt(Option_arg arg) 00519 { 00520 if (arg == NULL) 00521 { 00522 return JWSC_EARG("Option 'num-cv-iters' requires an argument"); 00523 } 00524 if (sscanf(arg, "%u", &num_cv_iters) != 1 || num_cv_iters < 1) 00525 { 00526 return JWSC_EARG("Option 'num-cv-iters' must be > 0"); 00527 } 00528 00529 return NULL; 00530 } 00531 00532 Error* process_tmp_dir_opt(Option_arg arg) 00533 { 00534 if (arg == NULL) 00535 { 00536 return JWSC_EARG("Option 'tmp-dir' requires an argument"); 00537 } 00538 tmp_dirname = arg; 00539 00540 return NULL; 00541 } 00542 
00544 Error* process_nb_freq_summary_out_opt(Option_arg arg) 00545 { 00546 if (arg == NULL) 00547 { 00548 return JWSC_EARG("Option 'nb-freq-summary-out' requires an argument"); 00549 } 00550 nb_freq_summary_fname = arg; 00551 return NULL; 00552 } 00553 00555 Error* process_nb_freq_details_pct_out_opt(Option_arg arg) 00556 { 00557 if (arg == NULL) 00558 { 00559 return JWSC_EARG("Option 'nb-freq-details-pct-out' requires an argument"); 00560 } 00561 nb_freq_details_pct_fname = arg; 00562 return NULL; 00563 } 00564 00566 Error* process_nb_freq_details_cnt_out_opt(Option_arg arg) 00567 { 00568 if (arg == NULL) 00569 { 00570 return JWSC_EARG("Option 'nb-freq-details-cnt-out' requires an argument"); 00571 } 00572 nb_freq_details_cnt_fname = arg; 00573 return NULL; 00574 } 00575 00577 Error* process_nb_freq_confusion_cnt_out_opt(Option_arg arg) 00578 { 00579 if (arg == NULL) 00580 { 00581 return JWSC_EARG("Option 'nb-freq-confusion-cnt-out' requires an argument"); 00582 } 00583 nb_freq_confusion_cnt_fname = arg; 00584 return NULL; 00585 } 00586 00588 Error* process_nb_freq_confusion_pct_out_opt(Option_arg arg) 00589 { 00590 if (arg == NULL) 00591 { 00592 return JWSC_EARG("Option 'nb-freq-confusion-pct-out' requires an argument"); 00593 } 00594 nb_freq_confusion_pct_fname = arg; 00595 return NULL; 00596 } 00597 00599 Error* process_nb_freq_preds_out_opt(Option_arg arg) 00600 { 00601 if (arg == NULL) 00602 { 00603 return JWSC_EARG("Option 'nb-freq-preds-out' requires an argument"); 00604 } 00605 nb_freq_preds_fname = arg; 00606 return NULL; 00607 } 00608 00610 Error* process_nb_gauss_summary_out_opt(Option_arg arg) 00611 { 00612 if (arg == NULL) 00613 { 00614 return JWSC_EARG("Option 'nb-gauss-summary-out' requires an argument"); 00615 } 00616 nb_gauss_summary_fname = arg; 00617 return NULL; 00618 } 00619 00621 Error* process_nb_gauss_details_pct_out_opt(Option_arg arg) 00622 { 00623 if (arg == NULL) 00624 { 00625 return JWSC_EARG("Option 'nb-gauss-details-pct-out' requires 
an argument"); 00626 } 00627 nb_gauss_details_pct_fname = arg; 00628 return NULL; 00629 } 00630 00632 Error* process_nb_gauss_details_cnt_out_opt(Option_arg arg) 00633 { 00634 if (arg == NULL) 00635 { 00636 return JWSC_EARG("Option 'nb-gauss-details-cnt-out' requires an argument"); 00637 } 00638 nb_gauss_details_cnt_fname = arg; 00639 return NULL; 00640 } 00641 00643 Error* process_nb_gauss_confusion_cnt_out_opt(Option_arg arg) 00644 { 00645 if (arg == NULL) 00646 { 00647 return JWSC_EARG("Option 'nb-gauss-confusion-cnt-out' requires an argument"); 00648 } 00649 nb_gauss_confusion_cnt_fname = arg; 00650 return NULL; 00651 } 00652 00654 Error* process_nb_gauss_confusion_pct_out_opt(Option_arg arg) 00655 { 00656 if (arg == NULL) 00657 { 00658 return JWSC_EARG("Option 'nb-gauss-confusion-pct-out' requires an argument"); 00659 } 00660 nb_gauss_confusion_pct_fname = arg; 00661 return NULL; 00662 } 00663 00665 Error* process_nb_gauss_preds_out_opt(Option_arg arg) 00666 { 00667 if (arg == NULL) 00668 { 00669 return JWSC_EARG("Option 'nb-gauss-preds-out' requires an argument"); 00670 } 00671 nb_gauss_preds_fname = arg; 00672 return NULL; 00673 } 00674 00676 Error* process_nb_gmm_summary_out_opt(Option_arg arg) 00677 { 00678 if (arg == NULL) 00679 { 00680 return JWSC_EARG("Option 'nb-gmm-summary-out' requires an argument"); 00681 } 00682 nb_gmm_summary_fname = arg; 00683 return NULL; 00684 } 00685 00687 Error* process_nb_gmm_details_pct_out_opt(Option_arg arg) 00688 { 00689 if (arg == NULL) 00690 { 00691 return JWSC_EARG("Option 'nb-gmm-details-pct-out' requires an argument"); 00692 } 00693 nb_gmm_details_pct_fname = arg; 00694 return NULL; 00695 } 00696 00698 Error* process_nb_gmm_details_cnt_out_opt(Option_arg arg) 00699 { 00700 if (arg == NULL) 00701 { 00702 return JWSC_EARG("Option 'nb-gmm-details-cnt-out' requires an argument"); 00703 } 00704 nb_gmm_details_cnt_fname = arg; 00705 return NULL; 00706 } 00707 00709 Error* process_nb_gmm_confusion_cnt_out_opt(Option_arg 
arg) 00710 { 00711 if (arg == NULL) 00712 { 00713 return JWSC_EARG("Option 'nb-gmm-confusion-cnt-out' requires an argument"); 00714 } 00715 nb_gmm_confusion_cnt_fname = arg; 00716 return NULL; 00717 } 00718 00720 Error* process_nb_gmm_confusion_pct_out_opt(Option_arg arg) 00721 { 00722 if (arg == NULL) 00723 { 00724 return JWSC_EARG("Option 'nb-gmm-confusion-pct-out' requires an argument"); 00725 } 00726 nb_gmm_confusion_pct_fname = arg; 00727 return NULL; 00728 } 00729 00731 Error* process_nb_gmm_preds_out_opt(Option_arg arg) 00732 { 00733 if (arg == NULL) 00734 { 00735 return JWSC_EARG("Option 'nb-gmm-preds-out' requires an argument"); 00736 } 00737 nb_gmm_preds_fname = arg; 00738 return NULL; 00739 } 00740 00742 Error* process_mv_gmm_summary_out_opt(Option_arg arg) 00743 { 00744 if (arg == NULL) 00745 { 00746 return JWSC_EARG("Option 'mv-gmm-summary-out' requires an argument"); 00747 } 00748 mv_gmm_summary_fname = arg; 00749 return NULL; 00750 } 00751 00753 Error* process_mv_gmm_details_pct_out_opt(Option_arg arg) 00754 { 00755 if (arg == NULL) 00756 { 00757 return JWSC_EARG("Option 'mv-gmm-details-pct-out' requires an argument"); 00758 } 00759 mv_gmm_details_pct_fname = arg; 00760 return NULL; 00761 } 00762 00764 Error* process_mv_gmm_details_cnt_out_opt(Option_arg arg) 00765 { 00766 if (arg == NULL) 00767 { 00768 return JWSC_EARG("Option 'mv-gmm-details-cnt-out' requires an argument"); 00769 } 00770 mv_gmm_details_cnt_fname = arg; 00771 return NULL; 00772 } 00773 00775 Error* process_mv_gmm_confusion_cnt_out_opt(Option_arg arg) 00776 { 00777 if (arg == NULL) 00778 { 00779 return JWSC_EARG("Option 'mv-gmm-confusion-cnt-out' requires an argument"); 00780 } 00781 mv_gmm_confusion_cnt_fname = arg; 00782 return NULL; 00783 } 00784 00786 Error* process_mv_gmm_confusion_pct_out_opt(Option_arg arg) 00787 { 00788 if (arg == NULL) 00789 { 00790 return JWSC_EARG("Option 'mv-gmm-confusion-pct-out' requires an argument"); 00791 } 00792 mv_gmm_confusion_pct_fname = arg; 
00793 return NULL; 00794 } 00795 00797 Error* process_mv_gmm_preds_out_opt(Option_arg arg) 00798 { 00799 if (arg == NULL) 00800 { 00801 return JWSC_EARG("Option 'mv-gmm-preds-out' requires an argument"); 00802 } 00803 mv_gmm_preds_fname = arg; 00804 return NULL; 00805 } 00806 00808 Error* process_svm_summary_out_opt(Option_arg arg) 00809 { 00810 if (arg == NULL) 00811 { 00812 return JWSC_EARG("Option 'svm-summary-out' requires an argument"); 00813 } 00814 svm_summary_fname = arg; 00815 return NULL; 00816 } 00817 00819 Error* process_svm_details_pct_out_opt(Option_arg arg) 00820 { 00821 if (arg == NULL) 00822 { 00823 return JWSC_EARG("Option 'svm-details-pct-out' requires an argument"); 00824 } 00825 svm_details_pct_fname = arg; 00826 return NULL; 00827 } 00828 00830 Error* process_svm_details_cnt_out_opt(Option_arg arg) 00831 { 00832 if (arg == NULL) 00833 { 00834 return JWSC_EARG("Option 'svm-details-cnt-out' requires an argument"); 00835 } 00836 svm_details_cnt_fname = arg; 00837 return NULL; 00838 } 00839 00841 Error* process_svm_confusion_cnt_out_opt(Option_arg arg) 00842 { 00843 if (arg == NULL) 00844 { 00845 return JWSC_EARG("Option 'svm-confusion-cnt-out' requires an argument"); 00846 } 00847 svm_confusion_cnt_fname = arg; 00848 return NULL; 00849 } 00850 00852 Error* process_svm_confusion_pct_out_opt(Option_arg arg) 00853 { 00854 if (arg == NULL) 00855 { 00856 return JWSC_EARG("Option 'svm-confusion-pct-out' requires an argument"); 00857 } 00858 svm_confusion_pct_fname = arg; 00859 return NULL; 00860 } 00861 00863 Error* process_svm_preds_out_opt(Option_arg arg) 00864 { 00865 if (arg == NULL) 00866 { 00867 return JWSC_EARG("Option 'svm-preds-out' requires an argument"); 00868 } 00869 svm_preds_fname = arg; 00870 return NULL; 00871 } 00872 00874 Error* process_weka_j48_summary_out_opt(Option_arg arg) 00875 { 00876 if (arg == NULL) 00877 { 00878 return JWSC_EARG("Option 'weka-j48-summary-out' requires an argument"); 00879 } 00880 weka_j48_summary_fname = arg; 
00881 return NULL; 00882 } 00883 00885 Error* process_weka_j48_details_pct_out_opt(Option_arg arg) 00886 { 00887 if (arg == NULL) 00888 { 00889 return JWSC_EARG("Option 'weka-j48-details-pct-out' requires an argument"); 00890 } 00891 weka_j48_details_pct_fname = arg; 00892 return NULL; 00893 } 00894 00896 Error* process_weka_j48_details_cnt_out_opt(Option_arg arg) 00897 { 00898 if (arg == NULL) 00899 { 00900 return JWSC_EARG("Option 'weka-j48-details-cnt-out' requires an argument"); 00901 } 00902 weka_j48_details_cnt_fname = arg; 00903 return NULL; 00904 } 00905 00907 Error* process_weka_j48_confusion_cnt_out_opt(Option_arg arg) 00908 { 00909 if (arg == NULL) 00910 { 00911 return JWSC_EARG("Option 'weka-j48-confusion-cnt-out' requires an argument"); 00912 } 00913 weka_j48_confusion_cnt_fname = arg; 00914 return NULL; 00915 } 00916 00918 Error* process_weka_j48_confusion_pct_out_opt(Option_arg arg) 00919 { 00920 if (arg == NULL) 00921 { 00922 return JWSC_EARG("Option 'weka-j48-confusion-pct-out' requires an argument"); 00923 } 00924 weka_j48_confusion_pct_fname = arg; 00925 return NULL; 00926 } 00927 00929 Error* process_weka_j48_preds_out_opt(Option_arg arg) 00930 { 00931 if (arg == NULL) 00932 { 00933 return JWSC_EARG("Option 'weka-j48-preds-out' requires an argument"); 00934 } 00935 weka_j48_preds_fname = arg; 00936 return NULL; 00937 } 00938 00940 Error* process_weka_part_summary_out_opt(Option_arg arg) 00941 { 00942 if (arg == NULL) 00943 { 00944 return JWSC_EARG("Option 'weka-part-summary-out' requires an argument"); 00945 } 00946 weka_part_summary_fname = arg; 00947 return NULL; 00948 } 00949 00951 Error* process_weka_part_details_pct_out_opt(Option_arg arg) 00952 { 00953 if (arg == NULL) 00954 { 00955 return JWSC_EARG("Option 'weka-part-details-pct-out' requires an argument"); 00956 } 00957 weka_part_details_pct_fname = arg; 00958 return NULL; 00959 } 00960 00962 Error* process_weka_part_details_cnt_out_opt(Option_arg arg) 00963 { 00964 if (arg == NULL) 
00965 { 00966 return JWSC_EARG("Option 'weka-part-details-cnt-out' requires an argument"); 00967 } 00968 weka_part_details_cnt_fname = arg; 00969 return NULL; 00970 } 00971 00973 Error* process_weka_part_confusion_cnt_out_opt(Option_arg arg) 00974 { 00975 if (arg == NULL) 00976 { 00977 return JWSC_EARG("Option 'weka-part-confusion-cnt-out' requires an argument"); 00978 } 00979 weka_part_confusion_cnt_fname = arg; 00980 return NULL; 00981 } 00982 00984 Error* process_weka_part_confusion_pct_out_opt(Option_arg arg) 00985 { 00986 if (arg == NULL) 00987 { 00988 return JWSC_EARG("Option 'weka-part-confusion-pct-out' requires an argument"); 00989 } 00990 weka_part_confusion_pct_fname = arg; 00991 return NULL; 00992 } 00993 00995 Error* process_weka_part_preds_out_opt(Option_arg arg) 00996 { 00997 if (arg == NULL) 00998 { 00999 return JWSC_EARG("Option 'weka-part-preds-out' requires an argument"); 01000 } 01001 weka_part_preds_fname = arg; 01002 return NULL; 01003 } 01004 01006 Error* process_nearest_summary_out_opt(Option_arg arg) 01007 { 01008 if (arg == NULL) 01009 { 01010 return JWSC_EARG("Option 'nearest-summary-out' requires an argument"); 01011 } 01012 nearest_summary_fname = arg; 01013 return NULL; 01014 } 01015 01017 Error* process_nearest_details_pct_out_opt(Option_arg arg) 01018 { 01019 if (arg == NULL) 01020 { 01021 return JWSC_EARG("Option 'nearest-details-pct-out' requires an argument"); 01022 } 01023 nearest_details_pct_fname = arg; 01024 return NULL; 01025 } 01026 01028 Error* process_nearest_details_cnt_out_opt(Option_arg arg) 01029 { 01030 if (arg == NULL) 01031 { 01032 return JWSC_EARG("Option 'nearest-details-cnt-out' requires an argument"); 01033 } 01034 nearest_details_cnt_fname = arg; 01035 return NULL; 01036 } 01037 01039 Error* process_nearest_confusion_cnt_out_opt(Option_arg arg) 01040 { 01041 if (arg == NULL) 01042 { 01043 return JWSC_EARG("Option 'nearest-confusion-cnt-out' requires an argument"); 01044 } 01045 nearest_confusion_cnt_fname = 
arg; 01046 return NULL; 01047 } 01048 01050 Error* process_nearest_confusion_pct_out_opt(Option_arg arg) 01051 { 01052 if (arg == NULL) 01053 { 01054 return JWSC_EARG("Option 'nearest-confusion-pct-out' requires an argument"); 01055 } 01056 nearest_confusion_pct_fname = arg; 01057 return NULL; 01058 } 01059 01061 Error* process_nearest_preds_out_opt(Option_arg arg) 01062 { 01063 if (arg == NULL) 01064 { 01065 return JWSC_EARG("Option 'nearest-preds-out' requires an argument"); 01066 } 01067 nearest_preds_fname = arg; 01068 return NULL; 01069 } 01070 01072 Error* process_tandem_agree_summary_out_opt(Option_arg arg) 01073 { 01074 if (arg == NULL) 01075 { 01076 return JWSC_EARG("Option 'tandem-agree-summary-out' requires an argument"); 01077 } 01078 tandem_agree_summary_fname = arg; 01079 return NULL; 01080 } 01081 01083 Error* process_tandem_agree_details_pct_out_opt(Option_arg arg) 01084 { 01085 if (arg == NULL) 01086 { 01087 return JWSC_EARG("Option 'tandem-agree-details-pct-out' requires an argument"); 01088 } 01089 tandem_agree_details_pct_fname = arg; 01090 return NULL; 01091 } 01092 01094 Error* process_tandem_agree_details_cnt_out_opt(Option_arg arg) 01095 { 01096 if (arg == NULL) 01097 { 01098 return JWSC_EARG("Option 'tandem-agree-details-cnt-out' requires an argument"); 01099 } 01100 tandem_agree_details_cnt_fname = arg; 01101 return NULL; 01102 } 01103 01105 Error* process_tandem_summary_out_opt(Option_arg arg) 01106 { 01107 if (arg == NULL) 01108 { 01109 return JWSC_EARG("Option 'tandem-summary-out' requires an argument"); 01110 } 01111 tandem_summary_fname = arg; 01112 return NULL; 01113 } 01114 01116 Error* process_tandem_details_pct_out_opt(Option_arg arg) 01117 { 01118 if (arg == NULL) 01119 { 01120 return JWSC_EARG("Option 'tandem-details-pct-out' requires an argument"); 01121 } 01122 tandem_details_pct_fname = arg; 01123 return NULL; 01124 } 01125 01127 Error* process_tandem_details_cnt_out_opt(Option_arg arg) 01128 { 01129 if (arg == NULL) 01130 { 
01131 return JWSC_EARG("Option 'tandem-details-cnt-out' requires an argument"); 01132 } 01133 tandem_details_cnt_fname = arg; 01134 return NULL; 01135 } 01136 01138 Error* process_tandem_confusion_cnt_out_opt(Option_arg arg) 01139 { 01140 if (arg == NULL) 01141 { 01142 return JWSC_EARG("Option 'tandem-confusion-cnt-out' requires an argument"); 01143 } 01144 tandem_confusion_cnt_fname = arg; 01145 return NULL; 01146 } 01147 01149 Error* process_tandem_confusion_pct_out_opt(Option_arg arg) 01150 { 01151 if (arg == NULL) 01152 { 01153 return JWSC_EARG("Option 'tandem-confusion-pct-out' requires an argument"); 01154 } 01155 tandem_confusion_pct_fname = arg; 01156 return NULL; 01157 } 01158 01160 Error* process_tandem_preds_out_opt(Option_arg arg) 01161 { 01162 if (arg == NULL) 01163 { 01164 return JWSC_EARG("Option 'tandem-preds-out' requires an argument"); 01165 } 01166 tandem_preds_fname = arg; 01167 return NULL; 01168 } 01169 01171 static void init_test_options(void) 01172 { 01173 uint32_t i; 01174 01175 char s_name; 01176 const char* l_name; 01177 const char* desc; 01178 01179 Error* (*fnoarg)(); 01180 Error* (*farg)(const char*); 01181 01182 init_options(opts_no_arg, opts_with_arg); 01183 01184 opts.label_col = LABEL_COL; 01185 01186 i = NUM_SHARED_OPTS_NO_ARG; 01187 l_name = "tandem"; 01188 s_name = 0; 01189 desc = "Perform tandem classifier decision analysis."; 01190 fnoarg = process_tandem_opt; 01191 init_option_no_arg(&(opts_no_arg[i++]), l_name, s_name, desc, fnoarg); 01192 01193 l_name = "exclude-one"; 01194 s_name = 0; 01195 desc = "When performing tandem classifier analysis, exclude at most one prediction from the set of classification algorithms. 
There must be three or more algorithms in play for this to take effect."; 01196 fnoarg = process_exclude_one_opt; 01197 init_option_no_arg(&(opts_no_arg[i++]), l_name, s_name, desc, fnoarg); 01198 assert(i == NUM_OPTS_NO_ARG); 01199 01200 i = NUM_SHARED_OPTS_WITH_ARG; 01201 l_name = "test-type"; 01202 s_name = 0; 01203 desc = "Type of testing to use. Must be one of leave-one-out or cross-validate. Use one of the abbreviations {loo, cv}."; 01204 farg = process_test_type_opt; 01205 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01206 01207 l_name = "tmp-dir"; 01208 s_name = 0; 01209 desc = "Directory for temporary files, including trained models."; 01210 farg = process_tmp_dir_opt; 01211 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01212 01213 l_name = "num-cv-folds"; 01214 s_name = 0; 01215 desc = "Number of data folds to use per cross-validation iteration."; 01216 farg = process_num_cv_folds_opt; 01217 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01218 01219 l_name = "num-cv-iters"; 01220 s_name = 0; 01221 desc = "Number of cross-validation iterations."; 01222 farg = process_num_cv_iters_opt; 01223 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01224 01225 l_name = "nb-freq"; 01226 s_name = 0; 01227 desc = "Naive Bayes non-parametric frequency model tree information."; 01228 farg = process_nb_freq_opt; 01229 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01230 01231 l_name = "nb-freq-dtd"; 01232 s_name = 0; 01233 desc = "Validate the naive Bayes non-parametric frequency model tree information XML file with this DTD."; 01234 farg = process_nb_freq_dtd_opt; 01235 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01236 01237 l_name = "nb-gauss"; 01238 s_name = 0; 01239 desc = "Naive Bayes Gaussian model tree information."; 01240 farg = process_nb_gauss_opt; 01241 init_option_with_arg(&(opts_with_arg[i++]), l_name, 
s_name, desc, farg); 01242 01243 l_name = "nb-gauss-dtd"; 01244 s_name = 0; 01245 desc = "Validate the naive Bayes Gaussian model tree information XML file with this DTD."; 01246 farg = process_nb_gauss_dtd_opt; 01247 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01248 01249 l_name = "nb-gmm"; 01250 s_name = 0; 01251 desc = "Naive Bayes Gaussian mixture model tree information."; 01252 farg = process_nb_gmm_opt; 01253 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01254 01255 l_name = "nb-gmm-dtd"; 01256 s_name = 0; 01257 desc = "Validate the naive Bayes Gaussian mixture model tree information XML file with this DTD."; 01258 farg = process_nb_gmm_dtd_opt; 01259 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01260 01261 l_name = "mv-gmm"; 01262 s_name = 0; 01263 desc = "Multivariate Gaussian mixture model tree information."; 01264 farg = process_mv_gmm_opt; 01265 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01266 01267 l_name = "mv-gmm-dtd"; 01268 s_name = 0; 01269 desc = "Validate the multivariate Gaussian mixture model tree information XML file with this DTD."; 01270 farg = process_mv_gmm_dtd_opt; 01271 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01272 01273 #ifdef HAPLO_ENABLE_SVM 01274 l_name = "svm"; 01275 s_name = 0; 01276 desc = "SVM model tree information."; 01277 farg = process_svm_opt; 01278 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01279 01280 l_name = "svm-dtd"; 01281 s_name = 0; 01282 desc = "Validate the SVM model tree information XML file with this DTD."; 01283 farg = process_svm_dtd_opt; 01284 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01285 #endif 01286 01287 #ifdef HAPLO_ENABLE_WEKA 01288 l_name = "weka-j48"; 01289 s_name = 0; 01290 desc = "Weka J48 model tree information."; 01291 farg = process_weka_j48_opt; 01292 
init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01293 01294 l_name = "weka-part"; 01295 s_name = 0; 01296 desc = "Weka PART model tree information."; 01297 farg = process_weka_part_opt; 01298 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01299 01300 l_name = "weka-jar"; 01301 s_name = 0; 01302 desc = "Weka java archive file. Required for using the Weka algorithms."; 01303 farg = process_weka_jar_opt; 01304 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01305 01306 l_name = "weka-dtd"; 01307 s_name = 0; 01308 desc = "Validate the Weka model tree information XML files with this DTD."; 01309 farg = process_weka_dtd_opt; 01310 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01311 #endif 01312 01313 l_name = "nearest"; 01314 s_name = 0; 01315 desc = "Nearest neighbor model information."; 01316 farg = process_nearest_opt; 01317 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01318 01319 l_name = "nearest-dtd"; 01320 s_name = 0; 01321 desc = "Validate the nearest neighbor model information XML file with this DTD."; 01322 farg = process_nearest_dtd_opt; 01323 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01324 01325 l_name = "nearest-max-d"; 01326 s_name = 0; 01327 desc = "Maximum distance allowed for a nearest neighbor classification."; 01328 farg = process_nearest_max_d_opt; 01329 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01330 01331 l_name = "nb-freq-summary-out"; 01332 s_name = 0; 01333 desc = "File to output the Naive Bayes freqency model test performance summary to. The default is stdout."; 01334 farg = process_nb_freq_summary_out_opt; 01335 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01336 01337 l_name = "nb-freq-details-cnt-out"; 01338 s_name = 0; 01339 desc = "File to output the Naive Bayes freqency model test performance details (counts) to. 
The default is stdout."; 01340 farg = process_nb_freq_details_cnt_out_opt; 01341 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01342 01343 l_name = "nb-freq-details-pct-out"; 01344 s_name = 0; 01345 desc = "File to output the Naive Bayes freqency model test performance details (percents) to. The default is stdout."; 01346 farg = process_nb_freq_details_pct_out_opt; 01347 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01348 01349 l_name = "nb-freq-confusion-cnt-out"; 01350 s_name = 0; 01351 desc = "File to output the Naive Bayes freqency model test confusion matrix (counts) to. The default is stdout."; 01352 farg = process_nb_freq_confusion_cnt_out_opt; 01353 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01354 01355 l_name = "nb-freq-confusion-pct-out"; 01356 s_name = 0; 01357 desc = "File to output the Naive Bayes freqency model test confusion matrix (percents) to. The default is stdout."; 01358 farg = process_nb_freq_confusion_pct_out_opt; 01359 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01360 01361 l_name = "nb-freq-preds-out"; 01362 s_name = 0; 01363 desc = "File to output the Naive Bayes freqency model test predictions to. The default is no output."; 01364 farg = process_nb_freq_preds_out_opt; 01365 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01366 01367 l_name = "nb-gauss-summary-out"; 01368 s_name = 0; 01369 desc = "File to output the Naive Bayes Gaussian model test performance summary to. The default is stdout."; 01370 farg = process_nb_gauss_summary_out_opt; 01371 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01372 01373 l_name = "nb-gauss-details-cnt-out"; 01374 s_name = 0; 01375 desc = "File to output the Naive Bayes Gaussian model test performance details (counts) to. 
The default is stdout."; 01376 farg = process_nb_gauss_details_cnt_out_opt; 01377 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01378 01379 l_name = "nb-gauss-details-pct-out"; 01380 s_name = 0; 01381 desc = "File to output the Naive Bayes Gaussian model test performance details (percents) to. The default is stdout."; 01382 farg = process_nb_gauss_details_pct_out_opt; 01383 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01384 01385 l_name = "nb-gauss-confusion-cnt-out"; 01386 s_name = 0; 01387 desc = "File to output the Naive Bayes Gaussian model test confusion matrix (counts) to. The default is stdout."; 01388 farg = process_nb_gauss_confusion_cnt_out_opt; 01389 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01390 01391 l_name = "nb-gauss-confusion-pct-out"; 01392 s_name = 0; 01393 desc = "File to output the Naive Bayes Gaussian model test confusion matrix (percents) to. The default is stdout."; 01394 farg = process_nb_gauss_confusion_pct_out_opt; 01395 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01396 01397 l_name = "nb-gauss-preds-out"; 01398 s_name = 0; 01399 desc = "File to output the Naive Bayes Gaussian model test predictions to. The default is no output."; 01400 farg = process_nb_gauss_preds_out_opt; 01401 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01402 01403 l_name = "nb-gmm-summary-out"; 01404 s_name = 0; 01405 desc = "File to output the Naive Bayes Gaussian mixture model test performance summary to. The default is stdout."; 01406 farg = process_nb_gmm_summary_out_opt; 01407 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01408 01409 l_name = "nb-gmm-details-cnt-out"; 01410 s_name = 0; 01411 desc = "File to output the Naive Bayes Gaussian mixture model test performance details (counts) to. 
The default is stdout."; 01412 farg = process_nb_gmm_details_cnt_out_opt; 01413 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01414 01415 l_name = "nb-gmm-details-pct-out"; 01416 s_name = 0; 01417 desc = "File to output the Naive Bayes Gaussian mixture model test performance details (percents) to. The default is stdout."; 01418 farg = process_nb_gmm_details_pct_out_opt; 01419 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01420 01421 l_name = "nb-gmm-confusion-cnt-out"; 01422 s_name = 0; 01423 desc = "File to output the Naive Bayes Gaussian mixture model test confusion matrix (counts) to. The default is stdout."; 01424 farg = process_nb_gmm_confusion_cnt_out_opt; 01425 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01426 01427 l_name = "nb-gmm-confusion-pct-out"; 01428 s_name = 0; 01429 desc = "File to output the Naive Bayes Gaussian mixture model test confusion matrix (percents) to. The default is stdout."; 01430 farg = process_nb_gmm_confusion_pct_out_opt; 01431 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01432 01433 l_name = "nb-gmm-preds-out"; 01434 s_name = 0; 01435 desc = "File to output the Naive Bayes Gaussian mixture model test predictions to. The default is no output."; 01436 farg = process_nb_gmm_preds_out_opt; 01437 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01438 01439 l_name = "mv-gmm-summary-out"; 01440 s_name = 0; 01441 desc = "File to output the multivariate Gaussian mixture model test performance summary to. The default is stdout."; 01442 farg = process_mv_gmm_summary_out_opt; 01443 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01444 01445 l_name = "mv-gmm-details-cnt-out"; 01446 s_name = 0; 01447 desc = "File to output the multivariate Gaussian mixture model test performance details (counts) to. 
The default is stdout."; 01448 farg = process_mv_gmm_details_cnt_out_opt; 01449 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01450 01451 l_name = "mv-gmm-details-pct-out"; 01452 s_name = 0; 01453 desc = "File to output the multivariate Gaussian mixture model test performance details (percents) to. The default is stdout."; 01454 farg = process_mv_gmm_details_pct_out_opt; 01455 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01456 01457 l_name = "mv-gmm-confusion-cnt-out"; 01458 s_name = 0; 01459 desc = "File to output the multivariate Gaussian mixture model test confusion matrix (counts) to. The default is stdout."; 01460 farg = process_mv_gmm_confusion_cnt_out_opt; 01461 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01462 01463 l_name = "mv-gmm-confusion-pct-out"; 01464 s_name = 0; 01465 desc = "File to output the multivariate Gaussian mixture model test confusion matrix (percents) to. The default is stdout."; 01466 farg = process_mv_gmm_confusion_pct_out_opt; 01467 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01468 01469 l_name = "mv-gmm-preds-out"; 01470 s_name = 0; 01471 desc = "File to output the multivariate Gaussian mixture model test predictions to. The default is no output."; 01472 farg = process_mv_gmm_preds_out_opt; 01473 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01474 01475 #ifdef HAPLO_ENABLE_SVM 01476 l_name = "svm-summary-out"; 01477 s_name = 0; 01478 desc = "File to output the SVM model test performance summary to. The default is stdout."; 01479 farg = process_svm_summary_out_opt; 01480 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01481 01482 l_name = "svm-details-cnt-out"; 01483 s_name = 0; 01484 desc = "File to output the SVM model test performance details (counts) to. 
The default is stdout."; 01485 farg = process_svm_details_cnt_out_opt; 01486 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01487 01488 l_name = "svm-details-pct-out"; 01489 s_name = 0; 01490 desc = "File to output the SVM model test performance details (percents) to. The default is stdout."; 01491 farg = process_svm_details_pct_out_opt; 01492 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01493 01494 l_name = "svm-confusion-cnt-out"; 01495 s_name = 0; 01496 desc = "File to output the SVM model test confusion matrix (counts) to. The default is stdout."; 01497 farg = process_svm_confusion_cnt_out_opt; 01498 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01499 01500 l_name = "svm-confusion-pct-out"; 01501 s_name = 0; 01502 desc = "File to output the SVM model test confusion matrix (percents) to. The default is stdout."; 01503 farg = process_svm_confusion_pct_out_opt; 01504 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01505 01506 l_name = "svm-preds-out"; 01507 s_name = 0; 01508 desc = "File to output the SVM model test predictions to. The default is no output."; 01509 farg = process_svm_preds_out_opt; 01510 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01511 #endif 01512 01513 #ifdef HAPLO_ENABLE_WEKA 01514 l_name = "weka-j48-summary-out"; 01515 s_name = 0; 01516 desc = "File to output the Weka J48 model test performance summary to. The default is stdout."; 01517 farg = process_weka_j48_summary_out_opt; 01518 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01519 01520 l_name = "weka-j48-details-cnt-out"; 01521 s_name = 0; 01522 desc = "File to output the Weka J48 model test performance details (counts) to. 
The default is stdout."; 01523 farg = process_weka_j48_details_cnt_out_opt; 01524 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01525 01526 l_name = "weka-j48-details-pct-out"; 01527 s_name = 0; 01528 desc = "File to output the Weka J48 model test performance details (percents) to. The default is stdout."; 01529 farg = process_weka_j48_details_pct_out_opt; 01530 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01531 01532 l_name = "weka-j48-confusion-cnt-out"; 01533 s_name = 0; 01534 desc = "File to output the Weka J48 model test confusion matrix (counts) to. The default is stdout."; 01535 farg = process_weka_j48_confusion_cnt_out_opt; 01536 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01537 01538 l_name = "weka-j48-confusion-pct-out"; 01539 s_name = 0; 01540 desc = "File to output the Weka J48 model test confusion matrix (percents) to. The default is stdout."; 01541 farg = process_weka_j48_confusion_pct_out_opt; 01542 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01543 01544 l_name = "weka-j48-preds-out"; 01545 s_name = 0; 01546 desc = "File to output the Weka J48 model test predictions to. The default is no output."; 01547 farg = process_weka_j48_preds_out_opt; 01548 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01549 01550 l_name = "weka-part-summary-out"; 01551 s_name = 0; 01552 desc = "File to output the Weka PART model test performance summary to. The default is stdout."; 01553 farg = process_weka_part_summary_out_opt; 01554 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01555 01556 l_name = "weka-part-details-cnt-out"; 01557 s_name = 0; 01558 desc = "File to output the Weka PART model test performance details (counts) to. 
The default is stdout."; 01559 farg = process_weka_part_details_cnt_out_opt; 01560 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01561 01562 l_name = "weka-part-details-pct-out"; 01563 s_name = 0; 01564 desc = "File to output the Weka PART model test performance details (percents) to. The default is stdout."; 01565 farg = process_weka_part_details_pct_out_opt; 01566 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01567 01568 l_name = "weka-part-confusion-cnt-out"; 01569 s_name = 0; 01570 desc = "File to output the Weka PART model test confusion matrix (counts) to. The default is stdout."; 01571 farg = process_weka_part_confusion_cnt_out_opt; 01572 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01573 01574 l_name = "weka-part-confusion-pct-out"; 01575 s_name = 0; 01576 desc = "File to output the Weka PART model test confusion matrix (percents) to. The default is stdout."; 01577 farg = process_weka_part_confusion_pct_out_opt; 01578 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01579 01580 l_name = "weka-part-preds-out"; 01581 s_name = 0; 01582 desc = "File to output the Weka PART model test predictions to. The default is no output."; 01583 farg = process_weka_part_preds_out_opt; 01584 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01585 #endif 01586 01587 l_name = "nearest-summary-out"; 01588 s_name = 0; 01589 desc = "File to output the nearest neighbor model test performance summary to. The default is stdout."; 01590 farg = process_nearest_summary_out_opt; 01591 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01592 01593 l_name = "nearest-details-cnt-out"; 01594 s_name = 0; 01595 desc = "File to output the nearest neighbor model test performance details (counts) to. 
The default is stdout."; 01596 farg = process_nearest_details_cnt_out_opt; 01597 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01598 01599 l_name = "nearest-details-pct-out"; 01600 s_name = 0; 01601 desc = "File to output the nearest neighbor model test performance details (percents) to. The default is stdout."; 01602 farg = process_nearest_details_pct_out_opt; 01603 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01604 01605 l_name = "nearest-confusion-cnt-out"; 01606 s_name = 0; 01607 desc = "File to output the nearest neighbor model test confusion matrix (counts) to. The default is stdout."; 01608 farg = process_nearest_confusion_cnt_out_opt; 01609 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01610 01611 l_name = "nearest-confusion-pct-out"; 01612 s_name = 0; 01613 desc = "File to output the nearest neighbor model test confusion matrix (percents) to. The default is stdout."; 01614 farg = process_nearest_confusion_pct_out_opt; 01615 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01616 01617 l_name = "nearest-preds-out"; 01618 s_name = 0; 01619 desc = "File to output the nearest neighbor model test predictions to. The default is no output."; 01620 farg = process_nearest_preds_out_opt; 01621 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01622 01623 l_name = "tandem-agree-summary-out"; 01624 s_name = 0; 01625 desc = "File to output the tandem agreement test performance summary to. The default is stdout."; 01626 farg = process_tandem_agree_summary_out_opt; 01627 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01628 01629 l_name = "tandem-agree-details-cnt-out"; 01630 s_name = 0; 01631 desc = "File to output the tandem agreement test performance details (counts) to. 
The default is stdout."; 01632 farg = process_tandem_agree_details_cnt_out_opt; 01633 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01634 01635 l_name = "tandem-agree-details-pct-out"; 01636 s_name = 0; 01637 desc = "File to output the tandem agreement test performance details (percents) to. The default is stdout."; 01638 farg = process_tandem_agree_details_pct_out_opt; 01639 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01640 01641 l_name = "tandem-summary-out"; 01642 s_name = 0; 01643 desc = "File to output the tandem test performance summary to. The default is stdout."; 01644 farg = process_tandem_summary_out_opt; 01645 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01646 01647 l_name = "tandem-details-cnt-out"; 01648 s_name = 0; 01649 desc = "File to output the tandem test performance details (counts) to. The default is stdout."; 01650 farg = process_tandem_details_cnt_out_opt; 01651 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01652 01653 l_name = "tandem-details-pct-out"; 01654 s_name = 0; 01655 desc = "File to output the tandem test performance details (percents) to. The default is stdout."; 01656 farg = process_tandem_details_pct_out_opt; 01657 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01658 01659 l_name = "tandem-confusion-cnt-out"; 01660 s_name = 0; 01661 desc = "File to output the tandem test confusion matrix (counts) to. The default is stdout."; 01662 farg = process_tandem_confusion_cnt_out_opt; 01663 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01664 01665 l_name = "tandem-confusion-pct-out"; 01666 s_name = 0; 01667 desc = "File to output the tandem test confusion matrix (percents) to. 
The default is stdout."; 01668 farg = process_tandem_confusion_pct_out_opt; 01669 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01670 01671 l_name = "tandem-preds-out"; 01672 s_name = 0; 01673 desc = "File to output the tandem test predictions to. The default is no output."; 01674 farg = process_tandem_preds_out_opt; 01675 init_option_with_arg(&(opts_with_arg[i++]), l_name, s_name, desc, farg); 01676 assert(i == NUM_OPTS_WITH_ARG); 01677 } 01678 01680 static uint8_t num_models_to_test() 01681 { 01682 return (opts.nb_freq_fname != 0) + 01683 (opts.nb_gauss_fname != 0) + 01684 (opts.nb_gmm_fname != 0) + 01685 (opts.mv_gmm_fname != 0) + 01686 #ifdef HAPLO_ENABLE_SVM 01687 (opts.svm_fname != 0) + 01688 #endif 01689 #ifdef HAPLO_ENABLE_WEKA 01690 (opts.weka_j48_fname != 0) + 01691 (opts.weka_part_fname != 0) + 01692 #endif 01693 (opts.nearest_fname != 0); 01694 } 01695 01697 static void find_ancestors 01698 ( 01699 Vector_u32** ancestor_types_out, 01700 Vector_u32** ancestor_labels_out, 01701 const Vector_u32* labels_1, 01702 const Vector_u32* labels_2 01703 ) 01704 { 01705 uint32_t i; 01706 uint32_t ancestor_label; 01707 Haplo_ancestor_type ancestor_type; 01708 01709 if (!labels_1 || !labels_2) 01710 return; 01711 01712 assert(labels_1->num_elts == labels_2->num_elts); 01713 create_vector_u32(ancestor_types_out, labels_1->num_elts); 01714 create_vector_u32(ancestor_labels_out, labels_1->num_elts); 01715 01716 for (i = 0; i < labels_1->num_elts; i++) 01717 { 01718 ancestor_type = find_ancestor_index_of_pair(&ancestor_label, 01719 labels_1->elts[ i ], labels_2->elts[ i ]); 01720 01721 (*ancestor_types_out)->elts[ i ] = ancestor_type; 01722 (*ancestor_labels_out)->elts[ i ] = ancestor_label; 01723 } 01724 } 01725 01727 static void find_tandem_ancestors 01728 ( 01729 Vector_u32** ancestor_types_out, 01730 Vector_u32** ancestor_labels_out, 01731 const Vector_u32* tandem_types, 01732 const Vector_u32* tandem_labels, 01733 const Vector_u32* 
test_labels 01734 ) 01735 { 01736 uint32_t i; 01737 uint32_t ancestor_label; 01738 Haplo_ancestor_type ancestor_type; 01739 01740 assert(tandem_labels->num_elts == test_labels->num_elts); 01741 create_vector_u32(ancestor_types_out, tandem_labels->num_elts); 01742 create_vector_u32(ancestor_labels_out, tandem_labels->num_elts); 01743 01744 for (i = 0; i < tandem_labels->num_elts; i++) 01745 { 01746 if (tandem_types->elts[ i ] != HAPLO_ANCESTOR_NONE) 01747 { 01748 ancestor_type = find_ancestor_index_of_pair(&ancestor_label, 01749 tandem_labels->elts[ i ], test_labels->elts[ i ]); 01750 } 01751 else 01752 { 01753 ancestor_type = HAPLO_ANCESTOR_NONE; 01754 ancestor_label = 0; 01755 } 01756 01757 (*ancestor_types_out)->elts[ i ] = ancestor_type; 01758 (*ancestor_labels_out)->elts[ i ] = ancestor_label; 01759 } 01760 } 01761 01763 static void find_ancestors_of_sets 01764 ( 01765 Vector_u32** ancestor_types_out, 01766 Vector_u32** ancestor_labels_out, 01767 const Vector_u32* labels_1, 01768 const Vector_u32* labels_2, 01769 const Vector_u32* labels_3, 01770 const Vector_u32* labels_4, 01771 const Vector_u32* labels_5, 01772 const Vector_u32* labels_6, 01773 const Vector_u32* labels_7, 01774 const Vector_u32* labels_8 01775 ) 01776 { 01777 uint32_t n, nn, N; 01778 uint32_t i; 01779 uint32_t num_labels; 01780 uint32_t ancestor_label; 01781 Haplo_ancestor_type ancestor_type; 01782 Vector_u32* labels; 01783 Vector_u32* labelss; 01784 01785 N = 0; 01786 01787 if (labels_1) {N++; num_labels = labels_1->num_elts;} 01788 if (labels_2) {N++; num_labels = labels_2->num_elts;} 01789 if (labels_3) {N++; num_labels = labels_3->num_elts;} 01790 if (labels_4) {N++; num_labels = labels_4->num_elts;} 01791 if (labels_5) {N++; num_labels = labels_5->num_elts;} 01792 if (labels_6) {N++; num_labels = labels_6->num_elts;} 01793 if (labels_7) {N++; num_labels = labels_7->num_elts;} 01794 if (labels_8) {N++; num_labels = labels_8->num_elts;} 01795 01796 assert(N > 0); 01797 01798 labels = NULL; 
01799 create_vector_u32(&labels, N); 01800 01801 create_vector_u32(ancestor_types_out, num_labels); 01802 create_vector_u32(ancestor_labels_out, num_labels); 01803 01804 for (i = 0; i < num_labels; i++) 01805 { 01806 N = 0; 01807 01808 if (labels_1) labels->elts[ N++ ] = labels_1->elts[ i ]; 01809 if (labels_2) labels->elts[ N++ ] = labels_2->elts[ i ]; 01810 if (labels_3) labels->elts[ N++ ] = labels_3->elts[ i ]; 01811 if (labels_4) labels->elts[ N++ ] = labels_4->elts[ i ]; 01812 if (labels_5) labels->elts[ N++ ] = labels_5->elts[ i ]; 01813 if (labels_6) labels->elts[ N++ ] = labels_6->elts[ i ]; 01814 if (labels_7) labels->elts[ N++ ] = labels_7->elts[ i ]; 01815 if (labels_8) labels->elts[ N++ ] = labels_8->elts[ i ]; 01816 01817 ancestor_type = find_ancestor_index_of_set(&ancestor_label, labels); 01818 01819 if (opts.exclude_one && ancestor_type == HAPLO_ANCESTOR_NONE && N > 3) 01820 { 01821 labelss = NULL; 01822 create_vector_u32(&labelss, N-1); 01823 for (n = 0; ancestor_type == HAPLO_ANCESTOR_NONE && n < N; n++) 01824 { 01825 for (nn = 0; nn < N-1; nn++) 01826 { 01827 if (nn < n) 01828 { 01829 labelss->elts[ nn ] = labels->elts[ nn ]; 01830 } 01831 else 01832 { 01833 labelss->elts[ nn ] = labels->elts[ nn+1 ]; 01834 } 01835 } 01836 01837 ancestor_type = find_ancestor_index_of_set(&ancestor_label, 01838 labelss); 01839 } 01840 free_vector_u32(labelss); 01841 } 01842 01843 (*ancestor_types_out)->elts[ i ] = ancestor_type; 01844 (*ancestor_labels_out)->elts[ i ] = ancestor_label; 01845 } 01846 } 01847 01849 static void write_leave_one_out_summary 01850 ( 01851 const Matblock_u8* data_ids, 01852 const Vector_u32* data_labels, 01853 const Vector_u32* ancestor_types, 01854 const Vector_u32* ancestor_labels, 01855 const Vector_u32* pred_labels, 01856 const Vector_d* pred_confs, 01857 const Vector_u32* tandem_types, 01858 const char* fname 01859 ) 01860 { 01861 uint32_t i; 01862 uint32_t n, N; 01863 float D; 01864 const char* fmt; 01865 FILE* fp; 01866 xmlDoc* 
xml_doc = NULL; 01867 xmlNode* xml_root = NULL; 01868 xmlNode* xml_node[3]; 01869 char xml_buf[256]; 01870 Error* err; 01871 Vector_f* counts = NULL; 01872 01873 if ((err = open_output(&fp, &xml_doc, "haplo-test-loo-summary-out", 01874 "haplo-test-loo-summary-out.dtd", fname))) 01875 { 01876 print_error_msg("haplo-test", err->msg); 01877 } 01878 01879 create_zero_vector_f(&counts, 3); 01880 01881 if (opts.header_out && opts.output_format != HAPLO_OUTPUT_XML) 01882 { 01883 switch (opts.output_format) 01884 { 01885 case HAPLO_OUTPUT_TXT: 01886 fmt = "%-10s %-10s %-10s\n"; 01887 break; 01888 case HAPLO_OUTPUT_CSV: 01889 fmt = "%s,%s,%s\n"; 01890 break; 01891 case HAPLO_OUTPUT_XML: 01892 break; 01893 } 01894 fprintf(fp, fmt, "Direct", "Indirect", "None"); 01895 } 01896 else if (opts.output_format == HAPLO_OUTPUT_XML) 01897 { 01898 xml_root = xmlDocGetRootElement(xml_doc); 01899 } 01900 01901 N = ancestor_types->num_elts; 01902 01903 for (n = 0; n < N; n++) 01904 { 01905 if (!tandem_types || tandem_types->elts[n] != HAPLO_ANCESTOR_NONE) 01906 { 01907 switch (ancestor_types->elts[ n ]) 01908 { 01909 case HAPLO_ANCESTOR_DIRECT: 01910 counts->elts[0]++; 01911 break; 01912 case HAPLO_ANCESTOR_INDIRECT: 01913 counts->elts[1]++; 01914 break; 01915 case HAPLO_ANCESTOR_NONE: 01916 counts->elts[2]++; 01917 break; 01918 } 01919 } 01920 } 01921 01922 switch (opts.output_format) 01923 { 01924 case HAPLO_OUTPUT_TXT: 01925 fmt = "%-10.0f %-10.0f %-10.0f\n"; 01926 fprintf(fp, fmt, counts->elts[0], counts->elts[1], counts->elts[2]); 01927 break; 01928 case HAPLO_OUTPUT_CSV: 01929 fmt = "%.0f,%.0f,%.0f\n"; 01930 fprintf(fp, fmt, counts->elts[0], counts->elts[1], counts->elts[2]); 01931 break; 01932 case HAPLO_OUTPUT_XML: 01933 xml_node[0] = XMLNewChild(xml_root, "direct", NULL); 01934 xml_node[1] = XMLNewChild(xml_root, "indirect", NULL); 01935 xml_node[2] = XMLNewChild(xml_root, "none", NULL); 01936 for (i = 0; i < 3; i++) 01937 { 01938 snprintf(xml_buf, 256, "%.0f", counts->elts[ i 
]); 01939 XMLNewChild(xml_node[ i ], "count", xml_buf); 01940 } 01941 break; 01942 } 01943 01944 D = counts->elts[0] + counts->elts[1] + counts->elts[2]; 01945 if (D > 0) 01946 { 01947 multiply_vector_by_scalar_f(&counts, counts, 1/D); 01948 } 01949 switch (opts.output_format) 01950 { 01951 case HAPLO_OUTPUT_TXT: 01952 fmt = "%-10.3f %-10.3f %-10.3f\n"; 01953 fprintf(fp, fmt, counts->elts[0], counts->elts[1], counts->elts[2]); 01954 break; 01955 case HAPLO_OUTPUT_CSV: 01956 fmt = "%.3f,%.3f,%.3f\n"; 01957 fprintf(fp, fmt, counts->elts[0], counts->elts[1], counts->elts[2]); 01958 break; 01959 case HAPLO_OUTPUT_XML: 01960 for (i = 0; i < 3; i++) 01961 { 01962 snprintf(xml_buf, 256, "%.3f", counts->elts[ i ]); 01963 XMLNewChild(xml_node[ i ], "percent", xml_buf); 01964 } 01965 break; 01966 } 01967 01968 free_vector_f(counts); 01969 01970 if ((err = close_output(fp, xml_doc, fname))) 01971 { 01972 print_error_msg("haplo-test", err->msg); 01973 } 01974 } 01975 01984 static void write_leave_one_out_details 01985 ( 01986 const Matblock_u8* data_ids, 01987 const Vector_u32* data_labels, 01988 const Vector_u32* ancestor_types, 01989 const Vector_u32* ancestor_labels, 01990 const Vector_u32* pred_labels, 01991 const Vector_d* pred_confs, 01992 const Vector_u32* tandem_types, 01993 const char* cnt_fname, 01994 const char* pct_fname 01995 ) 01996 { 01997 uint32_t i; 01998 uint32_t n, nn, N; 01999 float D; 02000 const char* label; 02001 const char* fmt; 02002 FILE* cnt_fp; 02003 FILE* pct_fp; 02004 xmlDoc* xml_doc = NULL; 02005 xmlNode* xml_root = NULL; 02006 xmlNode* xml_node = NULL; 02007 xmlNode* xml_child[3] = {0}; 02008 char xml_buf[256]; 02009 Error* err; 02010 02011 Vector_f* counts[3] = {0}; 02012 02013 if (!pred_labels) 02014 return; 02015 02016 if ((err = open_output(&cnt_fp, &xml_doc, "haplo-test-loo-details-out", 02017 "haplo-test-loo-details-out.dtd", cnt_fname)) || 02018 (err = open_output(&pct_fp, NULL, NULL, NULL, pct_fname))) 02019 { 02020 
print_error_msg("haplo-test", err->msg); 02021 } 02022 02023 N = get_num_haplo_groups(); 02024 for (i = 0; i < 3; i++) 02025 { 02026 create_zero_vector_f(&(counts[ i ]), N); 02027 } 02028 02029 for (i = 0; i < ancestor_types->num_elts; i++) 02030 { 02031 if (!tandem_types || tandem_types->elts[i] != HAPLO_ANCESTOR_NONE) 02032 { 02033 assert(pred_labels->elts[ i ] < N); 02034 n = pred_labels->elts[ i ]; 02035 02036 switch (ancestor_types->elts[ i ]) 02037 { 02038 case HAPLO_ANCESTOR_DIRECT: 02039 counts[0]->elts[ n ]++; 02040 break; 02041 case HAPLO_ANCESTOR_INDIRECT: 02042 counts[1]->elts[ n ]++; 02043 break; 02044 case HAPLO_ANCESTOR_NONE: 02045 counts[2]->elts[ n ]++; 02046 break; 02047 } 02048 } 02049 } 02050 02051 if (opts.header_out && opts.output_format != HAPLO_OUTPUT_XML) 02052 { 02053 switch (opts.output_format) 02054 { 02055 case HAPLO_OUTPUT_TXT: 02056 fmt = "%-10s %-15s %-15s %-15s\n"; 02057 break; 02058 case HAPLO_OUTPUT_CSV: 02059 fmt = "%s,%s,%s,%s\n"; 02060 break; 02061 case HAPLO_OUTPUT_XML: 02062 break; 02063 } 02064 fprintf(cnt_fp, fmt, "Predicted", "Direct", "Indirect", "None"); 02065 fprintf(pct_fp, fmt, "Predicted", "Direct", "Indirect", "None"); 02066 } 02067 else if (opts.output_format == HAPLO_OUTPUT_XML) 02068 { 02069 xml_root = xmlDocGetRootElement(xml_doc); 02070 } 02071 02072 nn = 0; 02073 for (n = 0; n < N; n++) 02074 { 02075 D = counts[0]->elts[n] + counts[1]->elts[n] + counts[2]->elts[n]; 02076 if (D > 0) 02077 { 02078 lookup_haplo_group_label_from_index(&label, n); 02079 switch (opts.output_format) 02080 { 02081 case HAPLO_OUTPUT_TXT: 02082 fprintf(cnt_fp, 02083 "%-10s %-15.0f %-15.0f %-15.0f\n", 02084 label, counts[0]->elts[n], counts[1]->elts[n], 02085 counts[2]->elts[n]); 02086 fprintf(pct_fp, 02087 "%-10s %-15.3f %-15.3f %-15.3f\n", 02088 label, counts[0]->elts[n]/D, counts[1]->elts[n]/D, 02089 counts[2]->elts[n]/D); 02090 break; 02091 case HAPLO_OUTPUT_CSV: 02092 fprintf(cnt_fp, "%s,%.0f,%.0f,%.0f\n", label, 02093 
counts[0]->elts[n], counts[1]->elts[n], 02094 counts[2]->elts[n]); 02095 fprintf(pct_fp, "%s,%.3f,%.3f,%.3f\n", label, 02096 counts[0]->elts[n]/D, counts[1]->elts[n]/D, 02097 counts[2]->elts[n]/D); 02098 break; 02099 case HAPLO_OUTPUT_XML: 02100 xml_node = XMLNewChild(xml_root, "predicted", NULL); 02101 snprintf(xml_buf, 256, "%d", nn+1); 02102 XMLNewProp(xml_node, "number", xml_buf); 02103 XMLNewChild(xml_node, "label", label); 02104 xml_child[0] = XMLNewChild(xml_node, "direct", 0); 02105 xml_child[1] = XMLNewChild(xml_node, "indirect", 0); 02106 xml_child[2] = XMLNewChild(xml_node, "none", 0); 02107 for (i = 0; i < 3; i++) 02108 { 02109 snprintf(xml_buf, 256, "%.0f", counts[i]->elts[n]); 02110 XMLNewChild(xml_child[ i ], "count", xml_buf); 02111 snprintf(xml_buf, 256, "%.3f", counts[i]->elts[n]/D); 02112 XMLNewChild(xml_child[ i ], "percent", xml_buf); 02113 } 02114 break; 02115 } 02116 02117 nn++; 02118 } 02119 } 02120 02121 for (i = 0; i < 3; i++) 02122 { 02123 free_vector_f(counts[ i ]); 02124 } 02125 02126 if ((err = close_output(cnt_fp, xml_doc, cnt_fname)) || 02127 (err = close_output(pct_fp, NULL, pct_fname))) 02128 { 02129 print_error_msg("haplo-test", err->msg); 02130 } 02131 } 02132

/**
 * @brief  Writes the leave-one-out confusion matrix (actual vs. predicted).
 *
 * Two outputs are produced: raw counts to @em cnt_fname and fractions to
 * @em pct_fname. Each fraction is the count divided by the number of times
 * the predicted (column) group was predicted. Rows for groups that never
 * occur as an actual label and columns for groups that were never predicted
 * are omitted. Samples whose @em tandem_types entry is HAPLO_ANCESTOR_NONE
 * are ignored when @em tandem_types is given.
 *
 * Nothing is written when @em pred_labels is NULL or when an output cannot
 * be opened (the error is reported with print_error_msg()).
 *
 * @param  data_ids         Unused here.
 * @param  data_labels      Actual group index of every sample.
 * @param  ancestor_types   Unused here.
 * @param  ancestor_labels  Unused here.
 * @param  pred_labels      Predicted group index of every sample.
 * @param  pred_confs       Unused here.
 * @param  tandem_types     Optional per-sample filter; may be NULL.
 * @param  cnt_fname        Output name for the count matrix.
 * @param  pct_fname        Output name for the fraction matrix.
 */
static void write_leave_one_out_confusion
(
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Vector_u32*  ancestor_types,
    const Vector_u32*  ancestor_labels,
    const Vector_u32*  pred_labels,
    const Vector_d*    pred_confs,
    const Vector_u32*  tandem_types,
    const char*        cnt_fname,
    const char*        pct_fname
)
{
    uint32_t    i;
    uint32_t    actual, pred, N;
    uint32_t    actual_num, pred_num;
    float       col_total;
    int         as_txt, as_csv, as_xml;
    const char* label;
    FILE*       cnt_fp     = NULL;
    FILE*       pct_fp     = NULL;
    xmlDoc*     xml_doc    = NULL;
    xmlNode*    xml_root   = NULL;
    xmlNode*    xml_actual = NULL;
    xmlNode*    xml_pred   = NULL;
    char        xml_buf[256];
    Error*      err;

    Vector_f*   actual_counts    = NULL;
    Vector_f*   predicted_counts = NULL;
    Matrix_f*   confusion        = NULL;
    Matrix_f*   confusion_pct    = NULL;

    if (!pred_labels)
    {
        return;
    }

    assert(data_labels->num_elts == pred_labels->num_elts);

    /* Bail out when an output cannot be opened: the stream handles would
     * otherwise be used without ever having been set. */
    if ((err = open_output(&cnt_fp, &xml_doc, "haplo-test-loo-confusion-out",
                    "haplo-test-loo-confusion-out.dtd", cnt_fname)))
    {
        print_error_msg("haplo-test", err->msg);
        return;
    }
    if ((err = open_output(&pct_fp, NULL, NULL, NULL, pct_fname)))
    {
        print_error_msg("haplo-test", err->msg);
        if ((err = close_output(cnt_fp, xml_doc, cnt_fname)))
        {
            print_error_msg("haplo-test", err->msg);
        }
        return;
    }

    as_txt = (opts.output_format == HAPLO_OUTPUT_TXT);
    as_csv = (opts.output_format == HAPLO_OUTPUT_CSV);
    as_xml = (opts.output_format == HAPLO_OUTPUT_XML);

    N = get_num_haplo_groups();
    create_zero_vector_f(&actual_counts, N);
    create_zero_vector_f(&predicted_counts, N);
    create_zero_matrix_f(&confusion, N, N);
    create_zero_matrix_f(&confusion_pct, N, N);

    /* Tally every (actual, predicted) pair. */
    for (i = 0; i < data_labels->num_elts; i++)
    {
        if (tandem_types && tandem_types->elts[ i ] == HAPLO_ANCESTOR_NONE)
        {
            continue;
        }

        actual = data_labels->elts[ i ];
        pred   = pred_labels->elts[ i ];
        assert(actual < N);
        assert(pred < N);

        actual_counts->elts[ actual ]++;
        predicted_counts->elts[ pred ]++;
        confusion->elts[ actual ][ pred ]++;
    }

    /* Normalise each predicted column by its total. */
    copy_matrix_f(&confusion_pct, confusion);
    for (pred = 0; pred < N; pred++)
    {
        col_total = predicted_counts->elts[ pred ];
        if (col_total > 0)
        {
            for (actual = 0; actual < N; actual++)
            {
                confusion_pct->elts[ actual ][ pred ] /= col_total;
            }
        }
    }

    if (as_xml)
    {
        xml_root = xmlDocGetRootElement(xml_doc);
    }
    else if (opts.header_out)
    {
        if (as_txt || as_csv)
        {
            fprintf(cnt_fp, as_txt ? "%-10s" : "%s", "Actual");
            fprintf(pct_fp, as_txt ? "%-10s" : "%s", "Actual");
            for (pred = 0; pred < N; pred++)
            {
                if (predicted_counts->elts[ pred ] > 0)
                {
                    lookup_haplo_group_label_from_index(&label, pred);
                    fprintf(cnt_fp, as_txt ? " %-10s" : ",%s", label);
                    fprintf(pct_fp, as_txt ? " %-10s" : ",%s", label);
                }
            }
        }
        fprintf(cnt_fp, "\n");
        fprintf(pct_fp, "\n");
    }

    actual_num = 0;
    for (actual = 0; actual < N; actual++)
    {
        if (!(actual_counts->elts[ actual ] > 0))
        {
            continue;
        }

        lookup_haplo_group_label_from_index(&label, actual);

        if (as_xml)
        {
            xml_actual = XMLNewChild(xml_root, "actual", NULL);
            snprintf(xml_buf, sizeof(xml_buf), "%" PRIu32, actual_num + 1);
            XMLNewProp(xml_actual, "number", xml_buf);
            XMLNewChild(xml_actual, "label", label);

            pred_num = 0;
            for (pred = 0; pred < N; pred++)
            {
                if (!(predicted_counts->elts[ pred ] > 0))
                {
                    continue;
                }

                lookup_haplo_group_label_from_index(&label, pred);
                xml_pred = XMLNewChild(xml_actual, "predicted", 0);
                snprintf(xml_buf, sizeof(xml_buf), "%" PRIu32, pred_num + 1);
                XMLNewProp(xml_pred, "number", xml_buf);
                XMLNewChild(xml_pred, "label", label);

                snprintf(xml_buf, sizeof(xml_buf), "%.0f",
                        confusion->elts[ actual ][ pred ]);
                XMLNewChild(xml_pred, "count", xml_buf);

                snprintf(xml_buf, sizeof(xml_buf), "%.3f",
                        confusion_pct->elts[ actual ][ pred ]);
                XMLNewChild(xml_pred, "percent", xml_buf);

                pred_num++;
            }
        }
        else if (as_txt || as_csv)
        {
            fprintf(cnt_fp, as_txt ? "%-10s" : "%s", label);
            fprintf(pct_fp, as_txt ? "%-10s" : "%s", label);

            for (pred = 0; pred < N; pred++)
            {
                if (predicted_counts->elts[ pred ] > 0)
                {
                    fprintf(cnt_fp, as_txt ? " %-10.0f" : ",%.0f",
                            confusion->elts[ actual ][ pred ]);
                    fprintf(pct_fp, as_txt ? " %-10.3f" : ",%.3f",
                            confusion_pct->elts[ actual ][ pred ]);
                }
            }
            fprintf(cnt_fp, "\n");
            fprintf(pct_fp, "\n");
        }

        actual_num++;
    }

    free_vector_f(actual_counts);
    free_vector_f(predicted_counts);
    free_matrix_f(confusion);
    free_matrix_f(confusion_pct);

    /* Close both outputs even when the first close reports an error. */
    if ((err = close_output(cnt_fp, xml_doc, cnt_fname)))
    {
        print_error_msg("haplo-test", err->msg);
    }
    if ((err = close_output(pct_fp, NULL, pct_fname)))
    {
        print_error_msg("haplo-test", err->msg);
    }
}

/**
 * @brief  Writes one record per sample with its ids, actual label, ancestor
 *         label and prediction.
 *
 * Nothing is written when @em pred_labels is NULL or when the output cannot
 * be opened (the error is reported with print_error_msg()). Samples whose
 * @em tandem_types entry is HAPLO_ANCESTOR_NONE are skipped when
 * @em tandem_types is given.
 *
 * @param  type             Classifier name forwarded to write_prediction().
 * @param  data_ids         Optional sample ids; may be NULL.
 * @param  data_labels      Actual group index of every sample.
 * @param  ancestor_types   Forwarded to write_ancestor_label().
 * @param  ancestor_labels  Forwarded to write_ancestor_label().
 * @param  pred_labels      Predicted group index of every sample.
 * @param  pred_confs       Forwarded to write_prediction().
 * @param  tandem_types     Optional per-sample filter; may be NULL.
 * @param  fname            Output name.
 */
static void write_leave_one_out_preds
(
    const char*        type,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Vector_u32*  ancestor_types,
    const Vector_u32*  ancestor_labels,
    const Vector_u32*  pred_labels,
    const Vector_d*    pred_confs,
    const Vector_u32*  tandem_types,
    const char*        fname
)
{
    uint32_t i, j;
    int      as_txt, as_csv, as_xml;
    FILE*    fp       = NULL;
    xmlDoc*  xml_doc  = NULL;
    xmlNode* xml_root = NULL;
    xmlNode* xml_node = NULL;
    char     xml_buf[256] = {0};
    Error*   err;

    if (!pred_labels)
    {
        return;
    }

    /* Without an open output there is nothing valid to write to. */
    if ((err = open_output(&fp, &xml_doc, "haplo-test-loo-predictions-out",
                    "haplo-test-loo-predictions-out.dtd", fname)))
    {
        print_error_msg("haplo-test", err->msg);
        return;
    }

    as_txt = (opts.output_format == HAPLO_OUTPUT_TXT);
    as_csv = (opts.output_format == HAPLO_OUTPUT_CSV);
    as_xml = (opts.output_format == HAPLO_OUTPUT_XML);

    if (as_xml)
    {
        xml_root = xmlDocGetRootElement(xml_doc);
    }
    else if (opts.header_out)
    {
        if (data_ids)
        {
            for (j = 0; j < data_ids->num_rows; j++)
            {
                if (as_txt)
                {
                    fprintf(fp, "ID %-7" PRIu32 " ", j + 1);
                }
                else if (as_csv)
                {
                    fprintf(fp, "ID %" PRIu32 ",", j + 1);
                }
            }
        }

        /* pred_labels is known to be non-NULL here, so the prediction
         * columns are always part of the header. */
        if (as_txt)
        {
            fprintf(fp, "%-10s %-4s %-10s", "Actual", "Ancestor", "Type");
            fprintf(fp, " %-10s %-5s", "Prediction", "Conf");
        }
        else if (as_csv)
        {
            fprintf(fp, "%s,%s,%s", "Actual", "Ancestor", "Type");
            fprintf(fp, ",%s,%s", "Prediction", "Conf");
        }
        fprintf(fp, "\n");
    }

    for (i = 0; i < data_labels->num_elts; i++)
    {
        if (tandem_types && tandem_types->elts[ i ] == HAPLO_ANCESTOR_NONE)
        {
            continue;
        }

        if (as_xml)
        {
            xml_node = XMLNewChild(xml_root, "sample", NULL);
            snprintf(xml_buf, sizeof(xml_buf), "%" PRIu32, i + 1);
            XMLNewProp(xml_node, "number", xml_buf);
        }

        write_ids(data_ids, i, HAPLO_SEP_SUFFIX, fp, xml_node);
        write_label(data_labels, i, HAPLO_SEP_SUFFIX, fp, xml_node);

        write_ancestor_label(ancestor_types, ancestor_labels, i,
                HAPLO_SEP_NONE, fp, xml_node);

        write_prediction(type, pred_labels, pred_confs, i,
                HAPLO_SEP_PREFIX, fp, xml_node);

        if (!as_xml)
        {
            fprintf(fp, "\n");
        }
    }

    if ((err = close_output(fp, xml_doc, fname)))
    {
        print_error_msg("haplo-test", err->msg);
    }
}

/**
 * @brief  Splits the data into a training set without sample @em i and a
 *         one-row test set holding only the markers of sample @em i.
 *
 * @param  train_labels_out   Result: all labels except element @em i.
 * @param  train_markers_out  Result: all marker rows except row @em i.
 * @param  test_markers_out   Result: marker row @em i as a 1-row matrix.
 * @param  labels             Labels of every sample.
 * @param  markers            Marker rows of every sample.
 * @param  i                  Index of the sample to hold out.
 */
static void create_leave_one_out_train_and_test_data
(
    Vector_u32**      train_labels_out,
    Matrix_i32**      train_markers_out,
    Matrix_i32**      test_markers_out,
    const Vector_u32* labels,
    const Matrix_i32* markers,
    uint32_t          i
)
{
    const uint32_t num_cols   = markers->num_cols;
    const uint32_t last_label = labels->num_elts - 1;

    create_vector_u32(train_labels_out, last_label);
    create_matrix_i32(train_markers_out, markers->num_rows - 1, num_cols);
    create_matrix_i32(test_markers_out, 1, num_cols);

    /* The held-out row. */
    copy_matrix_block_into_matrix_i32(*test_markers_out, 0, 0,
            markers, i, 0, 1, num_cols);

    /* Everything in front of the held-out sample. */
    if (i != 0)
    {
        copy_vector_section_into_vector_u32(*train_labels_out, 0,
                labels, 0, i);
        copy_matrix_block_into_matrix_i32(*train_markers_out, 0, 0,
                markers, 0, 0, i, num_cols);
    }

    /* Everything behind it, shifted up by one position. */
    if (i < last_label)
    {
        copy_vector_section_into_vector_u32(*train_labels_out, i,
                labels, i + 1, last_label - i);
        copy_matrix_block_into_matrix_i32(*train_markers_out, i, 0,
                markers, i + 1, 0, markers->num_rows - 1 - i, num_cols);
    }
}

/**
 * @brief  Leave-one-out test of the NB-freq model tree.
 *
 * Each sample is predicted by a model trained on all other samples. The
 * predicted labels and confidences are stored in the outputs, ancestors are
 * looked up with find_ancestors(), and the summary, details, confusion and
 * prediction reports are written. Returns without doing anything when
 * opts.nb_freq_fname is not set; exits the program when training or
 * prediction fails.
 *
 * NOTE(review): the per-iteration buffers are created inside the loop but
 * released only once after it; this presumably relies on the jwsc create
 * routines reusing an existing allocation -- confirm.
 */
static void leave_one_out_nb_freq
(
    Vector_u32**       labels_out,
    Vector_d**         confs_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
    uint32_t            s;
    uint32_t            num_samples;
    NB_freq_model_tree* model        = NULL;
    Vector_u32*         fit_labels   = NULL;
    Matrix_i32*         fit_markers  = NULL;
    Vector_u32*         held_labels  = NULL;
    Vector_d*           held_confs   = NULL;
    Matrix_i32*         held_markers = NULL;
    Error*              err;

    if (!opts.nb_freq_fname)
    {
        return;
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(confs_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_nb_freq_model_tree(&model, fit_labels, fit_markers,
                opts.nb_freq_fname, opts.nb_freq_dtd_fname);
        if (!err)
        {
            err = predict_labels_with_nb_freq_model_tree(&held_labels,
                    &held_confs, held_markers, model, 0);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*confs_out)->elts[ s ]  = held_confs->elts[ 0 ];
    }

    free_vector_u32(held_labels);
    free_vector_d(held_confs);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_nb_freq_model_tree(model);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_freq_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_freq_details_cnt_fname, nb_freq_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_freq_confusion_cnt_fname, nb_freq_confusion_pct_fname);
    write_leave_one_out_preds("nb-freq", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_freq_preds_fname);
}

/**
 * @brief  Leave-one-out test of the NB-gauss model tree.
 *
 * Same procedure as the other leave-one-out testers: predict every sample
 * from a model trained on the remaining ones, find ancestors, write the
 * four reports. Returns without doing anything when opts.nb_gauss_fname is
 * not set; exits the program when training or prediction fails.
 */
static void leave_one_out_nb_gauss
(
    Vector_u32**       labels_out,
    Vector_d**         confs_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
    uint32_t             s;
    uint32_t             num_samples;
    NB_gauss_model_tree* model        = NULL;
    Vector_u32*          fit_labels   = NULL;
    Matrix_i32*          fit_markers  = NULL;
    Vector_u32*          held_labels  = NULL;
    Vector_d*            held_confs   = NULL;
    Matrix_i32*          held_markers = NULL;
    Error*               err;

    if (!opts.nb_gauss_fname)
    {
        return;
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(confs_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_nb_gauss_model_tree(&model, fit_labels, fit_markers,
                opts.nb_gauss_fname, opts.nb_gauss_dtd_fname);
        if (!err)
        {
            err = predict_labels_with_nb_gauss_model_tree(&held_labels,
                    &held_confs, held_markers, model, 0);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*confs_out)->elts[ s ]  = held_confs->elts[ 0 ];
    }

    free_vector_u32(held_labels);
    free_vector_d(held_confs);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_nb_gauss_model_tree(model);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gauss_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gauss_details_cnt_fname, nb_gauss_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gauss_confusion_cnt_fname, nb_gauss_confusion_pct_fname);
    write_leave_one_out_preds("nb-gauss", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gauss_preds_fname);
}

/**
 * @brief  Leave-one-out test of the NB-GMM model tree.
 *
 * Same procedure as the other leave-one-out testers. Returns without doing
 * anything when opts.nb_gmm_fname is not set; exits the program when
 * training or prediction fails.
 */
static void leave_one_out_nb_gmm
(
    Vector_u32**       labels_out,
    Vector_d**         confs_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
    uint32_t           s;
    uint32_t           num_samples;
    NB_gmm_model_tree* model        = NULL;
    Vector_u32*        fit_labels   = NULL;
    Matrix_i32*        fit_markers  = NULL;
    Vector_u32*        held_labels  = NULL;
    Vector_d*          held_confs   = NULL;
    Matrix_i32*        held_markers = NULL;
    Error*             err;

    if (!opts.nb_gmm_fname)
    {
        return;
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(confs_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_nb_gmm_model_tree(&model, fit_labels, fit_markers,
                opts.nb_gmm_fname, opts.nb_gmm_dtd_fname);
        if (!err)
        {
            err = predict_labels_with_nb_gmm_model_tree(&held_labels,
                    &held_confs, held_markers, model, 0);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*confs_out)->elts[ s ]  = held_confs->elts[ 0 ];
    }

    free_vector_u32(held_labels);
    free_vector_d(held_confs);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_nb_gmm_model_tree(model);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gmm_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gmm_details_cnt_fname, nb_gmm_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gmm_confusion_cnt_fname, nb_gmm_confusion_pct_fname);
    write_leave_one_out_preds("nb-gmm", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            nb_gmm_preds_fname);
}

/**
 * @brief  Leave-one-out test of the MV-GMM model tree.
 *
 * Same procedure as the other leave-one-out testers. Returns without doing
 * anything when opts.mv_gmm_fname is not set; exits the program when
 * training or prediction fails.
 */
static void leave_one_out_mv_gmm
(
    Vector_u32**       labels_out,
    Vector_d**         confs_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
    uint32_t           s;
    uint32_t           num_samples;
    MV_gmm_model_tree* model        = NULL;
    Vector_u32*        fit_labels   = NULL;
    Matrix_i32*        fit_markers  = NULL;
    Vector_u32*        held_labels  = NULL;
    Vector_d*          held_confs   = NULL;
    Matrix_i32*        held_markers = NULL;
    Error*             err;

    if (!opts.mv_gmm_fname)
    {
        return;
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(confs_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_mv_gmm_model_tree(&model, fit_labels, fit_markers,
                opts.mv_gmm_fname, opts.mv_gmm_dtd_fname);
        if (!err)
        {
            err = predict_labels_with_mv_gmm_model_tree(&held_labels,
                    &held_confs, held_markers, model, 0);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*confs_out)->elts[ s ]  = held_confs->elts[ 0 ];
    }

    free_vector_u32(held_labels);
    free_vector_d(held_confs);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_mv_gmm_model_tree(model);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            mv_gmm_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            mv_gmm_details_cnt_fname, mv_gmm_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            mv_gmm_confusion_cnt_fname, mv_gmm_confusion_pct_fname);
    write_leave_one_out_preds("mv-gmm", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            mv_gmm_preds_fname);
}

/**
 * @brief  Leave-one-out test of the SVM model tree.
 *
 * Compiled to an empty function unless HAPLO_ENABLE_SVM is defined.
 * Otherwise the same procedure as the other leave-one-out testers. Returns
 * without doing anything when opts.svm_fname is not set; exits the program
 * when training or prediction fails.
 */
static void leave_one_out_svm
(
    Vector_u32**       labels_out,
    Vector_d**         confs_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
#ifdef HAPLO_ENABLE_SVM
    uint32_t        s;
    uint32_t        num_samples;
    SVM_model_tree* model        = NULL;
    Vector_u32*     fit_labels   = NULL;
    Matrix_i32*     fit_markers  = NULL;
    Vector_u32*     held_labels  = NULL;
    Vector_d*       held_confs   = NULL;
    Matrix_i32*     held_markers = NULL;
    Error*          err;

    if (!opts.svm_fname)
    {
        return;
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(confs_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_svm_model_tree(&model, fit_labels, fit_markers,
                opts.svm_fname, opts.svm_dtd_fname);
        if (!err)
        {
            err = predict_labels_with_svm_model_tree(&held_labels,
                    &held_confs, held_markers, model);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*confs_out)->elts[ s ]  = held_confs->elts[ 0 ];
    }

    free_vector_u32(held_labels);
    free_vector_d(held_confs);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_svm_model_tree(model);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            svm_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            svm_details_cnt_fname, svm_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            svm_confusion_cnt_fname, svm_confusion_pct_fname);
    write_leave_one_out_preds("svm", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            svm_preds_fname);
#else
    return;
#endif
}

/**
 * @brief  Leave-one-out test of the Weka J48 model tree.
 *
 * Compiled to an empty function unless HAPLO_ENABLE_WEKA is defined.
 * A per-process scratch directory under tmp_dirname is created with
 * "mkdir -p" before the test and removed with "rm -rf" afterwards.
 * Otherwise the same procedure as the other leave-one-out testers. Returns
 * without doing anything when opts.weka_j48_fname is not set; exits the
 * program when the scratch directory cannot be created or when training or
 * prediction fails.
 *
 * NOTE(review): tmp_dirname is interpolated unquoted into a shell command.
 */
static void leave_one_out_j48
(
    Vector_u32**       labels_out,
    Vector_d**         confs_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
#ifdef HAPLO_ENABLE_WEKA
    uint32_t         s;
    uint32_t         num_samples;
    pid_t            pid;
    char             tmp_dir[1024] = {0};
    char             script[4096]  = {0};
    Weka_model_tree* model        = NULL;
    Vector_u32*      fit_labels   = NULL;
    Matrix_i32*      fit_markers  = NULL;
    Vector_u32*      held_labels  = NULL;
    Vector_d*        held_confs   = NULL;
    Matrix_i32*      held_markers = NULL;
    Error*           err;

    if (!opts.weka_j48_fname)
    {
        return;
    }

    pid = getpid();
    snprintf(tmp_dir, sizeof(tmp_dir), "%s/.haplo_test_leave_one_out_j48_%u",
            tmp_dirname, (unsigned)pid);
    snprintf(script, sizeof(script), "mkdir -p %s", tmp_dir);

    /* system() yields a wait status (or -1), so any non-zero value means
     * the directory was not created. */
    if (system(script) != 0)
    {
        print_error_msg_exit("haplo-test", "Could not create tmp files");
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(confs_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_weka_j48_model_tree(&model, fit_labels, fit_markers,
                opts.weka_j48_fname, opts.weka_dtd_fname,
                tmp_dir, opts.weka_jar_fname);
        if (!err)
        {
            err = predict_labels_with_weka_j48_model_tree(&held_labels,
                    &held_confs, held_markers, model, opts.weka_jar_fname);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*confs_out)->elts[ s ]  = held_confs->elts[ 0 ];
    }

    /* A failed clean-up is reported but must not discard the results that
     * have already been computed. */
    snprintf(script, sizeof(script), "rm -rf %s", tmp_dir);
    if (system(script) != 0)
    {
        print_error_msg("haplo-test", "Could not remove tmp files");
    }

    free_vector_u32(held_labels);
    free_vector_d(held_confs);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_weka_model_tree(model);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_j48_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_j48_details_cnt_fname, weka_j48_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_j48_confusion_cnt_fname, weka_j48_confusion_pct_fname);
    write_leave_one_out_preds("j48", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_j48_preds_fname);
#else
    return;
#endif
}

/**
 * @brief  Leave-one-out test of the Weka PART model tree.
 *
 * Compiled to an empty function unless HAPLO_ENABLE_WEKA is defined.
 * A per-process scratch directory under tmp_dirname is created with
 * "mkdir -p" before the test and removed with "rm -rf" afterwards.
 * Otherwise the same procedure as the other leave-one-out testers. Returns
 * without doing anything when opts.weka_part_fname is not set; exits the
 * program when the scratch directory cannot be created or when training or
 * prediction fails.
 *
 * NOTE(review): tmp_dirname is interpolated unquoted into a shell command.
 */
static void leave_one_out_part
(
    Vector_u32**       labels_out,
    Vector_d**         confs_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
#ifdef HAPLO_ENABLE_WEKA
    uint32_t         s;
    uint32_t         num_samples;
    pid_t            pid;
    char             tmp_dir[1024] = {0};
    char             script[4096]  = {0};
    Weka_model_tree* model        = NULL;
    Vector_u32*      fit_labels   = NULL;
    Matrix_i32*      fit_markers  = NULL;
    Vector_u32*      held_labels  = NULL;
    Vector_d*        held_confs   = NULL;
    Matrix_i32*      held_markers = NULL;
    Error*           err;

    if (!opts.weka_part_fname)
    {
        return;
    }

    pid = getpid();
    snprintf(tmp_dir, sizeof(tmp_dir), "%s/.haplo_test_leave_one_out_part_%u",
            tmp_dirname, (unsigned)pid);
    snprintf(script, sizeof(script), "mkdir -p %s", tmp_dir);

    /* system() yields a wait status (or -1), so any non-zero value means
     * the directory was not created. */
    if (system(script) != 0)
    {
        print_error_msg_exit("haplo-test", "Could not create tmp files");
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(confs_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_weka_part_model_tree(&model, fit_labels,
                fit_markers, opts.weka_part_fname,
                opts.weka_dtd_fname, tmp_dir,
                opts.weka_jar_fname);
        if (!err)
        {
            err = predict_labels_with_weka_part_model_tree(&held_labels,
                    &held_confs, held_markers, model, opts.weka_jar_fname);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*confs_out)->elts[ s ]  = held_confs->elts[ 0 ];
    }

    /* A failed clean-up is reported but must not discard the results that
     * have already been computed. */
    snprintf(script, sizeof(script), "rm -rf %s", tmp_dir);
    if (system(script) != 0)
    {
        print_error_msg("haplo-test", "Could not remove tmp files");
    }

    free_vector_u32(held_labels);
    free_vector_d(held_confs);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_weka_model_tree(model);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_part_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_part_details_cnt_fname, weka_part_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_part_confusion_cnt_fname, weka_part_confusion_pct_fname);
    write_leave_one_out_preds("part", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *confs_out, NULL,
            weka_part_preds_fname);
#else
    return;
#endif
}

/**
 * @brief  Leave-one-out test of the nearest model.
 *
 * Same procedure as the other leave-one-out testers, except that the second
 * per-sample output of the predictor is stored in @em dists_out and passed
 * to the report writers in the confidence position. Returns without doing
 * anything when opts.nearest_fname is not set; exits the program when
 * training or prediction fails.
 */
static void leave_one_out_nearest
(
    Vector_u32**       labels_out,
    Vector_d**         dists_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    const Matrix_i32*  data_markers
)
{
    uint32_t       s;
    uint32_t       num_samples;
    Nearest_model* nearest      = NULL;
    Vector_u32*    fit_labels   = NULL;
    Matrix_i32*    fit_markers  = NULL;
    Vector_u32*    held_labels  = NULL;
    Vector_d*      held_dists   = NULL;
    Matrix_i32*    held_markers = NULL;
    Error*         err;

    if (!opts.nearest_fname)
    {
        return;
    }

    num_samples = data_labels->num_elts;
    create_vector_u32(labels_out, num_samples);
    create_vector_d(dists_out, num_samples);

    for (s = 0; s < num_samples; s++)
    {
        create_leave_one_out_train_and_test_data(&fit_labels, &fit_markers,
                &held_markers, data_labels, data_markers, s);

        err = train_nearest_model(&nearest, fit_labels, fit_markers,
                opts.nearest_fname, opts.nearest_dtd_fname);
        if (!err)
        {
            err = predict_labels_with_nearest_model(&held_labels,
                    &held_dists, held_markers, nearest);
        }
        if (err)
        {
            print_error_msg_exit("haplo-test", err->msg);
        }

        assert(held_labels->num_elts == 1);
        (*labels_out)->elts[ s ] = held_labels->elts[ 0 ];
        (*dists_out)->elts[ s ]  = held_dists->elts[ 0 ];
    }

    free_vector_u32(held_labels);
    free_vector_d(held_dists);
    free_matrix_i32(held_markers);
    free_vector_u32(fit_labels);
    free_matrix_i32(fit_markers);
    free_nearest_model(nearest);

    find_ancestors(ancestor_types_out, ancestor_labels_out, *labels_out,
            data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *dists_out, NULL,
            nearest_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *dists_out, NULL,
            nearest_details_cnt_fname, nearest_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *dists_out, NULL,
            nearest_confusion_cnt_fname, nearest_confusion_pct_fname);
    write_leave_one_out_preds("nearest", data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, *dists_out, NULL,
            nearest_preds_fname);
}

/**
 * @brief  Combines the per-classifier predictions and reports the result.
 *
 * The label vectors of all classifiers are handed to
 * find_ancestors_of_sets(); its two results are copied into @em types_out
 * and @em labels_out and then written as summary and details reports.
 * Only the details report is filtered by @em types_out.
 */
static void leave_one_out_tandem_agree
(
    Vector_u32**       types_out,
    Vector_u32**       labels_out,
    Vector_u32**       ancestor_types_out,
    Vector_u32**       ancestor_labels_out,
    const Vector_u32*  nb_freq_labels,
    const Vector_u32*  nb_gauss_labels,
    const Vector_u32*  nb_gmm_labels,
    const Vector_u32*  mv_gmm_labels,
    const Vector_u32*  svm_labels,
    const Vector_u32*  j48_labels,
    const Vector_u32*  part_labels,
    const Vector_u32*  nearest_labels,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels
)
{
    find_ancestors_of_sets(ancestor_types_out, ancestor_labels_out,
            nb_freq_labels, nb_gauss_labels, nb_gmm_labels, mv_gmm_labels,
            svm_labels, j48_labels, part_labels, nearest_labels);

    copy_vector_u32(types_out, *ancestor_types_out);
    copy_vector_u32(labels_out, *ancestor_labels_out);

    write_leave_one_out_summary(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, NULL, NULL,
            tandem_agree_summary_fname);

    write_leave_one_out_details(data_ids, data_labels,
            *ancestor_types_out, *ancestor_labels_out,
            *labels_out, NULL, *types_out,
            tandem_agree_details_cnt_fname, tandem_agree_details_pct_fname);
}

/**
 * @brief  Reports the tandem predictions against the actual labels.
 *
 * Ancestors are computed with find_tandem_ancestors() and the summary,
 * details, confusion and prediction reports are written, each filtered by
 * @em tandem_types.
 *
 * NOTE(review): @em ancestor_types and @em ancestor_labels are received by
 * value but handed to find_tandem_ancestors() by address; if that routine
 * replaces the vectors, the caller's pointers are not updated -- confirm.
 */
static void leave_one_out_tandem
(
    const Vector_u32*  tandem_types,
    const Vector_u32*  tandem_labels,
    const Matblock_u8* data_ids,
    const Vector_u32*  data_labels,
    Vector_u32*        ancestor_types,
    Vector_u32*        ancestor_labels
)
{
    find_tandem_ancestors(&ancestor_types, &ancestor_labels, tandem_types,
            tandem_labels, data_labels);

    write_leave_one_out_summary(data_ids, data_labels,
            ancestor_types, ancestor_labels,
            tandem_labels, NULL, tandem_types,
            tandem_summary_fname);
    write_leave_one_out_details(data_ids, data_labels,
            ancestor_types, ancestor_labels,
            tandem_labels, NULL, tandem_types,
            tandem_details_cnt_fname, tandem_details_pct_fname);
    write_leave_one_out_confusion(data_ids, data_labels,
            ancestor_types, ancestor_labels,
            tandem_labels, NULL, tandem_types,
            tandem_confusion_cnt_fname, tandem_confusion_pct_fname);
    write_leave_one_out_preds("tandem", data_ids, data_labels,
            ancestor_types, ancestor_labels,
            tandem_labels, NULL, tandem_types,
            tandem_preds_fname);
}

03201 03206 static void create_cross_validation_train_and_test_data 03207 ( 03208 Matblock_u8**** train_ids_out, 03209 Vector_u32**** train_labels_out, 03210 Matrix_i32**** train_markers_out, 03211 Matblock_u8**** test_ids_out, 03212 Vector_u32**** test_labels_out, 03213 Matrix_i32**** test_markers_out, 03214 const Matblock_u8* data_ids, 03215 const Vector_u32* data_labels, 03216 const Matrix_i32* data_markers 03217 ) 03218 { 03219 uint32_t num_samples; 03220 uint32_t sample; 03221 uint32_t num_markers; 03222 uint32_t
marker; 03223 uint32_t num_ids; 03224 uint32_t id; 03225 uint32_t iter; 03226 uint32_t fold; 03227 uint32_t num_samples_mod_num_folds; 03228 uint32_t num_samples_per_fold; 03229 uint32_t num_times_subtract_a_sample; 03230 uint32_t train_id_sample; 03231 uint32_t train_label_sample; 03232 uint32_t train_marker_sample; 03233 uint32_t test_id_sample; 03234 uint32_t test_label_sample; 03235 uint32_t test_marker_sample; 03236 uint32_t c; 03237 int32_t i, j, k; 03238 03239 Matblock_u8*** train_ids; 03240 Vector_u32*** train_labels; 03241 Matrix_i32*** train_markers; 03242 Matblock_u8*** test_ids; 03243 Vector_u32*** test_labels; 03244 Matrix_i32*** test_markers; 03245 Vector_i32* fold_members; 03246 Matblock_u8* folded_ids; 03247 Vector_u32* folded_labels; 03248 Matrix_i32* folded_markers; 03249 03250 num_ids = (data_ids) ? data_ids->num_rows : 0; 03251 num_samples = data_markers->num_rows; 03252 num_markers = data_markers->num_cols; 03253 03254 assert(*train_ids_out = malloc(num_cv_iters*sizeof(void**))); 03255 assert(*train_labels_out = malloc(num_cv_iters*sizeof(void**))); 03256 assert(*train_markers_out = malloc(num_cv_iters*sizeof(void**))); 03257 assert(*test_ids_out = malloc(num_cv_iters*sizeof(void**))); 03258 assert(*test_labels_out = malloc(num_cv_iters*sizeof(void**))); 03259 assert(*test_markers_out = malloc(num_cv_iters*sizeof(void**))); 03260 for (iter = 0; iter < num_cv_iters; iter++) 03261 { 03262 assert((*train_ids_out)[iter] = calloc(num_cv_folds,sizeof(void*))); 03263 assert((*train_labels_out)[iter] = calloc(num_cv_folds,sizeof(void*))); 03264 assert((*train_markers_out)[iter] = calloc(num_cv_folds,sizeof(void*))); 03265 assert((*test_ids_out)[iter] = calloc(num_cv_folds,sizeof(void*))); 03266 assert((*test_labels_out)[iter] = calloc(num_cv_folds,sizeof(void*))); 03267 assert((*test_markers_out)[iter] = calloc(num_cv_folds,sizeof(void*))); 03268 } 03269 03270 train_ids = *train_ids_out; 03271 train_labels = *train_labels_out; 03272 train_markers = 
*train_markers_out; 03273 test_ids = *test_ids_out; 03274 test_labels = *test_labels_out; 03275 test_markers = *test_markers_out; 03276 03277 fold_members = NULL; 03278 03279 for (iter = 0; iter < num_cv_iters; iter++) 03280 { 03281 num_samples_mod_num_folds = num_samples % num_cv_folds; 03282 num_samples_per_fold = ceil(num_samples / (double)num_cv_folds); 03283 03284 num_times_subtract_a_sample = (num_samples_mod_num_folds > 0) ? 03285 num_cv_folds - num_samples_mod_num_folds : 0; 03286 03287 create_init_vector_i32(&fold_members, num_samples, -1); 03288 03289 for (fold = 0; fold < num_cv_folds; fold++) 03290 { 03291 if (num_times_subtract_a_sample > 0) 03292 { 03293 i = num_samples_per_fold - 1; 03294 num_times_subtract_a_sample--; 03295 } 03296 else 03297 { 03298 i = num_samples_per_fold; 03299 } 03300 j = i; 03301 03302 while (j > 0) 03303 { 03304 j--; 03305 k = -1; 03306 03307 while (k < 0) 03308 { 03309 k = floor((rand()/(double)RAND_MAX) * num_samples); 03310 if (k == (int32_t)num_samples) 03311 { 03312 k--; 03313 } 03314 03315 if (fold_members->elts[ k ] < 0) 03316 { 03317 fold_members->elts[ k ] = fold; 03318 } 03319 else 03320 { 03321 k = -1; 03322 } 03323 } 03324 } 03325 03326 if (data_ids) 03327 { 03328 create_zero_matblock_u8(&(test_ids[ iter ][ fold ]), i, 03329 num_ids, data_ids->num_cols); 03330 create_zero_matblock_u8(&(train_ids[ iter ][ fold ]), 03331 num_samples - i, num_ids, data_ids->num_cols); 03332 } 03333 create_zero_vector_u32(&(test_labels[ iter ][ fold ]), i); 03334 create_zero_matrix_i32(&(test_markers[ iter ][ fold ]), i, 03335 num_markers); 03336 create_zero_vector_u32(&(train_labels[ iter ][ fold ]), 03337 num_samples - i); 03338 create_zero_matrix_i32(&(train_markers[ iter ][ fold ]), 03339 num_samples - i, num_markers); 03340 } 03341 03342 for (fold = 0; fold < num_cv_folds; fold++) 03343 { 03344 test_id_sample = 0; 03345 test_label_sample = 0; 03346 test_marker_sample = 0; 03347 train_id_sample = 0; 03348 train_label_sample = 0; 
03349 train_marker_sample = 0; 03350 03351 for (sample = 0; sample < num_samples; sample++) 03352 { 03353 if (fold_members->elts[ sample ] == (int32_t)fold) 03354 { 03355 if (data_ids) 03356 { 03357 folded_ids = test_ids[ iter ][ fold ]; 03358 for (id = 0; id < num_ids; id++) 03359 { 03360 for (c = 0; c < data_ids->num_cols; c++) 03361 { 03362 folded_ids->elts[ test_id_sample ][ id ][ c ] = 03363 data_ids->elts[ sample ][ id ][ c ]; 03364 } 03365 } 03366 test_id_sample++; 03367 } 03368 03369 folded_labels = test_labels[ iter ][ fold ]; 03370 folded_labels->elts[ test_label_sample ] = 03371 data_labels->elts[ sample ]; 03372 03373 folded_markers = test_markers[ iter ][ fold ]; 03374 for (marker = 0; marker < num_markers; marker++) 03375 { 03376 folded_markers->elts[ test_marker_sample ][ marker ] = 03377 data_markers->elts[ sample ][ marker ]; 03378 } 03379 03380 test_label_sample++; 03381 test_marker_sample++; 03382 } 03383 else 03384 { 03385 if (data_ids) 03386 { 03387 folded_ids = train_ids[ iter ][ fold ]; 03388 for (id = 0; id < num_ids; id++) 03389 { 03390 for (c = 0; c < data_ids->num_cols; c++) 03391 { 03392 folded_ids->elts[ train_id_sample ][ id ][ c ] = 03393 data_ids->elts[ sample ][ id ][ c ]; 03394 } 03395 } 03396 train_id_sample++; 03397 } 03398 03399 folded_labels = train_labels[ iter ][ fold ]; 03400 folded_labels->elts[ train_label_sample ] = 03401 data_labels->elts[ sample ]; 03402 03403 folded_markers = train_markers[ iter ][ fold ]; 03404 for (marker = 0; marker < num_markers; marker++) 03405 { 03406 folded_markers->elts[ train_marker_sample ][ marker ] = 03407 data_markers->elts[ sample ][ marker ]; 03408 } 03409 03410 train_label_sample++; 03411 train_marker_sample++; 03412 } 03413 } 03414 } 03415 03416 } 03417 03418 free_vector_i32(fold_members); 03419 } 03420 03422 static void free_cross_validation_train_and_test_data 03423 ( 03424 Matblock_u8*** train_ids, 03425 Vector_u32*** train_labels, 03426 Matrix_i32*** train_markers, 03427 
Matblock_u8*** test_ids, 03428 Vector_u32*** test_labels, 03429 Matrix_i32*** test_markers 03430 ) 03431 { 03432 uint32_t i, j; 03433 03434 for (i = 0; i < num_cv_iters; i++) 03435 { 03436 for (j = 0; j < num_cv_folds; j++) 03437 { 03438 free_matblock_u8(train_ids[ i ][ j ]); 03439 free_vector_u32(train_labels[ i ][ j ]); 03440 free_matrix_i32(train_markers[ i ][ j ]); 03441 free_matblock_u8(test_ids[ i ][ j ]); 03442 free_vector_u32(test_labels[ i ][ j ]); 03443 free_matrix_i32(test_markers[ i ][ j ]); 03444 } 03445 03446 free(train_ids[ i ]); 03447 free(train_labels[ i ]); 03448 free(train_markers[ i ]); 03449 free(test_ids[ i ]); 03450 free(test_labels[ i ]); 03451 free(test_markers[ i ]); 03452 } 03453 03454 free(train_ids); 03455 free(train_labels); 03456 free(train_markers); 03457 free(test_ids); 03458 free(test_labels); 03459 free(test_markers); 03460 } 03461 03463 static void write_cross_validation_summary 03464 ( 03465 Vector_u32*** ancestor_types, 03466 Vector_u32*** ancestor_labels, 03467 Vector_u32*** pred_labels, 03468 Vector_d*** pred_confs, 03469 Vector_u32*** tandem_types, 03470 const char* fname 03471 ) 03472 { 03473 uint32_t n, N; 03474 uint32_t i, j; 03475 float D; 03476 FILE* fp; 03477 const char* fmt_1; 03478 const char* fmt_2; 03479 xmlDoc* xml_doc = NULL; 03480 xmlNode* xml_root = NULL; 03481 xmlNode* xml_node[3]; 03482 xmlNode* xml_child = NULL; 03483 char xml_buf[256]; 03484 Error* err; 03485 03486 Matrix_f* counts = NULL; 03487 Vector_f* means = NULL; 03488 Vector_f* vars = NULL; 03489 Vector_f* errs = NULL; 03490 03491 if ((err = open_output(&fp, &xml_doc, "haplo-test-cv-summary-out", 03492 "haplo-test-cv-summary-out.dtd", fname))) 03493 { 03494 print_error_msg("haplo-test", err->msg); 03495 } 03496 03497 create_zero_matrix_f(&counts, num_cv_iters*num_cv_folds, 3); 03498 03499 if (opts.header_out && opts.output_format != HAPLO_OUTPUT_XML) 03500 { 03501 switch (opts.output_format) 03502 { 03503 case HAPLO_OUTPUT_TXT: 03504 fmt_1 = "%-21s 
%-21s %-21s\n"; 03505 fmt_2 = "%-8s %-6s %-5s %-8s %-6s %-5s %-8s %-6s %-5s\n"; 03506 break; 03507 case HAPLO_OUTPUT_CSV: 03508 fmt_1 = "%s,,,%s,,,%s,,\n"; 03509 fmt_2 = "%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; 03510 break; 03511 case HAPLO_OUTPUT_XML: 03512 break; 03513 } 03514 fprintf(fp, fmt_1, "Direct", "Indirect", "None"); 03515 fprintf(fp, fmt_2, "Mean", "Dev", "Err", "Mean", "Dev", "Err", "Mean", 03516 "Dev", "Err"); 03517 } 03518 else if (opts.output_format == HAPLO_OUTPUT_XML) 03519 { 03520 xml_root = xmlDocGetRootElement(xml_doc); 03521 } 03522 03523 for (i = 0; i < num_cv_iters; i++) 03524 { 03525 for (j = 0; j < num_cv_folds; j++) 03526 { 03527 N = ancestor_types[ i ][ j ]->num_elts; 03528 03529 for (n = 0; n < N; n++) 03530 { 03531 if (!tandem_types || 03532 tandem_types[i][j]->elts[n] != HAPLO_ANCESTOR_NONE) 03533 { 03534 switch (ancestor_types[ i ][ j ]->elts[ n ]) 03535 { 03536 case HAPLO_ANCESTOR_DIRECT: 03537 counts->elts[ i*num_cv_folds + j ][ 0 ]++; 03538 break; 03539 case HAPLO_ANCESTOR_INDIRECT: 03540 counts->elts[ i*num_cv_folds + j ][ 1 ]++; 03541 break; 03542 case HAPLO_ANCESTOR_NONE: 03543 counts->elts[ i*num_cv_folds + j ][ 2 ]++; 03544 break; 03545 } 03546 } 03547 } 03548 } 03549 } 03550 03551 ind_mv_sample_stats_f(&means, &vars, &errs, counts); 03552 03553 switch (opts.output_format) 03554 { 03555 case HAPLO_OUTPUT_TXT: 03556 fmt_1 = "%-8.1f %-6.1f %-5.1f %-8.1f %-6.1f %-5.1f %-8.1f %-6.1f %-5.1f\n"; 03557 fprintf(fp, fmt_1, means->elts[0], sqrt(vars->elts[0]), 03558 errs->elts[0], means->elts[1], sqrt(vars->elts[1]), 03559 errs->elts[1], means->elts[2], sqrt(vars->elts[2]), 03560 errs->elts[2]); 03561 break; 03562 case HAPLO_OUTPUT_CSV: 03563 fmt_1 = "%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f\n"; 03564 fprintf(fp, fmt_1, means->elts[0], sqrt(vars->elts[0]), 03565 errs->elts[0], means->elts[1], sqrt(vars->elts[1]), 03566 errs->elts[1], means->elts[2], sqrt(vars->elts[2]), 03567 errs->elts[2]); 03568 break; 03569 case HAPLO_OUTPUT_XML: 03570 
xml_node[0] = XMLNewChild(xml_root, "direct", NULL); 03571 xml_node[1] = XMLNewChild(xml_root, "indirect", NULL); 03572 xml_node[2] = XMLNewChild(xml_root, "none", NULL); 03573 for (i = 0; i < 3; i++) 03574 { 03575 xml_child = XMLNewChild(xml_node[ i ], "count", NULL); 03576 snprintf(xml_buf, 256, "%.1f", means->elts[ i ]); 03577 XMLNewChild(xml_child, "mean", xml_buf); 03578 snprintf(xml_buf, 256, "%.1f", sqrt(vars->elts[ i ])); 03579 XMLNewChild(xml_child, "dev", xml_buf); 03580 snprintf(xml_buf, 256, "%.1f", errs->elts[ i ]); 03581 XMLNewChild(xml_child, "err", xml_buf); 03582 } 03583 break; 03584 } 03585 03586 D = means->elts[0] + means->elts[1] + means->elts[2]; 03587 if (D > 0) 03588 { 03589 multiply_vector_by_scalar_f(&means, means, 1/D); 03590 multiply_vector_by_scalar_f(&vars, vars, 1/(D*D)); 03591 multiply_vector_by_scalar_f(&errs, errs, 1/D); 03592 } 03593 03594 switch (opts.output_format) 03595 { 03596 case HAPLO_OUTPUT_TXT: 03597 fmt_1 = "%-8.3f %-6.3f %-5.3f %-8.3f %-6.3f %-5.3f %-8.3f %-6.3f %-5.3f\n"; 03598 fprintf(fp, fmt_1, means->elts[0], sqrt(vars->elts[0]), 03599 errs->elts[0], means->elts[1], sqrt(vars->elts[1]), 03600 errs->elts[1], means->elts[2], sqrt(vars->elts[2]), 03601 errs->elts[2]); 03602 break; 03603 case HAPLO_OUTPUT_CSV: 03604 fmt_1 = "%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n"; 03605 fprintf(fp, fmt_1, means->elts[0], sqrt(vars->elts[0]), 03606 errs->elts[0], means->elts[1], sqrt(vars->elts[1]), 03607 errs->elts[1], means->elts[2], sqrt(vars->elts[2]), 03608 errs->elts[2]); 03609 break; 03610 case HAPLO_OUTPUT_XML: 03611 for (i = 0; i < 3; i++) 03612 { 03613 xml_child = XMLNewChild(xml_node[ i ], "percent", NULL); 03614 snprintf(xml_buf, 256, "%.3f", means->elts[ i ]); 03615 XMLNewChild(xml_child, "mean", xml_buf); 03616 snprintf(xml_buf, 256, "%.3f", sqrt(vars->elts[ i ])); 03617 XMLNewChild(xml_child, "dev", xml_buf); 03618 snprintf(xml_buf, 256, "%.3f", errs->elts[ i ]); 03619 XMLNewChild(xml_child, "err", xml_buf); 03620 
} 03621 break; 03622 } 03623 03624 free_matrix_f(counts); 03625 free_vector_f(means); 03626 free_vector_f(vars); 03627 free_vector_f(errs); 03628 03629 if ((err = close_output(fp, xml_doc, fname))) 03630 { 03631 print_error_msg("haplo-test", err->msg); 03632 } 03633 } 03634 03645 static void write_cross_validation_details 03646 ( 03647 Vector_u32*** ancestor_types, 03648 Vector_u32*** ancestor_labels, 03649 Vector_u32*** pred_labels, 03650 Vector_d*** pred_confs, 03651 Vector_u32*** tandem_types, 03652 const char* cnt_fname, 03653 const char* pct_fname 03654 ) 03655 { 03656 uint32_t i, j, k; 03657 uint32_t n, nn, N; 03658 float D; 03659 const char* label; 03660 const char* fmt_1; 03661 const char* fmt_2; 03662 FILE* cnt_fp; 03663 FILE* pct_fp; 03664 xmlDoc* xml_doc = NULL; 03665 xmlNode* xml_root = NULL; 03666 xmlNode* xml_nd1 = NULL; 03667 xmlNode* xml_nd2[3] = {0}; 03668 xmlNode* xml_nd3 = NULL; 03669 char xml_buf[256]; 03670 Error* err; 03671 03672 Matrix_f* counts[3] = {0}; 03673 Vector_f* stats[3][3] = {{0}}; 03674 03675 if (!(pred_labels[0][0])) 03676 return; 03677 03678 if ((err = open_output(&cnt_fp, &xml_doc, "haplo-test-cv-details-out", 03679 "haplo-test-cv-details-out.dtd", cnt_fname)) || 03680 (err = open_output(&pct_fp, NULL, NULL, NULL, pct_fname))) 03681 { 03682 print_error_msg("haplo-test", err->msg); 03683 } 03684 03685 N = get_num_haplo_groups(); 03686 for (i = 0; i < 3; i++) 03687 { 03688 create_zero_matrix_f(&(counts[ i ]), num_cv_iters*num_cv_folds, N); 03689 } 03690 03691 for (i = 0; i < num_cv_iters; i++) 03692 { 03693 for (j = 0; j < num_cv_folds; j++) 03694 { 03695 for (k = 0; k < ancestor_types[ i ][ j ]->num_elts; k++) 03696 { 03697 if (!tandem_types || 03698 tandem_types[i][j]->elts[k] != HAPLO_ANCESTOR_NONE) 03699 { 03700 assert(pred_labels[ i ][ j ]->elts[ k ] < N); 03701 n = pred_labels[ i ][ j ]->elts[ k ]; 03702 03703 switch (ancestor_types[ i ][ j ]->elts[ k ]) 03704 { 03705 case HAPLO_ANCESTOR_DIRECT: 03706 counts[0]->elts[ 
i*num_cv_folds + j ][ n ]++; 03707 break; 03708 case HAPLO_ANCESTOR_INDIRECT: 03709 counts[1]->elts[ i*num_cv_folds + j ][ n ]++; 03710 break; 03711 case HAPLO_ANCESTOR_NONE: 03712 counts[2]->elts[ i*num_cv_folds + j ][ n ]++; 03713 break; 03714 } 03715 } 03716 } 03717 } 03718 } 03719 03720 for (i = 0; i < 3; i++) 03721 { 03722 ind_mv_sample_stats_f(&(stats[ i ][0]), &(stats[ i ][1]), 03723 &(stats[ i ][2]), counts[ i ]); 03724 } 03725 03726 if (opts.header_out && opts.output_format != HAPLO_OUTPUT_XML) 03727 { 03728 switch (opts.output_format) 03729 { 03730 case HAPLO_OUTPUT_TXT: 03731 fmt_1 = "%-10s %-21s %-21s %-21s\n"; 03732 fmt_2 = "%-10s %-8s %-6s %-5s %-8s %-6s %-5s %-8s %-6s %-5s\n"; 03733 break; 03734 case HAPLO_OUTPUT_CSV: 03735 fmt_1 = "%s,%s,,,%s,,,%s,,\n"; 03736 fmt_2 = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n"; 03737 break; 03738 case HAPLO_OUTPUT_XML: 03739 break; 03740 } 03741 fprintf(cnt_fp, fmt_1, "Predicted", "Direct", "Indirect", "None"); 03742 fprintf(cnt_fp, fmt_2, "Label", "Mean", "Dev", "Err", "Mean", "Dev", 03743 "Err", "Mean", "Dev", "Err"); 03744 fprintf(pct_fp, fmt_1, "Predicted", "Direct", "Indirect", "None"); 03745 fprintf(pct_fp, fmt_2, "Label", "Mean", "Dev", "Err", "Mean", "Dev", 03746 "Err", "Mean", "Dev", "Err"); 03747 } 03748 else if (opts.output_format == HAPLO_OUTPUT_XML) 03749 { 03750 xml_root = xmlDocGetRootElement(xml_doc); 03751 } 03752 03753 nn = 0; 03754 for (n = 0; n < N; n++) 03755 { 03756 D = stats[0][0]->elts[n] + stats[1][0]->elts[n] + stats[2][0]->elts[n]; 03757 if (D > 0) 03758 { 03759 lookup_haplo_group_label_from_index(&label, n); 03760 03761 switch (opts.output_format) 03762 { 03763 case HAPLO_OUTPUT_TXT: 03764 fmt_1 = "%-10s %-8.1f %-6.1f %-5.1f %-8.1f %-6.1f %-5.1f %-8.1f %-6.1f %-5.1f\n"; 03765 fprintf(cnt_fp, fmt_1, label, 03766 stats[0][0]->elts[n], sqrt(stats[0][1]->elts[n]), stats[0][2]->elts[n], 03767 stats[1][0]->elts[n], sqrt(stats[1][1]->elts[n]), stats[1][2]->elts[n], 03768 stats[2][0]->elts[n], 
sqrt(stats[2][1]->elts[n]), stats[2][2]->elts[n]); 03769 fmt_1 = "%-10s %-8.3f %-6.3f %-5.3f %-8.3f %-6.3f %-5.3f %-8.3f %-6.3f %-5.3f\n"; 03770 fprintf(pct_fp, fmt_1, label, 03771 stats[0][0]->elts[n]/D, sqrt(stats[0][1]->elts[n])/D, stats[0][2]->elts[n]/D, 03772 stats[1][0]->elts[n]/D, sqrt(stats[1][1]->elts[n])/D, stats[1][2]->elts[n]/D, 03773 stats[2][0]->elts[n]/D, sqrt(stats[2][1]->elts[n])/D, stats[2][2]->elts[n]/D); 03774 break; 03775 case HAPLO_OUTPUT_CSV: 03776 fmt_1 = "%s,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f,%.1f\n"; 03777 fprintf(cnt_fp, fmt_1, label, 03778 stats[0][0]->elts[n], sqrt(stats[0][1]->elts[n]), stats[0][2]->elts[n], 03779 stats[1][0]->elts[n], sqrt(stats[1][1]->elts[n]), stats[1][2]->elts[n], 03780 stats[2][0]->elts[n], sqrt(stats[2][1]->elts[n]), stats[2][2]->elts[n]); 03781 fmt_1 = "%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f\n"; 03782 fprintf(pct_fp, fmt_1, label, 03783 stats[0][0]->elts[n]/D, sqrt(stats[0][1]->elts[n])/D, stats[0][2]->elts[n]/D, 03784 stats[1][0]->elts[n]/D, sqrt(stats[1][1]->elts[n])/D, stats[1][2]->elts[n]/D, 03785 stats[2][0]->elts[n]/D, sqrt(stats[2][1]->elts[n])/D, stats[2][2]->elts[n]/D); 03786 break; 03787 case HAPLO_OUTPUT_XML: 03788 xml_nd1 = XMLNewChild(xml_root, "predicted", NULL); 03789 snprintf(xml_buf, 256, "%d", nn+1); 03790 XMLNewProp(xml_nd1, "number", xml_buf); 03791 XMLNewChild(xml_nd1, "label", label); 03792 xml_nd2[0] = XMLNewChild(xml_nd1, "direct", 0); 03793 xml_nd2[1] = XMLNewChild(xml_nd1, "indirect", 0); 03794 xml_nd2[2] = XMLNewChild(xml_nd1, "none", 0); 03795 for (i = 0; i < 3; i++) 03796 { 03797 xml_nd3 = XMLNewChild(xml_nd2[i], "count", NULL); 03798 snprintf(xml_buf, 256, "%.1f", stats[i][0]->elts[n]); 03799 XMLNewChild(xml_nd3, "mean", xml_buf); 03800 snprintf(xml_buf, 256, "%.1f", 03801 sqrt(stats[i][1]->elts[n])); 03802 XMLNewChild(xml_nd3, "dev", xml_buf); 03803 snprintf(xml_buf, 256, "%.1f", stats[i][2]->elts[n]); 03804 XMLNewChild(xml_nd3, "err", xml_buf); 03805 03806 xml_nd3 = 
XMLNewChild(xml_nd2[i], "percent", NULL); 03807 snprintf(xml_buf, 256, "%.1f", 03808 stats[i][0]->elts[n]/D); 03809 XMLNewChild(xml_nd3, "mean", xml_buf); 03810 snprintf(xml_buf, 256, "%.1f", 03811 sqrt(stats[i][1]->elts[n])/D); 03812 XMLNewChild(xml_nd3, "dev", xml_buf); 03813 snprintf(xml_buf, 256, "%.1f", 03814 stats[i][2]->elts[n]/D); 03815 XMLNewChild(xml_nd3, "err", xml_buf); 03816 } 03817 break; 03818 } 03819 03820 nn++; 03821 } 03822 } 03823 03824 for (i = 0; i < 3; i++) 03825 { 03826 free_matrix_f(counts[ i ]); 03827 03828 for (j = 0; j < 3; j++) 03829 { 03830 free_vector_f(stats[ i ][ j ]); 03831 } 03832 } 03833 03834 if ((err = close_output(cnt_fp, xml_doc, cnt_fname)) || 03835 (err = close_output(pct_fp, NULL, pct_fname))) 03836 { 03837 print_error_msg("haplo-test", err->msg); 03838 } 03839 } 03840 03844 static void write_cross_validation_confusion 03845 ( 03846 Vector_u32*** test_labels, 03847 Vector_u32*** pred_labels, 03848 Vector_d*** pred_confs, 03849 Vector_u32*** tandem_types, 03850 const char* cnt_fname, 03851 const char* pct_fname 03852 ) 03853 { 03854 uint32_t i, j, k; 03855 uint32_t n_1, n_2, N; 03856 uint32_t nn_1, nn_2; 03857 float D; 03858 const char* label; 03859 FILE* cnt_fp; 03860 FILE* pct_fp; 03861 xmlDoc* xml_doc = NULL; 03862 xmlNode* xml_root = NULL; 03863 xmlNode* xml_actual = NULL; 03864 xmlNode* xml_pred = NULL; 03865 char xml_buf[256]; 03866 Error* err; 03867 03868 Vector_f* actual_counts = NULL; 03869 Vector_f* predicted_counts = NULL; 03870 Matrix_f* confusion = NULL; 03871 Matrix_f* confusion_pct = NULL; 03872 03873 if (!(pred_labels[0][0])) 03874 return; 03875 03876 if ((err = open_output(&cnt_fp, &xml_doc, "haplo-test-cv-confusion-out", 03877 "haplo-test-cv-confusion-out.dtd", cnt_fname)) || 03878 (err = open_output(&pct_fp, NULL, NULL, NULL, pct_fname))) 03879 { 03880 print_error_msg("haplo-test", err->msg); 03881 } 03882 03883 N = get_num_haplo_groups(); 03884 create_zero_vector_f(&actual_counts, N); 03885 
create_zero_vector_f(&predicted_counts, N); 03886 create_zero_matrix_f(&confusion, N, N); 03887 create_zero_matrix_f(&confusion_pct, N, N); 03888 03889 for (i = 0; i < num_cv_iters; i++) 03890 { 03891 for (j = 0; j < num_cv_folds; j++) 03892 { 03893 for (k = 0; k < test_labels[ i ][ j ]->num_elts; k++) 03894 { 03895 if (!tandem_types || 03896 tandem_types[i][j]->elts[k] != HAPLO_ANCESTOR_NONE) 03897 { 03898 assert(test_labels[ i ][ j ]->elts[ k ] < N); 03899 assert(pred_labels[ i ][ j ]->elts[ k ] < N); 03900 03901 actual_counts->elts[ test_labels[ i ][ j ]->elts[ k ] ]++; 03902 predicted_counts->elts[ pred_labels[ i ][ j ]->elts[ k ] ]++; 03903 confusion->elts[ test_labels[ i ][ j ]->elts[ k ] ] 03904 [ pred_labels[ i ][ j ]->elts[ k ] ]++; 03905 } 03906 } 03907 } 03908 } 03909 03910 multiply_vector_by_scalar_f(&actual_counts, actual_counts, 03911 1.0f/(num_cv_iters*num_cv_folds)); 03912 multiply_vector_by_scalar_f(&predicted_counts, predicted_counts, 03913 1.0f/(num_cv_iters*num_cv_folds)); 03914 multiply_matrix_by_scalar_f(&confusion, confusion, 03915 1.0f/(num_cv_iters*num_cv_folds)); 03916 03917 copy_matrix_f(&confusion_pct, confusion); 03918 for (n_2 = 0; n_2 < N; n_2++) 03919 { 03920 D = predicted_counts->elts[ n_2 ]; 03921 if (D > 0) 03922 { 03923 for (n_1 = 0; n_1 < N; n_1++) 03924 { 03925 confusion_pct->elts[ n_1 ][ n_2 ] /= D; 03926 } 03927 } 03928 } 03929 03930 if (opts.header_out && opts.output_format != HAPLO_OUTPUT_XML) 03931 { 03932 switch (opts.output_format) 03933 { 03934 case HAPLO_OUTPUT_TXT: 03935 fprintf(cnt_fp, "%-10s", "Actual"); 03936 fprintf(pct_fp, "%-10s", "Actual"); 03937 for (n_2 = 0; n_2 < N; n_2++) 03938 { 03939 if (predicted_counts->elts[ n_2 ]) 03940 { 03941 lookup_haplo_group_label_from_index(&label, n_2); 03942 fprintf(cnt_fp, " %-10s", label); 03943 fprintf(pct_fp, " %-10s", label); 03944 } 03945 } 03946 break; 03947 case HAPLO_OUTPUT_CSV: 03948 fprintf(cnt_fp, "%s", "Actual"); 03949 fprintf(pct_fp, "%s", "Actual"); 03950 for 
(n_2 = 0; n_2 < N; n_2++) 03951 { 03952 if (predicted_counts->elts[ n_2 ]) 03953 { 03954 lookup_haplo_group_label_from_index(&label, n_2); 03955 fprintf(cnt_fp, ",%s", label); 03956 fprintf(pct_fp, ",%s", label); 03957 } 03958 } 03959 break; 03960 case HAPLO_OUTPUT_XML: 03961 break; 03962 } 03963 fprintf(cnt_fp, "\n"); 03964 fprintf(pct_fp, "\n"); 03965 } 03966 else if (opts.output_format == HAPLO_OUTPUT_XML) 03967 { 03968 xml_root = xmlDocGetRootElement(xml_doc); 03969 } 03970 03971 nn_1 = 0; 03972 for (n_1 = 0; n_1 < N; n_1++) 03973 { 03974 switch (opts.output_format) 03975 { 03976 case HAPLO_OUTPUT_TXT: 03977 if (actual_counts->elts[ n_1 ] > 0) 03978 { 03979 lookup_haplo_group_label_from_index(&label, n_1); 03980 fprintf(cnt_fp, "%-10s", label); 03981 fprintf(pct_fp, "%-10s", label); 03982 03983 nn_2 = 0; 03984 for (n_2 = 0; n_2 < N; n_2++) 03985 { 03986 if (predicted_counts->elts[ n_2 ] > 0) 03987 { 03988 fprintf(cnt_fp, " %-10.1f", 03989 confusion->elts[ n_1 ][ n_2 ]); 03990 fprintf(pct_fp, " %-10.3f", 03991 confusion_pct->elts[ n_1 ][ n_2 ]); 03992 nn_2++; 03993 } 03994 } 03995 fprintf(cnt_fp, "\n"); 03996 fprintf(pct_fp, "\n"); 03997 nn_1++; 03998 } 03999 break; 04000 case HAPLO_OUTPUT_CSV: 04001 if (actual_counts->elts[ n_1 ] > 0) 04002 { 04003 lookup_haplo_group_label_from_index(&label, n_1); 04004 fprintf(cnt_fp, "%s", label); 04005 fprintf(pct_fp, "%s", label); 04006 04007 nn_2 = 0; 04008 for (n_2 = 0; n_2 < N; n_2++) 04009 { 04010 if (predicted_counts->elts[ n_2 ] > 0) 04011 { 04012 fprintf(cnt_fp, ",%.1f", 04013 confusion->elts[ n_1 ][ n_2 ]); 04014 fprintf(pct_fp, ",%.3f", 04015 confusion_pct->elts[ n_1 ][ n_2 ]); 04016 nn_2++; 04017 } 04018 } 04019 fprintf(cnt_fp, "\n"); 04020 fprintf(pct_fp, "\n"); 04021 nn_1++; 04022 } 04023 break; 04024 case HAPLO_OUTPUT_XML: 04025 if (actual_counts->elts[ n_1 ] > 0) 04026 { 04027 lookup_haplo_group_label_from_index(&label, n_1); 04028 xml_actual = XMLNewChild(xml_root, "actual", NULL); 04029 snprintf(xml_buf, 
256, "%d", nn_1+1); 04030 XMLNewProp(xml_actual, "number", xml_buf); 04031 XMLNewChild(xml_actual, "label", label); 04032 04033 nn_2 = 0; 04034 for (n_2 = 0; n_2 < N; n_2++) 04035 { 04036 if (predicted_counts->elts[ n_2 ] > 0) 04037 { 04038 lookup_haplo_group_label_from_index(&label, n_2); 04039 xml_pred = XMLNewChild(xml_actual, "predicted", 0); 04040 snprintf(xml_buf, 256, "%d", nn_2+1); 04041 XMLNewProp(xml_pred, "number", xml_buf); 04042 XMLNewChild(xml_pred, "label", label); 04043 04044 snprintf(xml_buf, 256, "%.1f", 04045 confusion->elts[n_1][n_2]); 04046 XMLNewChild(xml_pred, "count", xml_buf); 04047 04048 snprintf(xml_buf, 256, "%.3f", 04049 confusion_pct->elts[n_1][n_2]); 04050 XMLNewChild(xml_pred, "percent", xml_buf); 04051 04052 nn_2++; 04053 } 04054 } 04055 nn_1++; 04056 } 04057 break; 04058 } 04059 } 04060 04061 free_vector_f(actual_counts); 04062 free_vector_f(predicted_counts); 04063 free_matrix_f(confusion); 04064 free_matrix_f(confusion_pct); 04065 04066 if ((err = close_output(cnt_fp, xml_doc, cnt_fname)) || 04067 (err = close_output(pct_fp, NULL, pct_fname))) 04068 { 04069 print_error_msg("haplo-test", err->msg); 04070 } 04071 } 04072 04074 static void write_cross_validation_preds 04075 ( 04076 const char* type, 04077 Matblock_u8*** data_ids, 04078 Vector_u32*** data_labels, 04079 Vector_u32*** ancestor_types, 04080 Vector_u32*** ancestor_labels, 04081 Vector_u32*** pred_labels, 04082 Vector_d*** pred_confs, 04083 Vector_u32*** tandem_types, 04084 const char* fname 04085 ) 04086 { 04087 uint32_t i, j, k; 04088 FILE* fp; 04089 xmlDoc* xml_doc = NULL; 04090 xmlNode* xml_root = NULL; 04091 xmlNode* xml_node = NULL; 04092 char xml_buf[256] = {0}; 04093 Error* err; 04094 04095 if (!(pred_labels[0][0])) 04096 return; 04097 04098 if ((err = open_output(&fp, &xml_doc, "haplo-test-cv-predictions-out", 04099 "haplo-test-cv-predictions-out.dtd", fname))) 04100 { 04101 print_error_msg("haplo-test", err->msg); 04102 } 04103 04104 if (opts.header_out && 
opts.output_format != HAPLO_OUTPUT_XML) 04105 { 04106 if (data_ids) 04107 { 04108 for (j = 0; j < data_ids[0][0]->num_rows; j++) 04109 { 04110 switch (opts.output_format) 04111 { 04112 case HAPLO_OUTPUT_TXT: 04113 fprintf(fp, "ID %-7d ", j+1); 04114 break; 04115 case HAPLO_OUTPUT_CSV: 04116 fprintf(fp, "ID %d,", j+1); 04117 break; 04118 case HAPLO_OUTPUT_XML: 04119 break; 04120 } 04121 } 04122 } 04123 04124 switch (opts.output_format) 04125 { 04126 case HAPLO_OUTPUT_TXT: 04127 fprintf(fp, "%-10s %-10s %-4s", "Actual", "Ancestor", "Type"); 04128 if (pred_labels) 04129 fprintf(fp, " %-10s %-5s", "Prediction", "Conf"); 04130 break; 04131 case HAPLO_OUTPUT_CSV: 04132 fprintf(fp, "%s,%s,%s", "Actual", "Ancestor", "Type"); 04133 if (pred_labels) 04134 fprintf(fp, ",%s,%s", "Prediction", "Conf"); 04135 break; 04136 case HAPLO_OUTPUT_XML: 04137 break; 04138 } 04139 fprintf(fp, "\n"); 04140 } 04141 else if (opts.output_format == HAPLO_OUTPUT_XML) 04142 { 04143 xml_root = xmlDocGetRootElement(xml_doc); 04144 } 04145 04146 for (i = 0; i < num_cv_iters; i++) 04147 { 04148 for (j = 0; j < num_cv_folds; j++) 04149 { 04150 for (k = 0; k < data_labels[i][j]->num_elts; k++) 04151 { 04152 if (!tandem_types || 04153 tandem_types[i][j]->elts[k] != HAPLO_ANCESTOR_NONE) 04154 { 04155 if (opts.output_format == HAPLO_OUTPUT_XML) 04156 { 04157 xml_node = XMLNewChild(xml_root, "sample", NULL); 04158 snprintf(xml_buf, 256, "%d", k+1); 04159 XMLNewProp(xml_node, "number", xml_buf); 04160 } 04161 04162 write_ids(data_ids[i][j], k, HAPLO_SEP_SUFFIX, fp, xml_node); 04163 write_label(data_labels[i][j], k, HAPLO_SEP_SUFFIX, fp, 04164 xml_node); 04165 04166 write_ancestor_label(ancestor_types[i][j], 04167 ancestor_labels[i][j], k, HAPLO_SEP_NONE, fp, xml_node); 04168 04169 write_prediction(type, pred_labels[i][j], (pred_confs) ? 
04170 pred_confs[i][j] : NULL, k, HAPLO_SEP_PREFIX, fp, 04171 xml_node); 04172 04173 if (opts.output_format != HAPLO_OUTPUT_XML) 04174 { 04175 fprintf(fp, "\n"); 04176 } 04177 } 04178 } 04179 } 04180 } 04181 04182 if ((err = close_output(fp, xml_doc, fname))) 04183 { 04184 print_error_msg("haplo-test", err->msg); 04185 } 04186 } 04187 04189 static void cross_validate_nb_freq 04190 ( 04191 Vector_u32**** labels_out, 04192 Vector_d**** confs_out, 04193 Matblock_u8*** train_ids, 04194 Vector_u32*** train_labels, 04195 Matrix_i32*** train_markers, 04196 Matblock_u8*** test_ids, 04197 Vector_u32*** test_labels, 04198 Matrix_i32*** test_markers, 04199 Vector_u32*** ancestor_types, 04200 Vector_u32*** ancestor_labels 04201 ) 04202 { 04203 uint32_t i, j; 04204 04205 NB_freq_model_tree* tree = NULL; 04206 Error* err; 04207 04208 if (!opts.nb_freq_fname) 04209 return; 04210 04211 for (i = 0; i < num_cv_iters; i++) 04212 { 04213 for (j = 0; j < num_cv_folds; j++) 04214 { 04215 if ((err = train_nb_freq_model_tree(&tree, train_labels[i][j], 04216 train_markers[i][j], opts.nb_freq_fname, 04217 opts.nb_freq_dtd_fname)) || 04218 (err = predict_labels_with_nb_freq_model_tree( 04219 &((*labels_out)[i][j]), 04220 &((*confs_out)[i][j]), test_markers[i][j], 04221 tree, 0))) 04222 { 04223 print_error_msg_exit("haplo-test", err->msg); 04224 } 04225 04226 find_ancestors(&(ancestor_types[i][j]), 04227 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04228 test_labels[i][j]); 04229 } 04230 } 04231 04232 free_nb_freq_model_tree(tree); 04233 04234 write_cross_validation_summary(ancestor_types, ancestor_labels, 04235 *labels_out, *confs_out, NULL, nb_freq_summary_fname); 04236 04237 write_cross_validation_details(ancestor_types, ancestor_labels, 04238 *labels_out, *confs_out, NULL, nb_freq_details_cnt_fname, 04239 nb_freq_details_pct_fname); 04240 04241 write_cross_validation_confusion(test_labels, *labels_out, *confs_out, 04242 NULL, nb_freq_confusion_cnt_fname, nb_freq_confusion_pct_fname); 
04243 04244 write_cross_validation_preds("nb-freq", test_ids, test_labels, 04245 ancestor_types, ancestor_labels, *labels_out, *confs_out, 04246 NULL, nb_freq_preds_fname); 04247 } 04248 04250 static void cross_validate_nb_gauss 04251 ( 04252 Vector_u32**** labels_out, 04253 Vector_d**** confs_out, 04254 Matblock_u8*** train_ids, 04255 Vector_u32*** train_labels, 04256 Matrix_i32*** train_markers, 04257 Matblock_u8*** test_ids, 04258 Vector_u32*** test_labels, 04259 Matrix_i32*** test_markers, 04260 Vector_u32*** ancestor_types, 04261 Vector_u32*** ancestor_labels 04262 ) 04263 { 04264 uint32_t i, j; 04265 04266 NB_gauss_model_tree* tree = NULL; 04267 Error* err; 04268 04269 if (!opts.nb_gauss_fname) 04270 return; 04271 04272 for (i = 0; i < num_cv_iters; i++) 04273 { 04274 for (j = 0; j < num_cv_folds; j++) 04275 { 04276 if ((err = train_nb_gauss_model_tree(&tree, train_labels[i][j], 04277 train_markers[i][j], opts.nb_gauss_fname, 04278 opts.nb_gauss_dtd_fname)) || 04279 (err = predict_labels_with_nb_gauss_model_tree( 04280 &((*labels_out)[i][j]), 04281 &((*confs_out)[i][j]), test_markers[i][j], 04282 tree, 0))) 04283 { 04284 print_error_msg_exit("haplo-test", err->msg); 04285 } 04286 04287 find_ancestors(&(ancestor_types[i][j]), 04288 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04289 test_labels[i][j]); 04290 } 04291 } 04292 04293 free_nb_gauss_model_tree(tree); 04294 04295 write_cross_validation_summary(ancestor_types, ancestor_labels, 04296 *labels_out, *confs_out, NULL, nb_gauss_summary_fname); 04297 04298 write_cross_validation_details(ancestor_types, ancestor_labels, 04299 *labels_out, *confs_out, NULL, nb_gauss_details_cnt_fname, 04300 nb_gauss_details_pct_fname); 04301 04302 write_cross_validation_confusion(test_labels, *labels_out, *confs_out, 04303 NULL, nb_gauss_confusion_cnt_fname, nb_gauss_confusion_pct_fname); 04304 04305 write_cross_validation_preds("nb-gauss", test_ids, test_labels, 04306 ancestor_types, ancestor_labels, *labels_out, *confs_out, 
04307 NULL, nb_gauss_preds_fname); 04308 } 04309 04311 static void cross_validate_nb_gmm 04312 ( 04313 Vector_u32**** labels_out, 04314 Vector_d**** confs_out, 04315 Matblock_u8*** train_ids, 04316 Vector_u32*** train_labels, 04317 Matrix_i32*** train_markers, 04318 Matblock_u8*** test_ids, 04319 Vector_u32*** test_labels, 04320 Matrix_i32*** test_markers, 04321 Vector_u32*** ancestor_types, 04322 Vector_u32*** ancestor_labels 04323 ) 04324 { 04325 uint32_t i, j; 04326 04327 NB_gmm_model_tree* tree = NULL; 04328 Error* err; 04329 04330 if (!opts.nb_gmm_fname) 04331 return; 04332 04333 for (i = 0; i < num_cv_iters; i++) 04334 { 04335 for (j = 0; j < num_cv_folds; j++) 04336 { 04337 if ((err = train_nb_gmm_model_tree(&tree, train_labels[i][j], 04338 train_markers[i][j], opts.nb_gmm_fname, 04339 opts.nb_gmm_dtd_fname)) || 04340 (err = predict_labels_with_nb_gmm_model_tree( 04341 &((*labels_out)[i][j]), 04342 &((*confs_out)[i][j]), test_markers[i][j], 04343 tree, 0))) 04344 { 04345 print_error_msg_exit("haplo-test", err->msg); 04346 } 04347 04348 find_ancestors(&(ancestor_types[i][j]), 04349 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04350 test_labels[i][j]); 04351 } 04352 } 04353 04354 free_nb_gmm_model_tree(tree); 04355 04356 write_cross_validation_summary(ancestor_types, ancestor_labels, 04357 *labels_out, *confs_out, NULL, nb_gmm_summary_fname); 04358 04359 write_cross_validation_details(ancestor_types, ancestor_labels, 04360 *labels_out, *confs_out, NULL, nb_gmm_details_cnt_fname, 04361 nb_gmm_details_pct_fname); 04362 04363 write_cross_validation_confusion(test_labels, *labels_out, *confs_out, 04364 NULL, nb_gmm_confusion_cnt_fname, nb_gmm_confusion_pct_fname); 04365 04366 write_cross_validation_preds("nb-gmm", test_ids, test_labels, 04367 ancestor_types, ancestor_labels, *labels_out, *confs_out, 04368 NULL, nb_gmm_preds_fname); 04369 } 04370 04372 static void cross_validate_mv_gmm 04373 ( 04374 Vector_u32**** labels_out, 04375 Vector_d**** confs_out, 04376 
Matblock_u8*** train_ids, 04377 Vector_u32*** train_labels, 04378 Matrix_i32*** train_markers, 04379 Matblock_u8*** test_ids, 04380 Vector_u32*** test_labels, 04381 Matrix_i32*** test_markers, 04382 Vector_u32*** ancestor_types, 04383 Vector_u32*** ancestor_labels 04384 ) 04385 { 04386 uint32_t i, j; 04387 04388 MV_gmm_model_tree* tree = NULL; 04389 Error* err; 04390 04391 if (!opts.mv_gmm_fname) 04392 return; 04393 04394 for (i = 0; i < num_cv_iters; i++) 04395 { 04396 for (j = 0; j < num_cv_folds; j++) 04397 { 04398 if ((err = train_mv_gmm_model_tree(&tree, train_labels[i][j], 04399 train_markers[i][j], opts.mv_gmm_fname, 04400 opts.mv_gmm_dtd_fname)) || 04401 (err = predict_labels_with_mv_gmm_model_tree( 04402 &((*labels_out)[i][j]), 04403 &((*confs_out)[i][j]), test_markers[i][j], 04404 tree, 0))) 04405 { 04406 print_error_msg_exit("haplo-test", err->msg); 04407 } 04408 04409 find_ancestors(&(ancestor_types[i][j]), 04410 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04411 test_labels[i][j]); 04412 } 04413 } 04414 04415 free_mv_gmm_model_tree(tree); 04416 04417 write_cross_validation_summary(ancestor_types, ancestor_labels, 04418 *labels_out, *confs_out, NULL, mv_gmm_summary_fname); 04419 04420 write_cross_validation_details(ancestor_types, ancestor_labels, 04421 *labels_out, *confs_out, NULL, mv_gmm_details_cnt_fname, 04422 mv_gmm_details_pct_fname); 04423 04424 write_cross_validation_confusion(test_labels, *labels_out, *confs_out, 04425 NULL, mv_gmm_confusion_cnt_fname, mv_gmm_confusion_pct_fname); 04426 04427 write_cross_validation_preds("mv-gmm", test_ids, test_labels, 04428 ancestor_types, ancestor_labels, *labels_out, *confs_out, 04429 NULL, mv_gmm_preds_fname); 04430 } 04431 04433 static void cross_validate_svm 04434 ( 04435 Vector_u32**** labels_out, 04436 Vector_d**** confs_out, 04437 Matblock_u8*** train_ids, 04438 Vector_u32*** train_labels, 04439 Matrix_i32*** train_markers, 04440 Matblock_u8*** test_ids, 04441 Vector_u32*** test_labels, 04442 
Matrix_i32*** test_markers, 04443 Vector_u32*** ancestor_types, 04444 Vector_u32*** ancestor_labels 04445 ) 04446 { 04447 #ifdef HAPLO_ENABLE_SVM 04448 uint32_t i, j; 04449 04450 SVM_model_tree* tree = NULL; 04451 Error* err; 04452 04453 if (!opts.svm_fname) 04454 return; 04455 04456 for (i = 0; i < num_cv_iters; i++) 04457 { 04458 for (j = 0; j < num_cv_folds; j++) 04459 { 04460 if ((err = train_svm_model_tree(&tree, train_labels[i][j], 04461 train_markers[i][j], opts.svm_fname, 04462 opts.svm_dtd_fname)) || 04463 (err = predict_labels_with_svm_model_tree( 04464 &((*labels_out)[i][j]), 04465 &((*confs_out)[i][j]), test_markers[i][j], tree))) 04466 { 04467 print_error_msg_exit("haplo-test", err->msg); 04468 } 04469 04470 find_ancestors(&(ancestor_types[i][j]), 04471 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04472 test_labels[i][j]); 04473 } 04474 } 04475 04476 free_svm_model_tree(tree); 04477 04478 write_cross_validation_summary(ancestor_types, ancestor_labels, 04479 *labels_out, *confs_out, NULL, svm_summary_fname); 04480 04481 write_cross_validation_details(ancestor_types, ancestor_labels, 04482 *labels_out, *confs_out, NULL, svm_details_cnt_fname, 04483 svm_details_pct_fname); 04484 04485 write_cross_validation_confusion(test_labels, *labels_out, *confs_out, 04486 NULL, svm_confusion_cnt_fname, svm_confusion_pct_fname); 04487 04488 write_cross_validation_preds("svm", test_ids, test_labels, 04489 ancestor_types, ancestor_labels, *labels_out, *confs_out, 04490 NULL, svm_preds_fname); 04491 #else 04492 return; 04493 #endif 04494 } 04495 04497 static void cross_validate_j48 04498 ( 04499 Vector_u32**** labels_out, 04500 Vector_d**** confs_out, 04501 Matblock_u8*** train_ids, 04502 Vector_u32*** train_labels, 04503 Matrix_i32*** train_markers, 04504 Matblock_u8*** test_ids, 04505 Vector_u32*** test_labels, 04506 Matrix_i32*** test_markers, 04507 Vector_u32*** ancestor_types, 04508 Vector_u32*** ancestor_labels 04509 ) 04510 { 04511 #ifdef HAPLO_ENABLE_WEKA 04512 
uint32_t i, j; 04513 pid_t pid; 04514 char tmp_dir[1024] = {0}; 04515 char script[4096] = {0}; 04516 04517 Weka_model_tree* tree = NULL; 04518 Error* err; 04519 04520 if (!opts.weka_j48_fname) 04521 return; 04522 04523 pid = getpid(); 04524 snprintf(tmp_dir, 1024, "%s/.haplo_test_cross_validate_j48_%u", 04525 tmp_dirname, pid); 04526 snprintf(script, 4096, "mkdir -p %s", tmp_dir); 04527 if (system(script) == 127) 04528 { 04529 print_error_msg_exit("haplo-test", "Could not create tmp files"); 04530 } 04531 04532 for (i = 0; i < num_cv_iters; i++) 04533 { 04534 for (j = 0; j < num_cv_folds; j++) 04535 { 04536 if ((err = train_weka_j48_model_tree(&tree, train_labels[i][j], 04537 train_markers[i][j], opts.weka_j48_fname, 04538 opts.weka_dtd_fname, tmp_dir, 04539 opts.weka_jar_fname)) || 04540 (err = predict_labels_with_weka_j48_model_tree( 04541 &((*labels_out)[i][j]), 04542 &((*confs_out)[i][j]), test_markers[i][j], 04543 tree, opts.weka_jar_fname))) 04544 { 04545 print_error_msg_exit("haplo-test", err->msg); 04546 } 04547 04548 find_ancestors(&(ancestor_types[i][j]), 04549 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04550 test_labels[i][j]); 04551 } 04552 } 04553 04554 snprintf(script, 4096, "rm -rf %s", tmp_dir); 04555 if (system(script) == 127) 04556 { 04557 print_error_msg_exit("haplo-test", "Could not remove tmp files"); 04558 } 04559 04560 free_weka_model_tree(tree); 04561 04562 write_cross_validation_summary(ancestor_types, ancestor_labels, 04563 *labels_out, *confs_out, NULL, weka_j48_summary_fname); 04564 04565 write_cross_validation_details(ancestor_types, ancestor_labels, 04566 *labels_out, *confs_out, NULL, weka_j48_details_cnt_fname, 04567 weka_j48_details_pct_fname); 04568 04569 write_cross_validation_confusion(test_labels, *labels_out, *confs_out, 04570 NULL, weka_j48_confusion_cnt_fname, weka_j48_confusion_pct_fname); 04571 04572 write_cross_validation_preds("j48", test_ids, test_labels, 04573 ancestor_types, ancestor_labels, *labels_out, 
*confs_out, 04574 NULL, weka_j48_preds_fname); 04575 #else 04576 return; 04577 #endif 04578 } 04579 04581 static void cross_validate_part 04582 ( 04583 Vector_u32**** labels_out, 04584 Vector_d**** confs_out, 04585 Matblock_u8*** train_ids, 04586 Vector_u32*** train_labels, 04587 Matrix_i32*** train_markers, 04588 Matblock_u8*** test_ids, 04589 Vector_u32*** test_labels, 04590 Matrix_i32*** test_markers, 04591 Vector_u32*** ancestor_types, 04592 Vector_u32*** ancestor_labels 04593 ) 04594 { 04595 #ifdef HAPLO_ENABLE_WEKA 04596 uint32_t i, j; 04597 pid_t pid; 04598 char tmp_dir[1024] = {0}; 04599 char script[4096] = {0}; 04600 04601 Weka_model_tree* tree = NULL; 04602 Error* err; 04603 04604 if (!opts.weka_part_fname) 04605 return; 04606 04607 pid = getpid(); 04608 snprintf(tmp_dir, 1024, "%s/.haplo_test_cross_validate_part_%u", 04609 tmp_dirname, pid); 04610 snprintf(script, 4096, "mkdir -p %s", tmp_dir); 04611 if (system(script) == 127) 04612 { 04613 print_error_msg_exit("haplo-test", "Could not create tmp files"); 04614 } 04615 04616 for (i = 0; i < num_cv_iters; i++) 04617 { 04618 for (j = 0; j < num_cv_folds; j++) 04619 { 04620 if ((err = train_weka_part_model_tree(&tree, train_labels[i][j], 04621 train_markers[i][j], opts.weka_part_fname, 04622 opts.weka_dtd_fname, tmp_dir, 04623 opts.weka_jar_fname)) || 04624 (err = predict_labels_with_weka_part_model_tree( 04625 &((*labels_out)[i][j]), 04626 &((*confs_out)[i][j]), test_markers[i][j], 04627 tree, opts.weka_jar_fname))) 04628 { 04629 print_error_msg_exit("haplo-test", err->msg); 04630 } 04631 04632 find_ancestors(&(ancestor_types[i][j]), 04633 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04634 test_labels[i][j]); 04635 } 04636 } 04637 04638 snprintf(script, 4096, "rm -rf %s", tmp_dir); 04639 if (system(script) == 127) 04640 { 04641 print_error_msg_exit("haplo-test", "Could not remove tmp files"); 04642 } 04643 04644 free_weka_model_tree(tree); 04645 04646 write_cross_validation_summary(ancestor_types, 
ancestor_labels, 04647 *labels_out, *confs_out, NULL, weka_part_summary_fname); 04648 04649 write_cross_validation_details(ancestor_types, ancestor_labels, 04650 *labels_out, *confs_out, NULL, weka_part_details_cnt_fname, 04651 weka_part_details_pct_fname); 04652 04653 write_cross_validation_confusion(test_labels, *labels_out, *confs_out, 04654 NULL, weka_part_confusion_cnt_fname, weka_part_confusion_pct_fname); 04655 04656 write_cross_validation_preds("part", test_ids, test_labels, 04657 ancestor_types, ancestor_labels, *labels_out, *confs_out, 04658 NULL, weka_part_preds_fname); 04659 #else 04660 return; 04661 #endif 04662 } 04663 04668 static void cross_validate_nearest 04669 ( 04670 Vector_u32**** labels_out, 04671 Vector_d**** dists_out, 04672 Matblock_u8*** train_ids, 04673 Vector_u32*** train_labels, 04674 Matrix_i32*** train_markers, 04675 Matblock_u8*** test_ids, 04676 Vector_u32*** test_labels, 04677 Matrix_i32*** test_markers, 04678 Vector_u32*** ancestor_types, 04679 Vector_u32*** ancestor_labels 04680 ) 04681 { 04682 uint32_t i, j; 04683 04684 Nearest_model* model = NULL; 04685 Error* err; 04686 04687 if (!opts.nearest_fname) 04688 return; 04689 04690 for (i = 0; i < num_cv_iters; i++) 04691 { 04692 for (j = 0; j < num_cv_folds; j++) 04693 { 04694 if ((err = train_nearest_model(&model, train_labels[i][j], 04695 train_markers[i][j], opts.nearest_fname, 04696 opts.nearest_dtd_fname)) || 04697 (err = predict_labels_with_nearest_model( 04698 &((*labels_out)[i][j]), 04699 &((*dists_out)[i][j]), test_markers[i][j], model))) 04700 { 04701 print_error_msg_exit("haplo-test", err->msg); 04702 } 04703 04704 find_ancestors(&(ancestor_types[i][j]), 04705 &(ancestor_labels[i][j]), (*labels_out)[i][j], 04706 test_labels[i][j]); 04707 } 04708 } 04709 04710 free_nearest_model(model); 04711 04712 write_cross_validation_summary(ancestor_types, ancestor_labels, *labels_out, 04713 *dists_out, NULL, nearest_summary_fname); 04714 04715 
write_cross_validation_details(ancestor_types, ancestor_labels, *labels_out, 04716 *dists_out, NULL, nearest_details_cnt_fname, 04717 nearest_details_pct_fname); 04718 04719 write_cross_validation_confusion(test_labels, *labels_out, *dists_out, 04720 NULL, nearest_confusion_cnt_fname, nearest_confusion_pct_fname); 04721 04722 write_cross_validation_preds("nearest", test_ids, test_labels, 04723 ancestor_types, ancestor_labels, *labels_out, *dists_out, 04724 NULL, nearest_preds_fname); 04725 } 04726 04731 static void cross_validate_tandem_agree 04732 ( 04733 Vector_u32**** types_out, 04734 Vector_u32**** labels_out, 04735 Vector_u32*** nb_freq_labels, 04736 Vector_u32*** nb_gauss_labels, 04737 Vector_u32*** nb_gmm_labels, 04738 Vector_u32*** mv_gmm_labels, 04739 Vector_u32*** svm_labels, 04740 Vector_u32*** j48_labels, 04741 Vector_u32*** part_labels, 04742 Vector_u32*** nearest_labels, 04743 Matblock_u8*** test_ids, 04744 Vector_u32*** test_labels, 04745 Vector_u32*** ancestor_types, 04746 Vector_u32*** ancestor_labels 04747 ) 04748 { 04749 uint32_t i, j; 04750 const Vector_u32* nb_freq; 04751 const Vector_u32* nb_gauss; 04752 const Vector_u32* nb_gmm; 04753 const Vector_u32* mv_gmm; 04754 const Vector_u32* svm; 04755 const Vector_u32* j48; 04756 const Vector_u32* part; 04757 const Vector_u32* nearest; 04758 04759 for (i = 0; i < num_cv_iters; i++) 04760 { 04761 for (j = 0; j < num_cv_folds; j++) 04762 { 04763 nb_freq = (opts.nb_freq_fname) ? nb_freq_labels[i][j] : NULL; 04764 nb_gauss = (opts.nb_gauss_fname) ? nb_gauss_labels[i][j] : NULL; 04765 nb_gmm = (opts.nb_gmm_fname) ? nb_gmm_labels[i][j] : NULL; 04766 mv_gmm = (opts.mv_gmm_fname) ? mv_gmm_labels[i][j] : NULL; 04767 #ifdef HAPLO_ENABLE_SVM 04768 svm = (opts.svm_fname) ? svm_labels[i][j] : NULL; 04769 #else 04770 svm = NULL; 04771 #endif 04772 #ifdef HAPLO_ENABLE_WEKA 04773 j48 = (opts.weka_j48_fname) ? j48_labels[i][j] : NULL; 04774 part = (opts.weka_part_fname) ? 
part_labels[i][j] : NULL; 04775 #else 04776 j48 = NULL; 04777 part = NULL; 04778 #endif 04779 nearest = (opts.nearest_fname) ? nearest_labels[i][j] : NULL; 04780 04781 find_ancestors_of_sets(&(ancestor_types[i][j]), 04782 &(ancestor_labels[i][j]), nb_freq, nb_gauss, nb_gmm, 04783 mv_gmm, svm, j48, part, nearest); 04784 04785 copy_vector_u32(&((*types_out)[i][j]), ancestor_types[i][j]); 04786 copy_vector_u32(&((*labels_out)[i][j]), ancestor_labels[i][j]); 04787 } 04788 } 04789 04790 write_cross_validation_summary(ancestor_types, ancestor_labels, 04791 *labels_out, NULL, NULL, tandem_agree_summary_fname); 04792 04793 write_cross_validation_details(ancestor_types, ancestor_labels, 04794 *labels_out, NULL, *types_out, tandem_agree_details_cnt_fname, 04795 tandem_agree_details_pct_fname); 04796 } 04797 04802 static void cross_validate_tandem 04803 ( 04804 Vector_u32*** tandem_types, 04805 Vector_u32*** tandem_labels, 04806 Matblock_u8*** test_ids, 04807 Vector_u32*** test_labels, 04808 Vector_u32*** ancestor_types, 04809 Vector_u32*** ancestor_labels 04810 ) 04811 { 04812 uint32_t i, j; 04813 04814 for (i = 0; i < num_cv_iters; i++) 04815 { 04816 for (j = 0; j < num_cv_folds; j++) 04817 { 04818 find_tandem_ancestors(&(ancestor_types[i][j]), 04819 &(ancestor_labels[i][j]), tandem_types[i][j], 04820 tandem_labels[i][j], test_labels[i][j]); 04821 } 04822 } 04823 04824 write_cross_validation_summary(ancestor_types, ancestor_labels, 04825 tandem_labels, NULL, tandem_types, tandem_summary_fname); 04826 04827 write_cross_validation_details(ancestor_types, ancestor_labels, 04828 tandem_labels, NULL, tandem_types, tandem_details_cnt_fname, 04829 tandem_details_pct_fname); 04830 04831 write_cross_validation_confusion(test_labels, tandem_labels, NULL, 04832 tandem_types, tandem_confusion_cnt_fname, 04833 tandem_confusion_pct_fname); 04834 04835 write_cross_validation_preds("tandem", test_ids, test_labels, 04836 ancestor_types, ancestor_labels, tandem_labels, NULL, 04837 
tandem_types, tandem_preds_fname); 04838 } 04839 04841 static void leave_one_out 04842 ( 04843 const Matblock_u8* ids, 04844 const Vector_u32* labels, 04845 const Matrix_i32* markers 04846 ) 04847 { 04848 Vector_u32* nb_freq_labels = NULL; 04849 Vector_d* nb_freq_confs = NULL; 04850 Vector_u32* nb_gauss_labels = NULL; 04851 Vector_d* nb_gauss_confs = NULL; 04852 Vector_u32* nb_gmm_labels = NULL; 04853 Vector_d* nb_gmm_confs = NULL; 04854 Vector_u32* mv_gmm_labels = NULL; 04855 Vector_d* mv_gmm_confs = NULL; 04856 Vector_u32* svm_labels = NULL; 04857 Vector_d* svm_confs = NULL; 04858 Vector_u32* j48_labels = NULL; 04859 Vector_d* j48_confs = NULL; 04860 Vector_u32* part_labels = NULL; 04861 Vector_d* part_confs = NULL; 04862 Vector_u32* nearest_labels = NULL; 04863 Vector_d* nearest_dists = NULL; 04864 Vector_u32* tandem_types = NULL; 04865 Vector_u32* tandem_labels = NULL; 04866 Vector_u32* ancestor_types = NULL; 04867 Vector_u32* ancestor_labels = NULL; 04868 04869 leave_one_out_nb_freq(&nb_freq_labels, &nb_freq_confs, &ancestor_types, 04870 &ancestor_labels, ids, labels, markers); 04871 04872 leave_one_out_nb_gauss(&nb_gauss_labels, &nb_gauss_confs, &ancestor_types, 04873 &ancestor_labels, ids, labels, markers); 04874 04875 leave_one_out_nb_gmm(&nb_gmm_labels, &nb_gmm_confs, &ancestor_types, 04876 &ancestor_labels, ids, labels, markers); 04877 04878 leave_one_out_mv_gmm(&mv_gmm_labels, &mv_gmm_confs, &ancestor_types, 04879 &ancestor_labels, ids, labels, markers); 04880 04881 leave_one_out_svm(&svm_labels, &svm_confs, &ancestor_types, 04882 &ancestor_labels, ids, labels, markers); 04883 04884 leave_one_out_j48(&j48_labels, &j48_confs, &ancestor_types, 04885 &ancestor_labels, ids, labels, markers); 04886 04887 leave_one_out_part(&part_labels, &part_confs, &ancestor_types, 04888 &ancestor_labels, ids, labels, markers); 04889 04890 leave_one_out_nearest(&nearest_labels, &nearest_dists, &ancestor_types, 04891 &ancestor_labels, ids, labels, markers); 04892 04893 if 
(tandem) 04894 { 04895 leave_one_out_tandem_agree(&tandem_types, &tandem_labels, 04896 &ancestor_types, &ancestor_labels, nb_freq_labels, 04897 nb_gauss_labels, nb_gmm_labels, mv_gmm_labels, svm_labels, 04898 j48_labels, part_labels, nearest_labels, ids, labels); 04899 04900 leave_one_out_tandem(tandem_types, tandem_labels, ids, labels, 04901 ancestor_types, ancestor_labels); 04902 } 04903 04904 free_vector_u32(nb_freq_labels); 04905 free_vector_d(nb_freq_confs); 04906 free_vector_u32(nb_gauss_labels); 04907 free_vector_d(nb_gauss_confs); 04908 free_vector_u32(nb_gmm_labels); 04909 free_vector_d(nb_gmm_confs); 04910 free_vector_u32(mv_gmm_labels); 04911 free_vector_d(mv_gmm_confs); 04912 free_vector_u32(svm_labels); 04913 free_vector_d(svm_confs); 04914 free_vector_u32(j48_labels); 04915 free_vector_d(j48_confs); 04916 free_vector_u32(part_labels); 04917 free_vector_d(part_confs); 04918 free_vector_u32(nearest_labels); 04919 free_vector_d(nearest_dists); 04920 free_vector_u32(tandem_labels); 04921 free_vector_u32(ancestor_types); 04922 free_vector_u32(ancestor_labels); 04923 } 04924 04929 static void allocate_cross_validation_results 04930 ( 04931 Vector_u32**** nb_freq_labels_out, 04932 Vector_d**** nb_freq_confs_out, 04933 Vector_u32**** nb_gauss_labels_out, 04934 Vector_d**** nb_gauss_confs_out, 04935 Vector_u32**** nb_gmm_labels_out, 04936 Vector_d**** nb_gmm_confs_out, 04937 Vector_u32**** mv_gmm_labels_out, 04938 Vector_d**** mv_gmm_confs_out, 04939 Vector_u32**** svm_labels_out, 04940 Vector_d**** svm_confs_out, 04941 Vector_u32**** j48_labels_out, 04942 Vector_d**** j48_confs_out, 04943 Vector_u32**** part_labels_out, 04944 Vector_d**** part_confs_out, 04945 Vector_u32**** nearest_labels_out, 04946 Vector_d**** nearest_dists_out, 04947 Vector_u32**** tandem_types_out, 04948 Vector_u32**** tandem_labels_out, 04949 Vector_u32**** ancestor_types_out, 04950 Vector_u32**** ancestor_labels_out, 04951 uint32_t num_samples 04952 ) 04953 { 04954 uint32_t i, j; 04955 
04956 assert(*nb_freq_labels_out = malloc(num_cv_iters*sizeof(void**))); 04957 assert(*nb_freq_confs_out = malloc(num_cv_iters*sizeof(void**))); 04958 assert(*nb_gauss_labels_out = malloc(num_cv_iters*sizeof(void**))); 04959 assert(*nb_gauss_confs_out = malloc(num_cv_iters*sizeof(void**))); 04960 assert(*nb_gmm_labels_out = malloc(num_cv_iters*sizeof(void**))); 04961 assert(*nb_gmm_confs_out = malloc(num_cv_iters*sizeof(void**))); 04962 assert(*mv_gmm_labels_out = malloc(num_cv_iters*sizeof(void**))); 04963 assert(*mv_gmm_confs_out = malloc(num_cv_iters*sizeof(void**))); 04964 assert(*svm_labels_out = malloc(num_cv_iters*sizeof(void**))); 04965 assert(*svm_confs_out = malloc(num_cv_iters*sizeof(void**))); 04966 assert(*j48_labels_out = malloc(num_cv_iters*sizeof(void**))); 04967 assert(*j48_confs_out = malloc(num_cv_iters*sizeof(void**))); 04968 assert(*part_labels_out = malloc(num_cv_iters*sizeof(void**))); 04969 assert(*part_confs_out = malloc(num_cv_iters*sizeof(void**))); 04970 assert(*nearest_labels_out = malloc(num_cv_iters*sizeof(void**))); 04971 assert(*nearest_dists_out = malloc(num_cv_iters*sizeof(void**))); 04972 assert(*tandem_types_out = malloc(num_cv_iters*sizeof(void**))); 04973 assert(*tandem_labels_out = malloc(num_cv_iters*sizeof(void**))); 04974 assert(*ancestor_types_out = malloc(num_cv_iters*sizeof(void**))); 04975 assert(*ancestor_labels_out = malloc(num_cv_iters*sizeof(void**))); 04976 04977 for (i = 0; i < num_cv_iters; i++) 04978 { 04979 assert((*nb_freq_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04980 assert((*nb_freq_confs_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04981 assert((*nb_gauss_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04982 assert((*nb_gauss_confs_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04983 assert((*nb_gmm_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04984 assert((*nb_gmm_confs_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04985 assert((*mv_gmm_labels_out)[i] = 
calloc(num_cv_folds,sizeof(void*))); 04986 assert((*mv_gmm_confs_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04987 assert((*svm_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04988 assert((*svm_confs_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04989 assert((*j48_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04990 assert((*j48_confs_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04991 assert((*part_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04992 assert((*part_confs_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04993 assert((*nearest_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04994 assert((*nearest_dists_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04995 assert((*tandem_types_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04996 assert((*tandem_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04997 assert((*ancestor_types_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04998 assert((*ancestor_labels_out)[i] = calloc(num_cv_folds,sizeof(void*))); 04999 05000 for (j = 0; j < num_cv_folds; j++) 05001 { 05002 create_zero_vector_u32(&((*nb_freq_labels_out)[i][j]), num_samples); 05003 create_zero_vector_d(&((*nb_freq_confs_out)[i][j]), num_samples); 05004 create_zero_vector_u32(&((*nb_gauss_labels_out)[i][j]), num_samples); 05005 create_zero_vector_d(&((*nb_gauss_confs_out)[i][j]), num_samples); 05006 create_zero_vector_u32(&((*nb_gmm_labels_out)[i][j]), num_samples); 05007 create_zero_vector_d(&((*nb_gmm_confs_out)[i][j]), num_samples); 05008 create_zero_vector_u32(&((*mv_gmm_labels_out)[i][j]), num_samples); 05009 create_zero_vector_d(&((*mv_gmm_confs_out)[i][j]), num_samples); 05010 create_zero_vector_u32(&((*svm_labels_out)[i][j]), num_samples); 05011 create_zero_vector_d(&((*svm_confs_out)[i][j]), num_samples); 05012 create_zero_vector_u32(&((*j48_labels_out)[i][j]), num_samples); 05013 create_zero_vector_d(&((*j48_confs_out)[i][j]), num_samples); 05014 create_zero_vector_u32(&((*part_labels_out)[i][j]), num_samples); 
05015 create_zero_vector_d(&((*part_confs_out)[i][j]), num_samples); 05016 create_zero_vector_u32(&((*nearest_labels_out)[i][j]), num_samples); 05017 create_zero_vector_d(&((*nearest_dists_out)[i][j]), num_samples); 05018 create_zero_vector_u32(&((*tandem_types_out)[i][j]), num_samples); 05019 create_zero_vector_u32(&((*tandem_labels_out)[i][j]), num_samples); 05020 create_zero_vector_u32(&((*ancestor_types_out)[i][j]), num_samples); 05021 create_zero_vector_u32(&((*ancestor_labels_out)[i][j]),num_samples); 05022 } 05023 } 05024 } 05025 05026 static void free_cross_validation_results 05027 ( 05028 Vector_u32*** nb_freq_labels, 05029 Vector_d*** nb_freq_confs, 05030 Vector_u32*** nb_gauss_labels, 05031 Vector_d*** nb_gauss_confs, 05032 Vector_u32*** nb_gmm_labels, 05033 Vector_d*** nb_gmm_confs, 05034 Vector_u32*** mv_gmm_labels, 05035 Vector_d*** mv_gmm_confs, 05036 Vector_u32*** svm_labels, 05037 Vector_d*** svm_confs, 05038 Vector_u32*** j48_labels, 05039 Vector_d*** j48_confs, 05040 Vector_u32*** part_labels, 05041 Vector_d*** part_confs, 05042 Vector_u32*** nearest_labels, 05043 Vector_d*** nearest_dists, 05044 Vector_u32*** tandem_types, 05045 Vector_u32*** tandem_labels, 05046 Vector_u32*** ancestor_types, 05047 Vector_u32*** ancestor_labels 05048 ) 05049 { 05050 uint32_t i, j; 05051 05052 for (i = 0; i < num_cv_iters; i++) 05053 { 05054 for (j =0; j < num_cv_folds; j++) 05055 { 05056 free_vector_u32(nb_freq_labels[ i ][ j ]); 05057 free_vector_d(nb_freq_confs[ i ][ j ]); 05058 free_vector_u32(nb_gauss_labels[ i ][ j ]); 05059 free_vector_d(nb_gauss_confs[ i ][ j ]); 05060 free_vector_u32(nb_gmm_labels[ i ][ j ]); 05061 free_vector_d(nb_gmm_confs[ i ][ j ]); 05062 free_vector_u32(mv_gmm_labels[ i ][ j ]); 05063 free_vector_d(mv_gmm_confs[ i ][ j ]); 05064 free_vector_u32(svm_labels[ i ][ j ]); 05065 free_vector_d(svm_confs[ i ][ j ]); 05066 free_vector_u32(j48_labels[ i ][ j ]); 05067 free_vector_d(j48_confs[ i ][ j ]); 05068 free_vector_u32(part_labels[ i ][ 
j ]); 05069 free_vector_d(part_confs[ i ][ j ]); 05070 free_vector_u32(nearest_labels[ i ][ j ]); 05071 free_vector_d(nearest_dists[ i ][ j ]); 05072 free_vector_u32(tandem_types[ i ][ j ]); 05073 free_vector_u32(tandem_labels[ i ][ j ]); 05074 free_vector_u32(ancestor_labels[ i ][ j ]); 05075 free_vector_u32(ancestor_types[ i ][ j ]); 05076 } 05077 05078 free(nb_freq_labels[ i ]); 05079 free(nb_freq_confs[ i ]); 05080 free(nb_gauss_labels[ i ]); 05081 free(nb_gauss_confs[ i ]); 05082 free(nb_gmm_labels[ i ]); 05083 free(nb_gmm_confs[ i ]); 05084 free(mv_gmm_labels[ i ]); 05085 free(mv_gmm_confs[ i ]); 05086 free(svm_labels[ i ]); 05087 free(svm_confs[ i ]); 05088 free(j48_labels[ i ]); 05089 free(j48_confs[ i ]); 05090 free(part_labels[ i ]); 05091 free(part_confs[ i ]); 05092 free(nearest_labels[ i ]); 05093 free(nearest_dists[ i ]); 05094 free(tandem_types[ i ]); 05095 free(tandem_labels[ i ]); 05096 free(ancestor_labels[ i ]); 05097 free(ancestor_types[ i ]); 05098 } 05099 05100 free(nb_freq_labels); 05101 free(nb_freq_confs); 05102 free(nb_gauss_labels); 05103 free(nb_gauss_confs); 05104 free(nb_gmm_labels); 05105 free(nb_gmm_confs); 05106 free(mv_gmm_labels); 05107 free(mv_gmm_confs); 05108 free(svm_labels); 05109 free(svm_confs); 05110 free(j48_labels); 05111 free(j48_confs); 05112 free(part_labels); 05113 free(part_confs); 05114 free(nearest_labels); 05115 free(nearest_dists); 05116 free(tandem_types); 05117 free(tandem_labels); 05118 free(ancestor_labels); 05119 free(ancestor_types); 05120 } 05121 05123 static void cross_validate 05124 ( 05125 const Matblock_u8* data_ids, 05126 const Vector_u32* data_labels, 05127 const Matrix_i32* data_markers 05128 ) 05129 { 05130 Matblock_u8*** train_ids = NULL; 05131 Vector_u32*** train_labels = NULL; 05132 Matrix_i32*** train_markers = NULL; 05133 Matblock_u8*** test_ids = NULL; 05134 Vector_u32*** test_labels = NULL; 05135 Matrix_i32*** test_markers = NULL; 05136 Vector_u32*** nb_freq_labels = NULL; 05137 Vector_d*** 
nb_freq_confs = NULL; 05138 Vector_u32*** nb_gauss_labels = NULL; 05139 Vector_d*** nb_gauss_confs = NULL; 05140 Vector_u32*** nb_gmm_labels = NULL; 05141 Vector_d*** nb_gmm_confs = NULL; 05142 Vector_u32*** mv_gmm_labels = NULL; 05143 Vector_d*** mv_gmm_confs = NULL; 05144 Vector_u32*** svm_labels = NULL; 05145 Vector_d*** svm_confs = NULL; 05146 Vector_u32*** j48_labels = NULL; 05147 Vector_d*** j48_confs = NULL; 05148 Vector_u32*** part_labels = NULL; 05149 Vector_d*** part_confs = NULL; 05150 Vector_u32*** nearest_labels = NULL; 05151 Vector_d*** nearest_dists = NULL; 05152 Vector_u32*** tandem_types = NULL; 05153 Vector_u32*** tandem_labels = NULL; 05154 Vector_u32*** ancestor_types = NULL; 05155 Vector_u32*** ancestor_labels = NULL; 05156 05157 create_cross_validation_train_and_test_data(&train_ids, &train_labels, 05158 &train_markers, &test_ids, &test_labels, &test_markers, data_ids, 05159 data_labels, data_markers); 05160 05161 allocate_cross_validation_results(&nb_freq_labels, &nb_freq_confs, 05162 &nb_gauss_labels, &nb_gauss_confs, &nb_gmm_labels, &nb_gmm_confs, 05163 &mv_gmm_labels, &mv_gmm_confs, &svm_labels, &svm_confs, &j48_labels, 05164 &j48_confs, &part_labels, &part_confs, &nearest_labels, 05165 &nearest_dists, &tandem_types, &tandem_labels, &ancestor_types, 05166 &ancestor_labels, data_labels->num_elts); 05167 05168 cross_validate_nb_freq(&nb_freq_labels, &nb_freq_confs, train_ids, 05169 train_labels, train_markers, test_ids, test_labels, test_markers, 05170 ancestor_types, ancestor_labels); 05171 05172 cross_validate_nb_gauss(&nb_gauss_labels, &nb_gauss_confs, train_ids, 05173 train_labels, train_markers, test_ids, test_labels, test_markers, 05174 ancestor_types, ancestor_labels); 05175 05176 cross_validate_nb_gmm(&nb_gmm_labels, &nb_gmm_confs, train_ids, 05177 train_labels, train_markers, test_ids, test_labels, test_markers, 05178 ancestor_types, ancestor_labels); 05179 05180 cross_validate_mv_gmm(&mv_gmm_labels, &mv_gmm_confs, train_ids, 05181 
train_labels, train_markers, test_ids, test_labels, test_markers, 05182 ancestor_types, ancestor_labels); 05183 05184 cross_validate_svm(&svm_labels, &svm_confs, train_ids, train_labels, 05185 train_markers, test_ids, test_labels, test_markers, 05186 ancestor_types, ancestor_labels); 05187 05188 cross_validate_j48(&j48_labels, &j48_confs, train_ids, train_labels, 05189 train_markers, test_ids, test_labels, test_markers, 05190 ancestor_types, ancestor_labels); 05191 05192 cross_validate_part(&part_labels, &part_confs, train_ids, train_labels, 05193 train_markers, test_ids, test_labels, test_markers, 05194 ancestor_types, ancestor_labels); 05195 05196 cross_validate_nearest(&nearest_labels, &nearest_dists, train_ids, 05197 train_labels, train_markers, test_ids, test_labels, test_markers, 05198 ancestor_types, ancestor_labels); 05199 05200 if (tandem) 05201 { 05202 cross_validate_tandem_agree(&tandem_types, &tandem_labels, 05203 nb_freq_labels, nb_gauss_labels, nb_gmm_labels, mv_gmm_labels, 05204 svm_labels, j48_labels, part_labels, nearest_labels, test_ids, 05205 test_labels, ancestor_types, ancestor_labels); 05206 05207 cross_validate_tandem(tandem_types, tandem_labels, test_ids, 05208 test_labels, ancestor_types, ancestor_labels); 05209 } 05210 05211 free_cross_validation_results(nb_freq_labels, nb_freq_confs, 05212 nb_gauss_labels, nb_gauss_confs, nb_gmm_labels, nb_gmm_confs, 05213 mv_gmm_labels, mv_gmm_confs, svm_labels, svm_confs, j48_labels, 05214 j48_confs, part_labels, part_confs, nearest_labels, nearest_dists, 05215 tandem_types, tandem_labels, ancestor_types, ancestor_labels); 05216 05217 free_cross_validation_train_and_test_data(train_ids, train_labels, 05218 train_markers, test_ids, test_labels, test_markers); 05219 } 05220 05221 05223 int main(int argc, const char** argv) 05224 { 05225 int argi; 05226 const char* data_fname = "/dev/stdin"; 05227 Error* err; 05228 05229 Matblock_u8* ids = NULL; 05230 Vector_u32* labels = NULL; 05231 Matrix_i32* markers = 
NULL; 05232 05233 init_test_options(); 05234 05235 if ((err = process_options(argc, argv, &argi, NUM_OPTS_NO_ARG, opts_no_arg, 05236 NUM_OPTS_WITH_ARG, opts_with_arg)) != NULL) 05237 { 05238 print_error_msg_exit("haplo-test", err->msg); 05239 } 05240 05241 if ((argc - argi) == 1) 05242 { 05243 data_fname = argv[ argi ]; 05244 } 05245 05246 if (num_models_to_test() == 0) 05247 { 05248 print_error_msg_exit("haplo-test", "No models to test"); 05249 } 05250 05251 if ((err = read_haplo_groups(opts.labels_fname))) 05252 { 05253 print_error_msg_exit("haplo-test", err->msg); 05254 } 05255 05256 if ((err = read_input(&ids, &labels, &markers, data_fname))) 05257 { 05258 print_error_msg_exit("haplo-test", err->msg); 05259 } 05260 05261 if (!labels) 05262 { 05263 print_error_msg("haplo-test", NULL); 05264 print_error_msg_exit(data_fname, "No labels to test with"); 05265 } 05266 05267 switch (test_type) 05268 { 05269 case HAPLO_TEST_LEAVE_ONE_OUT: 05270 leave_one_out(ids, labels, markers); 05271 break; 05272 case HAPLO_TEST_CROSS_VALIDATE: 05273 cross_validate(ids, labels, markers); 05274 break; 05275 } 05276 05277 free_matblock_u8(ids); 05278 free_vector_u32(labels); 05279 free_matrix_i32(markers); 05280 05281 if (get_num_unhandled_errors() > 0) 05282 { 05283 print_error_msg_exit("haplo-test", "Unhandled errors"); 05284 } 05285 05286 return EXIT_SUCCESS; 05287 }