Haplo Prediction
predict haplogroups
|
00001 /* 00002 * This work is licensed under a Creative Commons 00003 * Attribution-Noncommercial-Share Alike 3.0 United States License. 00004 * 00005 * http://creativecommons.org/licenses/by-nc-sa/3.0/us/ 00006 * 00007 * You are free: 00008 * 00009 * to Share - to copy, distribute, display, and perform the work 00010 * to Remix - to make derivative works 00011 * 00012 * Under the following conditions: 00013 * 00014 * Attribution. You must attribute the work in the manner specified by the 00015 * author or licensor (but not in any way that suggests that they endorse you 00016 * or your use of the work). 00017 * 00018 * Noncommercial. You may not use this work for commercial purposes. 00019 * 00020 * Share Alike. If you alter, transform, or build upon this work, you may 00021 * distribute the resulting work only under the same or similar license to 00022 * this one. 00023 * 00024 * For any reuse or distribution, you must make clear to others the license 00025 * terms of this work. The best way to do this is by including this header. 00026 * 00027 * Any of the above conditions can be waived if you get permission from the 00028 * copyright holder. 00029 * 00030 * Apart from the remix rights granted under this license, nothing in this 00031 * license impairs or restricts the author's moral rights. 00032 */ 00033 00034 00046 #include <config.h> 00047 00048 #include <stdlib.h> 00049 #include <stdio.h> 00050 #include <string.h> 00051 #include <ctype.h> 00052 #include <inttypes.h> 00053 #include <assert.h> 00054 #include <errno.h> 00055 00056 #include <libxml/tree.h> 00057 #include <libxml/parser.h> 00058 #include <libxml/valid.h> 00059 00060 #ifdef HAPLO_HAVE_DMALLOC 00061 #include <dmalloc.h> 00062 #endif 00063 00064 #include <jwsc/base/error.h> 00065 #include <jwsc/vector/vector.h> 00066 #include <jwsc/matrix/matrix.h> 00067 #include <jwsc/matblock/matblock.h> 00068 00069 #include <xml.h> 00070 #include <haplo_groups.h> 00071 #include <options.h> 00072 #include <input.h> 00073 00074 00075 #define MAX_COL_SIZE 256 00076 00077 00083 static uint32_t read_block_into_buffer(char* buf, uint32_t size, FILE* fp) 00084 { 00085 uint32_t n; 00086 char c; 00087 00088 assert(size > 0); 00089 n = 0; 00090 00091 while (n < size && (c = fgetc(fp)) != EOF) 00092 { 00093 *buf = c; 00094 buf++; 00095 n++; 00096 } 00097 00098 return n; 00099 } 00100 00101 00103 static Error* read_file_into_buffer 00104 ( 00105 char** buf_out, 00106 uint32_t* buf_len_out, 00107 const char* fname 00108 ) 00109 { 00110 FILE* fp; 00111 char* buf; 00112 char* buf_p; 00113 uint32_t buf_len; 00114 uint32_t block_size; 00115 uint32_t n; 00116 int slen = 256; 00117 char str[slen]; 00118 00119 assert(*buf_out == NULL); 00120 00121 if ((fp = fopen(fname, "r")) == NULL) 00122 { 00123 snprintf(str, slen, "%s: %s", fname, strerror(errno)); 00124 return JWSC_EIO(str); 00125 } 00126 00127 block_size = 1024; 00128 buf_len = block_size; 00129 assert(buf = malloc(buf_len*sizeof(char))); 00130 buf_p = buf; 00131 00132 while ((n = read_block_into_buffer(buf_p, block_size, fp)) == block_size) 00133 { 00134 buf_len += block_size; 00135 assert(buf = realloc(buf, buf_len*sizeof(char))); 00136 buf_p = buf + (buf_len - block_size); 00137 } 00138 buf_len -= block_size - n; 00139 00140 if (buf_len == 0) 00141 { 00142 snprintf(str, slen, "%s: %s", fname, "No data to read"); 00143 return JWSC_EIO(str); 00144 } 00145 00146 assert(buf = realloc(buf, buf_len*sizeof(char))); 00147 00148 if (fclose(fp) != 0) 00149 { 00150 snprintf(str, slen, "%s: %s", fname, strerror(errno)); 00151 return JWSC_EIO(str); 00152 } 00153 00154 *buf_out = buf; 00155 *buf_len_out = buf_len; 00156 00157 return NULL; 00158 } 00159 00160 00174 static uint8_t skip_to_csv_column 00175 ( 00176 const char** buf_in_out, 00177 uint32_t buf_len, 00178 uint32_t col 00179 ) 00180 { 00181 const char* buf_in; 00182 const char* buf_p; 00183 uint32_t cur_col; 00184 00185 assert(buf_len > 0); 00186 00187 buf_in = *buf_in_out; 00188 buf_p = *buf_in_out; 00189 cur_col = 1; 00190 00191 while ((cur_col != col) && ((buf_p - buf_in) < buf_len-1) && 00192 (*buf_p != '\n')) 00193 { 00194 if (*buf_p == ',') 00195 { 00196 cur_col++; 00197 } 00198 buf_p++; 00199 } 00200 00201 *buf_in_out = buf_p; 00202 00203 if (cur_col != col) 00204 { 00205 return 0; 00206 } 00207 00208 return 1; 00209 } 00210 00211 00223 static const char* read_csv_column 00224 ( 00225 const char* src, 00226 uint32_t src_len, 00227 char* dst, 00228 uint32_t dst_len 00229 ) 00230 { 00231 const char* src_p; 00232 char* dst_p; 00233 00234 assert(src_len > 0 && dst_len > 0); 00235 00236 src_p = src; 00237 dst_p = dst; 00238 00239 while (((src_p - src) < src_len-1) && (*src_p != ',')) 00240 { 00241 if ((dst_p - dst) == dst_len-1) 00242 { 00243 break; 00244 } 00245 00246 *dst_p = *src_p; 00247 dst_p++; 00248 src_p++; 00249 } 00250 00251 *dst_p = '\0'; 00252 00253 if (*src_p == ',') 00254 { 00255 src_p++; 00256 } 00257 00258 return src_p; 00259 } 00260 00261 00275 static uint8_t skip_to_txt_column 00276 ( 00277 const char** buf_in_out, 00278 uint32_t buf_len, 00279 uint32_t col 00280 ) 00281 { 00282 const char* buf_in; 00283 const char* buf_p; 00284 uint32_t cur_col; 00285 00286 assert(buf_len > 0); 00287 00288 buf_in = *buf_in_out; 00289 buf_p = *buf_in_out; 00290 cur_col = 1; 00291 00292 while ((cur_col != col) && ((buf_p - buf_in) < buf_len-1) && 00293 (*buf_p != '\n')) 00294 { 00295 if (*buf_p == ' ') 00296 { 00297 cur_col++; 00298 while (*buf_p == ' ') 00299 buf_p++; 00300 } 00301 else 00302 { 00303 buf_p++; 00304 } 00305 } 00306 00307 *buf_in_out = buf_p; 00308 00309 if (cur_col != col) 00310 { 00311 return 0; 00312 } 00313 00314 return 1; 00315 } 00316 00317 00329 static const char* read_txt_column 00330 ( 00331 const char* src, 00332 uint32_t src_len, 00333 char* dst, 00334 uint32_t dst_len 00335 ) 00336 { 00337 const char* src_p; 00338 char* dst_p; 00339 00340 assert(src_len > 0 && dst_len > 0); 00341 00342 src_p = src; 00343 dst_p = dst; 00344 00345 while (((src_p - src) < src_len-1) && (*src_p != ' ')) 00346 { 00347 if ((dst_p - dst) == dst_len-1) 00348 { 00349 break; 00350 } 00351 00352 *dst_p = *src_p; 00353 dst_p++; 00354 src_p++; 00355 } 00356 00357 *dst_p = '\0'; 00358 00359 while (*src_p == ' ') 00360 { 00361 src_p++; 00362 } 00363 00364 return src_p; 00365 } 00366 00367 00374 static uint8_t skip_line 00375 ( 00376 const char** buf_in_out, 00377 const char* buf, 00378 uint32_t buf_len 00379 ) 00380 { 00381 const char* buf_p; 00382 00383 buf_p = *buf_in_out; 00384 00385 while (((buf_p - buf) < buf_len) && (*buf_p != '\n')) 00386 { 00387 buf_p++; 00388 } 00389 buf_p++; 00390 00391 *buf_in_out = buf_p; 00392 00393 return ((buf_p - buf) < buf_len); 00394 } 00395 00396 00398 static Error* get_num_samples_from_txt_csv 00399 ( 00400 uint32_t* num_samples_out, 00401 const char* file_buf, 00402 uint32_t file_len 00403 ) 00404 { 00405 const char* file_buf_p; 00406 uint32_t num_samples; 00407 00408 file_buf_p = file_buf; 00409 00410 if (opts.header_in) 00411 { 00412 if (!skip_line(&file_buf_p, file_buf, file_len)) 00413 { 00414 return JWSC_EIO("No samples to read"); 00415 } 00416 } 00417 00418 num_samples = 1; 00419 while (skip_line(&file_buf_p, file_buf, file_len)) 00420 { 00421 num_samples++; 00422 } 00423 00424 *num_samples_out = num_samples; 00425 00426 return NULL; 00427 } 00428 00429 00441 static Error* read_ids_from_txt_csv 00442 ( 00443 Matblock_u8** ids_out, 00444 const char* file_buf, 00445 uint32_t file_len, 00446 uint32_t num_samples, 00447 uint8_t aux 00448 ) 00449 { 00450 const char* file_buf_p; 00451 const char* file_buf_p_saved; 00452 uint32_t sample; 00453 uint32_t num_ids; 00454 uint32_t id; 00455 uint32_t col; 00456 uint32_t len; 00457 int slen = 256; 00458 char s[slen]; 00459 00460 if (!aux && !opts.id_cols) 00461 return NULL; 00462 else if (aux && !opts.aux_id_cols) 00463 return NULL; 00464 00465 file_buf_p = file_buf; 00466 00467 if (opts.header_in) 00468 { 00469 skip_line(&file_buf_p, file_buf, file_len); 00470 } 00471 00472 num_ids = (aux) ? opts.aux_id_cols->num_elts : opts.id_cols->num_elts; 00473 create_zero_matblock_u8(ids_out, num_samples, num_ids, MAX_COL_SIZE); 00474 00475 for (sample = 0; sample < num_samples; sample++) 00476 { 00477 file_buf_p_saved = file_buf_p; 00478 00479 for (id = 0; id < num_ids; id++) 00480 { 00481 file_buf_p = file_buf_p_saved; 00482 00483 col = (aux) ? opts.aux_id_cols->elts[ id ] : 00484 opts.id_cols->elts[ id ]; 00485 len = file_len - (file_buf_p - file_buf); 00486 00487 switch (opts.input_format) 00488 { 00489 case HAPLO_INPUT_TXT: 00490 if (!skip_to_txt_column(&file_buf_p, len, col)) 00491 { 00492 snprintf(s, slen, "ID %d missing in sample %d", 00493 id+1, sample+1); 00494 return JWSC_EIO(s); 00495 } 00496 read_txt_column(file_buf_p, len, (char*) 00497 ((*ids_out)->elts[ sample ][ id ]), MAX_COL_SIZE); 00498 break; 00499 case HAPLO_INPUT_CSV: 00500 if (!skip_to_csv_column(&file_buf_p, len, col)) 00501 { 00502 snprintf(s, slen, "ID %d missing in sample %d", 00503 id+1, sample+1); 00504 return JWSC_EIO(s); 00505 } 00506 read_csv_column(file_buf_p, len, (char*) 00507 ((*ids_out)->elts[ sample ][ id ]), MAX_COL_SIZE); 00508 break; 00509 case HAPLO_INPUT_XML: 00510 abort(); 00511 } 00512 } 00513 00514 skip_line(&file_buf_p, file_buf, file_len); 00515 } 00516 00517 return NULL; 00518 } 00519 00520 00533 static Error* read_labels_from_txt_csv 00534 ( 00535 Vector_u32** labels_out, 00536 const char* file_buf, 00537 uint32_t file_len, 00538 uint32_t num_samples, 00539 uint8_t aux 00540 ) 00541 { 00542 const char* file_buf_p; 00543 uint32_t label_col; 00544 uint32_t sample; 00545 uint32_t len; 00546 char col_buf[ MAX_COL_SIZE ] = {0}; 00547 int slen = 256; 00548 char s[slen]; 00549 00550 if (!aux && opts.label_col == 0) 00551 return NULL; 00552 else if (aux && opts.aux_label_col == 0) 00553 return NULL; 00554 00555 label_col = (aux) ? opts.aux_label_col : opts.label_col; 00556 00557 file_buf_p = file_buf; 00558 00559 if (opts.header_in) 00560 { 00561 skip_line(&file_buf_p, file_buf, file_len); 00562 } 00563 00564 create_zero_vector_u32(labels_out, num_samples); 00565 00566 for (sample = 0; sample < num_samples; sample++) 00567 { 00568 len = file_len - (file_buf_p - file_buf); 00569 00570 switch (opts.input_format) 00571 { 00572 case HAPLO_INPUT_TXT: 00573 if (!skip_to_txt_column(&file_buf_p, len, label_col)) 00574 { 00575 snprintf(s, slen, "No label in sample %d", sample+1); 00576 return JWSC_EIO(s); 00577 } 00578 read_txt_column(file_buf_p, len, col_buf, MAX_COL_SIZE); 00579 break; 00580 case HAPLO_INPUT_CSV: 00581 if (!skip_to_csv_column(&file_buf_p, len, label_col)) 00582 { 00583 snprintf(s, slen, "No label in sample %d", sample+1); 00584 return JWSC_EIO(s); 00585 } 00586 read_csv_column(file_buf_p, len, col_buf, MAX_COL_SIZE); 00587 break; 00588 case HAPLO_INPUT_XML: 00589 abort(); 00590 } 00591 00592 if (lookup_haplo_group_index_from_label( 00593 &((*labels_out)->elts[ sample ]), 00594 (const char*) col_buf) != NULL) 00595 { 00596 snprintf(s, slen, "Label '%s' in sample %d not recognized", 00597 col_buf, sample+1); 00598 return JWSC_EIO(s); 00599 } 00600 00601 skip_line(&file_buf_p, file_buf, file_len); 00602 } 00603 00604 return NULL; 00605 } 00606 00607 00619 static Error* read_markers_from_txt_csv 00620 ( 00621 Matrix_i32** markers_out, 00622 const char* file_buf, 00623 uint32_t file_len, 00624 uint32_t num_samples, 00625 uint8_t aux 00626 ) 00627 { 00628 const char* file_buf_p; 00629 const char* file_buf_p_saved; 00630 uint32_t num_markers; 00631 uint32_t first_marker_col; 00632 uint32_t sample; 00633 uint32_t marker; 00634 uint32_t col; 00635 uint32_t len; 00636 int32_t marker_val; 00637 char col_buf[ MAX_COL_SIZE ] = {0}; 00638 const Vector_u32* marker_cols; 00639 int slen = 256; 00640 char s[slen]; 00641 00642 if (aux) 00643 assert(opts.aux_first_marker_col > 0); 00644 else 00645 assert(opts.first_marker_col > 0); 00646 00647 file_buf_p = file_buf; 00648 00649 if (opts.header_in) 00650 { 00651 skip_line(&file_buf_p, file_buf, file_len); 00652 } 00653 00654 num_markers = (aux) ? opts.aux_num_markers : opts.num_markers; 00655 marker_cols = (aux) ? opts.aux_marker_cols : opts.marker_cols; 00656 first_marker_col = (aux) ? opts.aux_first_marker_col : 00657 opts.first_marker_col; 00658 00659 create_matrix_i32(markers_out, num_samples, num_markers); 00660 00661 for (sample = 0; sample < num_samples; sample++) 00662 { 00663 file_buf_p_saved = file_buf_p; 00664 00665 for (marker = 0; marker < num_markers; marker++) 00666 { 00667 file_buf_p = file_buf_p_saved; 00668 00669 col = (marker_cols) ? marker_cols->elts[ marker ] 00670 : first_marker_col + marker; 00671 00672 len = file_len - (file_buf_p - file_buf); 00673 00674 switch (opts.input_format) 00675 { 00676 case HAPLO_INPUT_TXT: 00677 if (!skip_to_txt_column(&file_buf_p, len, col)) 00678 { 00679 snprintf(s, slen, "Marker %d missing in sample %d", 00680 marker+1, sample+1); 00681 return JWSC_EIO(s); 00682 } 00683 read_txt_column(file_buf_p, len, col_buf, MAX_COL_SIZE); 00684 break; 00685 case HAPLO_INPUT_CSV: 00686 if (!skip_to_csv_column(&file_buf_p, len, col)) 00687 { 00688 snprintf(s, slen, "Marker %d missing in sample %d", 00689 marker+1, sample+1); 00690 return JWSC_EIO(s); 00691 } 00692 read_csv_column(file_buf_p, len, col_buf, MAX_COL_SIZE); 00693 break; 00694 case HAPLO_INPUT_XML: 00695 abort(); 00696 } 00697 00698 if (sscanf(col_buf, "%d", &marker_val) != 1) 00699 { 00700 snprintf(s, slen, "Marker %d in sample %d not valid", 00701 marker+1, sample+1); 00702 return JWSC_EIO(s); 00703 } 00704 (*markers_out)->elts[ sample ][ marker ] = marker_val; 00705 } 00706 00707 skip_line(&file_buf_p, file_buf, file_len); 00708 } 00709 00710 return NULL; 00711 } 00712 00713 00715 static Error* read_input_from_txt_csv 00716 ( 00717 Matblock_u8** ids_out, 00718 Vector_u32** labels_out, 00719 Matrix_i32** markers_out, 00720 const char* file_buf, 00721 uint32_t file_len, 00722 uint8_t aux 00723 ) 00724 { 00725 uint32_t num_samples; 00726 Error* err; 00727 00728 if ((err = get_num_samples_from_txt_csv(&num_samples, file_buf, file_len))) 00729 { 00730 return err; 00731 } 00732 00733 if ((err = read_ids_from_txt_csv(ids_out, file_buf, file_len, num_samples, 00734 aux))) 00735 { 00736 return err; 00737 } 00738 00739 if ((err = read_labels_from_txt_csv(labels_out, file_buf, file_len, 00740 num_samples, aux))) 00741 { 00742 return err; 00743 } 00744 00745 if ((err = read_markers_from_txt_csv(markers_out, file_buf, file_len, 00746 num_samples, aux))) 00747 { 00748 return err; 00749 } 00750 00751 return NULL; 00752 } 00753 00754 00756 static uint32_t get_num_samples_from_xml(xmlDoc* doc) 00757 { 00758 const xmlNode* root; 00759 const xmlNode* it; 00760 uint32_t N = 0; 00761 00762 root = xmlDocGetRootElement(doc); 00763 00764 for (it = root->children; it; it = it->next) 00765 { 00766 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample")) 00767 { 00768 N++; 00769 } 00770 } 00771 00772 return N; 00773 } 00774 00775 00777 static Error* read_ids_from_xml 00778 ( 00779 Matblock_u8** ids_out, 00780 xmlDoc* doc, 00781 uint32_t num_samples, 00782 uint8_t aux 00783 ) 00784 { 00785 uint32_t i, j; 00786 uint32_t num_ids; 00787 const xmlNode* root; 00788 const xmlNode* it; 00789 const xmlNode* itt; 00790 int slen = 256; 00791 char str[slen]; 00792 00793 if (!aux && !opts.id_cols) 00794 return NULL; 00795 else if (aux && !opts.aux_id_cols) 00796 return NULL; 00797 00798 num_ids = (aux) ? opts.aux_id_cols->num_elts : opts.id_cols->num_elts; 00799 create_zero_matblock_u8(ids_out, num_samples, num_ids, MAX_COL_SIZE); 00800 00801 root = xmlDocGetRootElement(doc); 00802 00803 i = 0; 00804 for (it = root->children; it; it = it->next) 00805 { 00806 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample")) 00807 { 00808 j = 0; 00809 for (itt = it->children; itt; itt = itt->next) 00810 { 00811 if (itt->type == XML_ELEMENT_NODE && 00812 XMLStrEqual(itt->name, "id")) 00813 { 00814 if (j == num_ids) 00815 { 00816 snprintf(str, slen, 00817 "Sample %d has too many IDs", i+1); 00818 return JWSC_EARG(str); 00819 } 00820 strncpy((char*)(*ids_out)->elts[ i ][ j ], 00821 (const char*)itt->children->content, MAX_COL_SIZE); 00822 j++; 00823 } 00824 } 00825 if (j < num_ids) 00826 { 00827 snprintf(str, slen, "Sample %d missing IDs", i+1); 00828 return JWSC_EARG(str); 00829 } 00830 i++; 00831 } 00832 } 00833 assert(i == num_samples); 00834 00835 return NULL; 00836 } 00837 00838 00840 static Error* read_labels_from_xml 00841 ( 00842 Vector_u32** labels_out, 00843 xmlDoc* doc, 00844 uint32_t num_samples, 00845 uint8_t aux 00846 ) 00847 { 00848 uint32_t i; 00849 const char* label; 00850 const xmlNode* root; 00851 const xmlNode* it; 00852 const xmlNode* itt; 00853 Error* err; 00854 int slen = 256; 00855 char str[slen]; 00856 00857 if (!aux && opts.label_col == 0) 00858 return NULL; 00859 else if (aux && opts.aux_label_col == 0) 00860 return NULL; 00861 00862 create_zero_vector_u32(labels_out, num_samples); 00863 00864 root = xmlDocGetRootElement(doc); 00865 00866 i = 0; 00867 for (it = root->children; it; it = it->next) 00868 { 00869 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample")) 00870 { 00871 label = 0; 00872 for (itt = it->children; itt; itt = itt->next) 00873 { 00874 if (itt->type == XML_ELEMENT_NODE && 00875 XMLStrEqual(itt->name, "label")) 00876 { 00877 label = (const char*) itt->children->content; 00878 00879 if ((err = lookup_haplo_group_index_from_label( 00880 &((*labels_out)->elts[ i ]), label))) 00881 { 00882 return err; 00883 } 00884 break; 00885 } 00886 } 00887 if (!label) 00888 { 00889 snprintf(str, slen, "Sample %d missing label", i+1); 00890 return JWSC_EARG(str); 00891 } 00892 i++; 00893 } 00894 } 00895 assert(i == num_samples); 00896 00897 return NULL; 00898 } 00899 00900 00902 static Error* read_markers_from_xml 00903 ( 00904 Matrix_i32** markers_out, 00905 xmlDoc* doc, 00906 uint32_t num_samples, 00907 uint8_t aux 00908 ) 00909 { 00910 uint32_t i, j; 00911 uint32_t num_markers; 00912 int32_t marker_val; 00913 const xmlNode* root; 00914 const xmlNode* it; 00915 const xmlNode* itt; 00916 int slen = 256; 00917 char str[slen]; 00918 00919 if (aux) 00920 assert(opts.aux_first_marker_col > 0); 00921 else 00922 assert(opts.first_marker_col > 0); 00923 00924 num_markers = (aux) ? opts.aux_num_markers : opts.num_markers; 00925 00926 create_matrix_i32(markers_out, num_samples, num_markers); 00927 00928 root = xmlDocGetRootElement(doc); 00929 00930 i = 0; 00931 for (it = root->children; it; it = it->next) 00932 { 00933 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample")) 00934 { 00935 j = 0; 00936 for (itt = it->children; itt; itt = itt->next) 00937 { 00938 if (itt->type == XML_ELEMENT_NODE && 00939 XMLStrEqual(itt->name, "marker")) 00940 { 00941 if (j == num_markers) 00942 { 00943 snprintf(str, slen, 00944 "Sample %d has too many markers", i+1); 00945 return JWSC_EARG(str); 00946 } 00947 if (sscanf((const char*)itt->children->content, "%d", 00948 &marker_val) != 1) 00949 { 00950 snprintf(str, slen, 00951 "Marker %d in sample %d not valid", j+1, i+1); 00952 return JWSC_EARG(str); 00953 } 00954 (*markers_out)->elts[ i ][ j ] = marker_val; 00955 j++; 00956 } 00957 } 00958 if (j < num_markers) 00959 { 00960 snprintf(str, slen, "Sample %d missing markers", i+1); 00961 return JWSC_EARG(str); 00962 } 00963 i++; 00964 } 00965 } 00966 assert(i == num_samples); 00967 00968 return NULL; 00969 } 00970 00971 00973 static Error* read_input_from_xml 00974 ( 00975 Matblock_u8** ids_out, 00976 Vector_u32** labels_out, 00977 Matrix_i32** markers_out, 00978 const char* file_buf, 00979 uint32_t file_len, 00980 const char* fname, 00981 uint8_t aux 00982 ) 00983 { 00984 uint32_t num_samples; 00985 xmlParserCtxt* xml_parse_ctxt; 00986 xmlValidCtxt* xml_valid_ctxt; 00987 xmlDoc* xml_doc; 00988 xmlDtd* xml_dtd; 00989 Error* err; 00990 int slen = 256; 00991 char str[slen]; 00992 00993 assert(xml_parse_ctxt = xmlNewParserCtxt()); 00994 00995 if (!(xml_doc = xmlCtxtReadMemory(xml_parse_ctxt, file_buf, file_len, fname, 00996 NULL, 0))) 00997 { 00998 return JWSC_EARG("Could not parse file"); 00999 } 01000 01001 xmlFreeParserCtxt(xml_parse_ctxt); 01002 01003 if (opts.input_dtd_fname) 01004 { 01005 assert(xml_valid_ctxt = xmlNewValidCtxt()); 01006 01007 if (!(xml_dtd = xmlParseDTD(NULL, (xmlChar*)opts.input_dtd_fname))) 01008 { 01009 snprintf(str, slen, "%s: %s", opts.input_dtd_fname, 01010 "Could not parse DTD"); 01011 return JWSC_EARG(str); 01012 } 01013 01014 if (!xmlValidateDtd(xml_valid_ctxt, xml_doc, xml_dtd)) 01015 { 01016 return JWSC_EARG("XML file not valid"); 01017 } 01018 01019 xmlFreeValidCtxt(xml_valid_ctxt); 01020 xmlFreeDtd(xml_dtd); 01021 } 01022 01023 num_samples = get_num_samples_from_xml(xml_doc); 01024 01025 if ((err = read_ids_from_xml(ids_out, xml_doc, num_samples, aux))) 01026 { 01027 return err; 01028 } 01029 01030 if ((err = read_labels_from_xml(labels_out, xml_doc, num_samples, aux))) 01031 { 01032 return err; 01033 } 01034 01035 if ((err = read_markers_from_xml(markers_out, xml_doc, num_samples, aux))) 01036 { 01037 return err; 01038 } 01039 01040 return NULL; 01041 } 01042 01043 01053 Error* read_input 01054 ( 01055 Matblock_u8** ids_out, 01056 Vector_u32** labels_out, 01057 Matrix_i32** markers_out, 01058 const char* fname 01059 ) 01060 { 01061 uint32_t file_len; 01062 char* file_buf = NULL; 01063 Error* err; 01064 int slen = 256; 01065 char str[slen]; 01066 01067 if ((err = read_file_into_buffer(&file_buf, &file_len, fname))) 01068 { 01069 return err; 01070 } 01071 01072 switch (opts.input_format) 01073 { 01074 case HAPLO_INPUT_TXT: 01075 case HAPLO_INPUT_CSV: 01076 if ((err = read_input_from_txt_csv(ids_out, labels_out, markers_out, 01077 file_buf, file_len, 0))) 01078 { 01079 snprintf(str, slen, "%s: %s", fname, err->msg); 01080 return JWSC_EIO(str); 01081 } 01082 break; 01083 case HAPLO_INPUT_XML: 01084 if ((err = read_input_from_xml(ids_out, labels_out, markers_out, 01085 file_buf, file_len, fname, 0))) 01086 { 01087 snprintf(str, slen, "%s: %s", fname, err->msg); 01088 return JWSC_EIO(str); 01089 } 01090 break; 01091 } 01092 01093 free(file_buf); 01094 01095 return err; 01096 } 01097 01098 01108 Error* read_aux_input 01109 ( 01110 Matblock_u8** ids_out, 01111 Vector_u32** labels_out, 01112 Matrix_i32** markers_out, 01113 const char* fname 01114 ) 01115 { 01116 uint32_t file_len; 01117 char* file_buf = NULL; 01118 Error* err; 01119 int slen = 256; 01120 char str[slen]; 01121 01122 if ((err = read_file_into_buffer(&file_buf, &file_len, fname))) 01123 { 01124 return err; 01125 } 01126 01127 switch (opts.input_format) 01128 { 01129 case HAPLO_INPUT_TXT: 01130 case HAPLO_INPUT_CSV: 01131 if ((err = read_input_from_txt_csv(ids_out, labels_out, 01132 markers_out, file_buf, file_len, 1))) 01133 { 01134 snprintf(str, slen, "%s: %s", fname, err->msg); 01135 return JWSC_EIO(str); 01136 } 01137 break; 01138 case HAPLO_INPUT_XML: 01139 if ((err = read_input_from_xml(ids_out, labels_out, markers_out, 01140 file_buf, file_len, fname, 1))) 01141 { 01142 snprintf(str, slen, "%s: %s", fname, err->msg); 01143 return JWSC_EIO(str); 01144 } 01145 break; 01146 } 01147 01148 free(file_buf); 01149 01150 return err; 01151 } 01152 01153 01167 Error* impute_marker_from_parent_of_haplogroup_index( 01168 uint32_t haplo_group_index, 01169 uint32_t marker_no, 01170 uint32_t sample_no, 01171 Matrix_i32* marker_sums, 01172 Matrix_i32* imp_markers 01173 ){ 01174 01175 uint32_t parent_i; // where the index of the parent will be stored 01176 Error* e= lookup_haplo_group_ancestor_index_from_haplo_group_index(&parent_i,haplo_group_index); 01177 if(e){ 01178 return e; 01179 } 01180 01181 01182 if (marker_sums->elts[ parent_i ][ marker_no ] == 0) 01183 { 01184 return impute_marker_from_parent_of_haplogroup_index(parent_i,marker_no,sample_no,marker_sums,imp_markers); 01185 } 01186 else 01187 { 01188 01189 imp_markers->elts[ sample_no ][ marker_no ] = marker_sums->elts[ parent_i ][ marker_no]; 01190 } 01191 01192 return NULL; 01193 01194 } 01195 01196 01207 Error* impute_missing_markers_from_avg 01208 ( 01209 const Vector_u32* imp_labels, 01210 Matrix_i32* imp_markers, 01211 const Vector_u32* src_labels, 01212 const Matrix_i32* src_markers 01213 ) 01214 { 01215 uint32_t s, num_imp_samples; 01216 uint32_t num_src_samples; 01217 uint32_t m, num_markers; 01218 uint32_t l, ll, num_labels; 01219 01220 Matrix_i32* marker_sums = NULL; 01221 Matrix_u32* marker_cnts = NULL; 01222 int slen = 256; 01223 char str[slen]; 01224 01225 assert(imp_markers->num_cols == src_markers->num_cols); 01226 01227 num_imp_samples = imp_markers->num_rows; 01228 num_src_samples = src_markers->num_rows; 01229 num_markers = imp_markers->num_cols; 01230 num_labels = get_num_haplo_groups(); 01231 01232 create_zero_matrix_i32(&marker_sums, num_labels, num_markers); 01233 create_zero_matrix_u32(&marker_cnts, num_labels, num_markers); 01234 01235 for (s = 0; s < num_src_samples; s++) 01236 { 01237 l = src_labels->elts[ s ]; 01238 for (ll = 0; ll < num_labels; ll++) 01239 { 01240 if (is_ancestor(l, ll)) 01241 { 01242 for (m = 0; m < num_markers; m++) 01243 { 01244 if (src_markers->elts[ s ][ m ]) 01245 { 01246 marker_sums->elts[ ll ][ m ] += 01247 src_markers->elts[ s ][ m ]; 01248 marker_cnts->elts[ ll ][ m ]++; 01249 } 01250 } 01251 } 01252 } 01253 } 01254 01255 for (l = 0; l < num_labels; l++) 01256 { 01257 for (m = 0; m < num_markers; m++) 01258 { 01259 if (marker_cnts->elts[ l ][ m ]) 01260 { 01261 marker_sums->elts[ l ][ m ] /= marker_cnts->elts[ l ][ m ]; 01262 } 01263 } 01264 } 01265 01266 for (s = 0; s < num_imp_samples; s++) 01267 { 01268 l = imp_labels->elts[ s ]; 01269 for (m = 0; m < num_markers; m++) 01270 { 01271 if (!imp_markers->elts[ s ][ m ] && 01272 marker_cnts->elts[ l ][ m ] == 0) 01273 { 01274 if(impute_marker_from_parent_of_haplogroup_index(l,m,s,marker_sums,imp_markers)){ 01275 snprintf(str, slen, "Sample %d: %s", s+1, 01276 "No data to impute missing markers"); 01277 return JWSC_EARG(str); 01278 } 01279 } 01280 else if (!imp_markers->elts[ s ][ m ]) 01281 { 01282 imp_markers->elts[ s ][ m ] = marker_sums->elts[ l ][ m ]; 01283 } 01284 } 01285 } 01286 01287 free_matrix_u32(marker_cnts); 01288 free_matrix_i32(marker_sums); 01289 01290 return NULL; 01291 } 01292 01293 01300 Error* impute_missing_markers_from_nn 01301 ( 01302 Matrix_i32* imp_markers, 01303 const Matrix_i32* src_markers 01304 ) 01305 { 01306 // TODO 01307 abort(); 01308 01309 return NULL; 01310 }