Haplo Prediction
predict haplogroups
input.c
Go to the documentation of this file.
00001 /*
00002  * This work is licensed under a Creative Commons 
00003  * Attribution-Noncommercial-Share Alike 3.0 United States License.
00004  * 
00005  *    http://creativecommons.org/licenses/by-nc-sa/3.0/us/
00006  * 
00007  * You are free:
00008  * 
00009  *    to Share - to copy, distribute, display, and perform the work
00010  *    to Remix - to make derivative works
00011  * 
00012  * Under the following conditions:
00013  * 
00014  *    Attribution. You must attribute the work in the manner specified by the
00015  *    author or licensor (but not in any way that suggests that they endorse you
00016  *    or your use of the work).
00017  * 
00018  *    Noncommercial. You may not use this work for commercial purposes.
00019  * 
00020  *    Share Alike. If you alter, transform, or build upon this work, you may
00021  *    distribute the resulting work only under the same or similar license to
00022  *    this one.
00023  * 
00024  * For any reuse or distribution, you must make clear to others the license
00025  * terms of this work. The best way to do this is by including this header.
00026  * 
00027  * Any of the above conditions can be waived if you get permission from the
00028  * copyright holder.
00029  * 
00030  * Apart from the remix rights granted under this license, nothing in this
00031  * license impairs or restricts the author's moral rights.
00032  */
00033 
00034 
00046 #include <config.h>
00047 
00048 #include <stdlib.h>
00049 #include <stdio.h>
00050 #include <string.h>
00051 #include <ctype.h>
00052 #include <inttypes.h>
00053 #include <assert.h>
00054 #include <errno.h>
00055 
00056 #include <libxml/tree.h>
00057 #include <libxml/parser.h>
00058 #include <libxml/valid.h>
00059 
00060 #ifdef HAPLO_HAVE_DMALLOC
00061 #include <dmalloc.h>
00062 #endif
00063 
00064 #include <jwsc/base/error.h>
00065 #include <jwsc/vector/vector.h>
00066 #include <jwsc/matrix/matrix.h>
00067 #include <jwsc/matblock/matblock.h>
00068 
00069 #include <xml.h>
00070 #include <haplo_groups.h>
00071 #include <options.h>
00072 #include <input.h>
00073 
00074 
00075 #define  MAX_COL_SIZE  256
00076 
00077 
00083 static uint32_t read_block_into_buffer(char* buf, uint32_t size, FILE* fp)
00084 {
00085     uint32_t n;
00086     char     c;
00087 
00088     assert(size > 0);
00089     n = 0;
00090 
00091     while (n < size && (c = fgetc(fp)) != EOF)
00092     {
00093         *buf = c;
00094         buf++;
00095         n++;
00096     }
00097 
00098     return n;
00099 }
00100 
00101 
00103 static Error* read_file_into_buffer
00104 (
00105     char**      buf_out, 
00106     uint32_t*   buf_len_out, 
00107     const char* fname
00108 )
00109 {
00110     FILE*    fp;
00111     char*    buf;
00112     char*    buf_p;
00113     uint32_t buf_len;
00114     uint32_t block_size;
00115     uint32_t n;
00116     int      slen = 256;
00117     char     str[slen];
00118 
00119     assert(*buf_out == NULL);
00120 
00121     if ((fp = fopen(fname, "r")) == NULL)
00122     {
00123         snprintf(str, slen, "%s: %s", fname, strerror(errno));
00124         return JWSC_EIO(str);
00125     }
00126 
00127     block_size = 1024;
00128     buf_len = block_size;
00129     assert(buf = malloc(buf_len*sizeof(char)));
00130     buf_p = buf;
00131 
00132     while ((n = read_block_into_buffer(buf_p, block_size, fp)) == block_size)
00133     {
00134         buf_len += block_size;
00135         assert(buf = realloc(buf, buf_len*sizeof(char)));
00136         buf_p = buf + (buf_len - block_size);
00137     }
00138     buf_len -= block_size - n;
00139 
00140     if (buf_len == 0)
00141     {
00142         snprintf(str, slen, "%s: %s", fname, "No data to read");
00143         return JWSC_EIO(str);
00144     }
00145 
00146     assert(buf = realloc(buf, buf_len*sizeof(char)));
00147 
00148     if (fclose(fp) != 0)
00149     {
00150         snprintf(str, slen, "%s: %s", fname, strerror(errno));
00151         return JWSC_EIO(str);
00152     }
00153 
00154     *buf_out = buf;
00155     *buf_len_out = buf_len;
00156 
00157     return NULL;
00158 }
00159 
00160 
00174 static uint8_t skip_to_csv_column
00175 (
00176     const char** buf_in_out, 
00177     uint32_t     buf_len, 
00178     uint32_t     col
00179 )
00180 {
00181     const char* buf_in;
00182     const char* buf_p;
00183     uint32_t cur_col;
00184 
00185     assert(buf_len > 0);
00186 
00187     buf_in = *buf_in_out;
00188     buf_p  = *buf_in_out;
00189     cur_col = 1;
00190 
00191     while ((cur_col != col) && ((buf_p - buf_in) < buf_len-1) && 
00192             (*buf_p != '\n'))
00193     {
00194         if (*buf_p == ',')
00195         {
00196             cur_col++;
00197         }
00198         buf_p++;
00199     }
00200 
00201     *buf_in_out = buf_p;
00202 
00203     if (cur_col != col)
00204     {
00205         return 0;
00206     }
00207 
00208     return 1;
00209 }
00210 
00211 
00223 static const char* read_csv_column
00224 (
00225     const char* src, 
00226     uint32_t    src_len, 
00227     char*       dst, 
00228     uint32_t    dst_len
00229 )
00230 {
00231     const char* src_p;
00232     char* dst_p;
00233 
00234     assert(src_len > 0 && dst_len > 0);
00235 
00236     src_p = src;
00237     dst_p = dst;
00238 
00239     while (((src_p - src) < src_len-1) && (*src_p != ','))
00240     {
00241         if ((dst_p - dst) == dst_len-1)
00242         {
00243             break;
00244         }
00245 
00246         *dst_p = *src_p;
00247         dst_p++;
00248         src_p++;
00249     }
00250 
00251     *dst_p = '\0';
00252 
00253     if (*src_p == ',')
00254     {
00255         src_p++;
00256     }
00257 
00258     return src_p;
00259 }
00260 
00261 
00275 static uint8_t skip_to_txt_column
00276 (
00277     const char** buf_in_out, 
00278     uint32_t     buf_len, 
00279     uint32_t     col
00280 )
00281 {
00282     const char* buf_in;
00283     const char* buf_p;
00284     uint32_t cur_col;
00285 
00286     assert(buf_len > 0);
00287 
00288     buf_in = *buf_in_out;
00289     buf_p  = *buf_in_out;
00290     cur_col = 1;
00291 
00292     while ((cur_col != col) && ((buf_p - buf_in) < buf_len-1) && 
00293             (*buf_p != '\n'))
00294     {
00295         if (*buf_p == ' ')
00296         {
00297             cur_col++;
00298             while (*buf_p == ' ') 
00299                 buf_p++;
00300         }
00301         else
00302         {
00303             buf_p++;
00304         }
00305     }
00306 
00307     *buf_in_out = buf_p;
00308 
00309     if (cur_col != col)
00310     {
00311         return 0;
00312     }
00313 
00314     return 1;
00315 }
00316 
00317 
00329 static const char* read_txt_column
00330 (
00331     const char* src, 
00332     uint32_t    src_len, 
00333     char*       dst, 
00334     uint32_t    dst_len
00335 )
00336 {
00337     const char* src_p;
00338     char* dst_p;
00339 
00340     assert(src_len > 0 && dst_len > 0);
00341 
00342     src_p = src;
00343     dst_p = dst;
00344 
00345     while (((src_p - src) < src_len-1) && (*src_p != ' '))
00346     {
00347         if ((dst_p - dst) == dst_len-1)
00348         {
00349             break;
00350         }
00351 
00352         *dst_p = *src_p;
00353         dst_p++;
00354         src_p++;
00355     }
00356 
00357     *dst_p = '\0';
00358 
00359     while (*src_p == ' ')
00360     {
00361         src_p++;
00362     }
00363 
00364     return src_p;
00365 }
00366 
00367 
00374 static uint8_t skip_line
00375 (
00376     const char** buf_in_out, 
00377     const char*  buf, 
00378     uint32_t     buf_len
00379 )
00380 {
00381     const char* buf_p;
00382 
00383     buf_p = *buf_in_out;
00384 
00385     while (((buf_p - buf) < buf_len) && (*buf_p != '\n')) 
00386     {
00387         buf_p++;
00388     }
00389     buf_p++;
00390 
00391     *buf_in_out = buf_p;
00392 
00393     return ((buf_p - buf) < buf_len);
00394 }
00395 
00396 
00398 static Error* get_num_samples_from_txt_csv
00399 (
00400     uint32_t*   num_samples_out, 
00401     const char* file_buf, 
00402     uint32_t    file_len
00403 )
00404 {
00405     const char* file_buf_p;
00406     uint32_t num_samples;
00407 
00408     file_buf_p = file_buf;
00409 
00410     if (opts.header_in) 
00411     {
00412         if (!skip_line(&file_buf_p, file_buf, file_len))
00413         {
00414             return JWSC_EIO("No samples to read");
00415         }
00416     }
00417 
00418     num_samples = 1;
00419     while (skip_line(&file_buf_p, file_buf, file_len))
00420     {
00421         num_samples++;
00422     }
00423 
00424     *num_samples_out = num_samples;
00425 
00426     return NULL;
00427 }
00428 
00429 
00441 static Error* read_ids_from_txt_csv
00442 (
00443     Matblock_u8** ids_out,
00444     const char*   file_buf,
00445     uint32_t      file_len,
00446     uint32_t      num_samples,
00447     uint8_t       aux
00448 )
00449 {
00450     const char* file_buf_p;
00451     const char* file_buf_p_saved;
00452     uint32_t sample;
00453     uint32_t num_ids;
00454     uint32_t id;
00455     uint32_t col;
00456     uint32_t len;
00457     int      slen = 256;
00458     char     s[slen];
00459 
00460     if (!aux && !opts.id_cols)
00461         return NULL;
00462     else if (aux && !opts.aux_id_cols)
00463         return NULL;
00464 
00465     file_buf_p = file_buf;
00466 
00467     if (opts.header_in) 
00468     {
00469         skip_line(&file_buf_p, file_buf, file_len);
00470     }
00471 
00472     num_ids = (aux) ? opts.aux_id_cols->num_elts : opts.id_cols->num_elts;
00473     create_zero_matblock_u8(ids_out, num_samples, num_ids, MAX_COL_SIZE);
00474 
00475     for (sample = 0; sample < num_samples; sample++)
00476     {
00477         file_buf_p_saved = file_buf_p;
00478 
00479         for (id = 0; id < num_ids; id++)
00480         {
00481             file_buf_p = file_buf_p_saved;
00482 
00483             col = (aux) ? opts.aux_id_cols->elts[ id ] : 
00484                 opts.id_cols->elts[ id ];
00485             len = file_len - (file_buf_p - file_buf);
00486 
00487             switch (opts.input_format)
00488             {
00489                 case HAPLO_INPUT_TXT:
00490                     if (!skip_to_txt_column(&file_buf_p, len, col))
00491                     {
00492                         snprintf(s, slen, "ID %d missing in sample %d", 
00493                                 id+1, sample+1);
00494                         return JWSC_EIO(s);
00495                     }
00496                     read_txt_column(file_buf_p, len, (char*)
00497                             ((*ids_out)->elts[ sample ][ id ]), MAX_COL_SIZE);
00498                     break;
00499                 case HAPLO_INPUT_CSV:
00500                     if (!skip_to_csv_column(&file_buf_p, len, col))
00501                     {
00502                         snprintf(s, slen, "ID %d missing in sample %d", 
00503                                 id+1, sample+1);
00504                         return JWSC_EIO(s);
00505                     }
00506                     read_csv_column(file_buf_p, len, (char*)
00507                             ((*ids_out)->elts[ sample ][ id ]), MAX_COL_SIZE);
00508                     break;
00509                 case HAPLO_INPUT_XML:
00510                     abort();
00511             }
00512         }
00513 
00514         skip_line(&file_buf_p, file_buf, file_len);
00515     }
00516 
00517     return NULL;
00518 }
00519 
00520 
00533 static Error* read_labels_from_txt_csv
00534 (
00535     Vector_u32** labels_out,
00536     const char*  file_buf,
00537     uint32_t     file_len,
00538     uint32_t     num_samples,
00539     uint8_t      aux
00540 )
00541 {
00542     const char* file_buf_p;
00543     uint32_t label_col;
00544     uint32_t sample;
00545     uint32_t len;
00546     char     col_buf[ MAX_COL_SIZE ] = {0};
00547     int      slen = 256;
00548     char     s[slen];
00549 
00550     if (!aux && opts.label_col == 0)
00551         return NULL;
00552     else if (aux && opts.aux_label_col == 0)
00553         return NULL;
00554 
00555     label_col = (aux) ? opts.aux_label_col : opts.label_col;
00556 
00557     file_buf_p = file_buf;
00558 
00559     if (opts.header_in) 
00560     {
00561         skip_line(&file_buf_p, file_buf, file_len);
00562     }
00563 
00564     create_zero_vector_u32(labels_out, num_samples);
00565 
00566     for (sample = 0; sample < num_samples; sample++)
00567     {
00568         len = file_len - (file_buf_p - file_buf);
00569 
00570         switch (opts.input_format)
00571         {
00572             case HAPLO_INPUT_TXT:
00573                 if (!skip_to_txt_column(&file_buf_p, len, label_col))
00574                 {
00575                     snprintf(s, slen, "No label in sample %d", sample+1);
00576                     return JWSC_EIO(s);
00577                 }
00578                 read_txt_column(file_buf_p, len, col_buf, MAX_COL_SIZE);
00579                 break;
00580             case HAPLO_INPUT_CSV:
00581                 if (!skip_to_csv_column(&file_buf_p, len, label_col))
00582                 {
00583                     snprintf(s, slen, "No label in sample %d", sample+1);
00584                     return JWSC_EIO(s);
00585                 }
00586                 read_csv_column(file_buf_p, len, col_buf, MAX_COL_SIZE);
00587                 break;
00588             case HAPLO_INPUT_XML:
00589                 abort();
00590         }
00591 
00592         if (lookup_haplo_group_index_from_label(
00593                     &((*labels_out)->elts[ sample ]),
00594                     (const char*) col_buf) != NULL)
00595         {
00596             snprintf(s, slen, "Label '%s' in sample %d not recognized", 
00597                     col_buf, sample+1);
00598             return JWSC_EIO(s);
00599         }
00600 
00601         skip_line(&file_buf_p, file_buf, file_len);
00602     }
00603 
00604     return NULL;
00605 }
00606 
00607 
00619 static Error* read_markers_from_txt_csv
00620 (
00621     Matrix_i32** markers_out,
00622     const char*  file_buf,
00623     uint32_t     file_len,
00624     uint32_t     num_samples,
00625     uint8_t      aux
00626 )
00627 {
00628     const char* file_buf_p;
00629     const char* file_buf_p_saved;
00630     uint32_t num_markers;
00631     uint32_t first_marker_col;
00632     uint32_t sample;
00633     uint32_t marker;
00634     uint32_t col;
00635     uint32_t len;
00636     int32_t  marker_val;
00637     char     col_buf[ MAX_COL_SIZE ] = {0};
00638     const Vector_u32* marker_cols;
00639     int       slen = 256;
00640     char      s[slen];
00641 
00642     if (aux)
00643         assert(opts.aux_first_marker_col > 0);
00644     else
00645         assert(opts.first_marker_col > 0);
00646 
00647     file_buf_p = file_buf;
00648 
00649     if (opts.header_in) 
00650     {
00651         skip_line(&file_buf_p, file_buf, file_len);
00652     }
00653 
00654     num_markers = (aux) ? opts.aux_num_markers : opts.num_markers;
00655     marker_cols = (aux) ? opts.aux_marker_cols : opts.marker_cols;
00656     first_marker_col = (aux) ? opts.aux_first_marker_col :
00657         opts.first_marker_col;
00658 
00659     create_matrix_i32(markers_out, num_samples, num_markers);
00660 
00661     for (sample = 0; sample < num_samples; sample++)
00662     {
00663         file_buf_p_saved = file_buf_p;
00664 
00665         for (marker = 0; marker < num_markers; marker++)
00666         {
00667             file_buf_p = file_buf_p_saved;
00668 
00669             col = (marker_cols) ? marker_cols->elts[ marker ] 
00670                                 : first_marker_col + marker;
00671 
00672             len = file_len - (file_buf_p - file_buf);
00673 
00674             switch (opts.input_format)
00675             {
00676                 case HAPLO_INPUT_TXT:
00677                     if (!skip_to_txt_column(&file_buf_p, len, col))
00678                     {
00679                         snprintf(s, slen, "Marker %d missing in sample %d",
00680                                 marker+1, sample+1);
00681                         return JWSC_EIO(s);
00682                     }
00683                     read_txt_column(file_buf_p, len, col_buf, MAX_COL_SIZE);
00684                     break;
00685                 case HAPLO_INPUT_CSV:
00686                     if (!skip_to_csv_column(&file_buf_p, len, col))
00687                     {
00688                         snprintf(s, slen, "Marker %d missing in sample %d",
00689                                 marker+1, sample+1);
00690                         return JWSC_EIO(s);
00691                     }
00692                     read_csv_column(file_buf_p, len, col_buf, MAX_COL_SIZE);
00693                     break;
00694                 case HAPLO_INPUT_XML:
00695                     abort();
00696             }
00697 
00698             if (sscanf(col_buf, "%d", &marker_val) != 1)
00699             {
00700                 snprintf(s, slen, "Marker %d in sample %d not valid",
00701                         marker+1, sample+1);
00702                 return JWSC_EIO(s);
00703             }
00704             (*markers_out)->elts[ sample ][ marker ] = marker_val;
00705         }
00706 
00707         skip_line(&file_buf_p, file_buf, file_len);
00708     }
00709 
00710     return NULL;
00711 }
00712 
00713 
00715 static Error* read_input_from_txt_csv
00716 (
00717     Matblock_u8** ids_out,
00718     Vector_u32**  labels_out,
00719     Matrix_i32**  markers_out,
00720     const char*   file_buf,
00721     uint32_t      file_len,
00722     uint8_t       aux
00723 )
00724 {
00725     uint32_t num_samples;
00726     Error* err;
00727 
00728     if ((err = get_num_samples_from_txt_csv(&num_samples, file_buf, file_len)))
00729     {
00730         return err;
00731     }
00732 
00733     if ((err = read_ids_from_txt_csv(ids_out, file_buf, file_len, num_samples,
00734                     aux)))
00735     {
00736         return err;
00737     }
00738     
00739     if ((err = read_labels_from_txt_csv(labels_out, file_buf, file_len, 
00740                     num_samples, aux)))
00741     {
00742         return err;
00743     }
00744 
00745     if ((err = read_markers_from_txt_csv(markers_out, file_buf, file_len, 
00746                     num_samples, aux)))
00747     {
00748         return err;
00749     }
00750 
00751     return NULL;
00752 }
00753 
00754 
00756 static uint32_t get_num_samples_from_xml(xmlDoc* doc)
00757 {
00758     const xmlNode* root;
00759     const xmlNode* it;
00760     uint32_t N = 0;
00761 
00762     root = xmlDocGetRootElement(doc);
00763 
00764     for (it = root->children; it; it = it->next)
00765     {
00766         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample"))
00767         {
00768             N++;
00769         }
00770     }
00771 
00772     return N;
00773 }
00774 
00775 
00777 static Error* read_ids_from_xml
00778 (
00779     Matblock_u8** ids_out, 
00780     xmlDoc*       doc, 
00781     uint32_t      num_samples,
00782     uint8_t       aux
00783 )
00784 {
00785     uint32_t i, j;
00786     uint32_t num_ids;
00787     const xmlNode* root;
00788     const xmlNode* it;
00789     const xmlNode* itt;
00790     int  slen = 256;
00791     char str[slen];
00792 
00793     if (!aux && !opts.id_cols)
00794         return NULL;
00795     else if (aux && !opts.aux_id_cols)
00796         return NULL;
00797 
00798     num_ids = (aux) ? opts.aux_id_cols->num_elts : opts.id_cols->num_elts;
00799     create_zero_matblock_u8(ids_out, num_samples, num_ids, MAX_COL_SIZE);
00800 
00801     root = xmlDocGetRootElement(doc);
00802 
00803     i = 0;
00804     for (it = root->children; it; it = it->next)
00805     {
00806         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample"))
00807         {
00808             j = 0;
00809             for (itt = it->children; itt; itt = itt->next)
00810             {
00811                 if (itt->type == XML_ELEMENT_NODE && 
00812                         XMLStrEqual(itt->name, "id"))
00813                 {
00814                     if (j == num_ids)
00815                     {
00816                         snprintf(str, slen, 
00817                                 "Sample %d has too many IDs", i+1);
00818                         return JWSC_EARG(str);
00819                     }
00820                     strncpy((char*)(*ids_out)->elts[ i ][ j ], 
00821                             (const char*)itt->children->content, MAX_COL_SIZE);
00822                     j++;
00823                 }
00824             }
00825             if (j < num_ids)
00826             {
00827                 snprintf(str, slen, "Sample %d missing IDs", i+1);
00828                 return JWSC_EARG(str);
00829             }
00830             i++;
00831         }
00832     }
00833     assert(i == num_samples);
00834 
00835     return NULL;
00836 }
00837 
00838 
00840 static Error* read_labels_from_xml
00841 (
00842     Vector_u32** labels_out, 
00843     xmlDoc*      doc, 
00844     uint32_t     num_samples,
00845     uint8_t      aux
00846 )
00847 {
00848     uint32_t i;
00849     const char*    label;
00850     const xmlNode* root;
00851     const xmlNode* it;
00852     const xmlNode* itt;
00853     Error* err;
00854     int    slen = 256;
00855     char   str[slen];
00856 
00857     if (!aux && opts.label_col == 0)
00858         return NULL;
00859     else if (aux && opts.aux_label_col == 0)
00860         return NULL;
00861 
00862     create_zero_vector_u32(labels_out, num_samples);
00863 
00864     root = xmlDocGetRootElement(doc);
00865 
00866     i = 0;
00867     for (it = root->children; it; it = it->next)
00868     {
00869         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample"))
00870         {
00871             label = 0;
00872             for (itt = it->children; itt; itt = itt->next)
00873             {
00874                 if (itt->type == XML_ELEMENT_NODE &&
00875                         XMLStrEqual(itt->name, "label"))
00876                 {
00877                     label = (const char*) itt->children->content;
00878 
00879                     if ((err = lookup_haplo_group_index_from_label(
00880                                 &((*labels_out)->elts[ i ]), label)))
00881                     {
00882                         return err;
00883                     }
00884                     break;
00885                 }
00886             }
00887             if (!label)
00888             {
00889                 snprintf(str, slen, "Sample %d missing label", i+1);
00890                 return JWSC_EARG(str);
00891             }
00892             i++;
00893         }
00894     }
00895     assert(i == num_samples);
00896 
00897     return NULL;
00898 }
00899 
00900 
00902 static Error* read_markers_from_xml
00903 (
00904     Matrix_i32** markers_out, 
00905     xmlDoc*      doc, 
00906     uint32_t     num_samples,
00907     uint8_t      aux
00908 )
00909 {
00910     uint32_t i, j;
00911     uint32_t num_markers;
00912     int32_t  marker_val;
00913     const xmlNode* root;
00914     const xmlNode* it;
00915     const xmlNode* itt;
00916     int  slen = 256;
00917     char str[slen];
00918 
00919     if (aux)
00920         assert(opts.aux_first_marker_col > 0);
00921     else
00922         assert(opts.first_marker_col > 0);
00923 
00924     num_markers = (aux) ? opts.aux_num_markers : opts.num_markers;
00925 
00926     create_matrix_i32(markers_out, num_samples, num_markers);
00927 
00928     root = xmlDocGetRootElement(doc);
00929 
00930     i = 0;
00931     for (it = root->children; it; it = it->next)
00932     {
00933         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "sample"))
00934         {
00935             j = 0;
00936             for (itt = it->children; itt; itt = itt->next)
00937             {
00938                 if (itt->type == XML_ELEMENT_NODE &&
00939                         XMLStrEqual(itt->name, "marker"))
00940                 {
00941                     if (j == num_markers)
00942                     {
00943                         snprintf(str, slen, 
00944                                 "Sample %d has too many markers", i+1);
00945                         return JWSC_EARG(str);
00946                     }
00947                     if (sscanf((const char*)itt->children->content, "%d",  
00948                             &marker_val) != 1)
00949                     {
00950                         snprintf(str, slen, 
00951                                 "Marker %d in sample %d not valid", j+1, i+1);
00952                         return JWSC_EARG(str);
00953                     }
00954                     (*markers_out)->elts[ i ][ j ] = marker_val;
00955                     j++;
00956                 }
00957             }
00958             if (j < num_markers)
00959             {
00960                 snprintf(str, slen, "Sample %d missing markers", i+1);
00961                 return JWSC_EARG(str);
00962             }
00963             i++;
00964         }
00965     }
00966     assert(i == num_samples);
00967 
00968     return NULL;
00969 }
00970 
00971 
00973 static Error* read_input_from_xml
00974 (
00975     Matblock_u8** ids_out,
00976     Vector_u32**  labels_out,
00977     Matrix_i32**  markers_out,
00978     const char*   file_buf,
00979     uint32_t      file_len,
00980     const char*   fname,
00981     uint8_t       aux
00982 )
00983 {
00984     uint32_t       num_samples;
00985     xmlParserCtxt* xml_parse_ctxt;
00986     xmlValidCtxt*  xml_valid_ctxt;
00987     xmlDoc*        xml_doc;
00988     xmlDtd*        xml_dtd;
00989     Error*         err;
00990     int            slen = 256;
00991     char           str[slen];
00992 
00993     assert(xml_parse_ctxt = xmlNewParserCtxt());
00994 
00995     if (!(xml_doc = xmlCtxtReadMemory(xml_parse_ctxt, file_buf, file_len, fname,
00996                     NULL, 0)))
00997     {
00998         return JWSC_EARG("Could not parse file");
00999     } 
01000 
01001     xmlFreeParserCtxt(xml_parse_ctxt);
01002 
01003     if (opts.input_dtd_fname)
01004     {
01005         assert(xml_valid_ctxt = xmlNewValidCtxt());
01006 
01007         if (!(xml_dtd = xmlParseDTD(NULL, (xmlChar*)opts.input_dtd_fname)))
01008         {
01009             snprintf(str, slen, "%s: %s", opts.input_dtd_fname,
01010                     "Could not parse DTD");
01011             return JWSC_EARG(str);
01012         }
01013 
01014         if (!xmlValidateDtd(xml_valid_ctxt, xml_doc, xml_dtd))
01015         {
01016             return JWSC_EARG("XML file not valid");
01017         }
01018 
01019         xmlFreeValidCtxt(xml_valid_ctxt);
01020         xmlFreeDtd(xml_dtd);
01021     }
01022 
01023     num_samples = get_num_samples_from_xml(xml_doc);
01024 
01025     if ((err = read_ids_from_xml(ids_out, xml_doc, num_samples, aux)))
01026     {
01027         return err;
01028     }
01029 
01030     if ((err = read_labels_from_xml(labels_out, xml_doc, num_samples, aux)))
01031     {
01032         return err;
01033     }
01034 
01035     if ((err = read_markers_from_xml(markers_out, xml_doc, num_samples, aux)))
01036     {
01037         return err;
01038     }
01039 
01040     return NULL;
01041 }
01042 
01043 
01053 Error* read_input
01054 (
01055     Matblock_u8** ids_out,
01056     Vector_u32**  labels_out,
01057     Matrix_i32**  markers_out,
01058     const char*   fname
01059 )
01060 {
01061     uint32_t file_len;
01062     char*    file_buf = NULL;
01063     Error*   err;
01064     int  slen = 256;
01065     char str[slen];
01066 
01067     if ((err = read_file_into_buffer(&file_buf, &file_len, fname)))
01068     {
01069         return err;
01070     }
01071 
01072     switch (opts.input_format)
01073     {
01074         case HAPLO_INPUT_TXT:
01075         case HAPLO_INPUT_CSV:
01076             if ((err = read_input_from_txt_csv(ids_out, labels_out, markers_out,
01077                             file_buf, file_len, 0)))
01078             {
01079                 snprintf(str, slen, "%s: %s", fname, err->msg);
01080                 return JWSC_EIO(str);
01081             }
01082             break;
01083         case HAPLO_INPUT_XML:
01084             if ((err = read_input_from_xml(ids_out, labels_out, markers_out,
01085                             file_buf, file_len, fname, 0)))
01086             {
01087                 snprintf(str, slen, "%s: %s", fname, err->msg);
01088                 return JWSC_EIO(str);
01089             }
01090             break;
01091     }
01092 
01093     free(file_buf);
01094 
01095     return err;
01096 }
01097 
01098 
01108 Error* read_aux_input
01109 (
01110     Matblock_u8** ids_out,
01111     Vector_u32**  labels_out,
01112     Matrix_i32**  markers_out,
01113     const char*   fname
01114 )
01115 {
01116     uint32_t file_len;
01117     char*    file_buf = NULL;
01118     Error*   err;
01119     int  slen = 256;
01120     char str[slen];
01121 
01122     if ((err = read_file_into_buffer(&file_buf, &file_len, fname)))
01123     {
01124         return err;
01125     }
01126 
01127     switch (opts.input_format)
01128     {
01129         case HAPLO_INPUT_TXT:
01130         case HAPLO_INPUT_CSV:
01131             if ((err = read_input_from_txt_csv(ids_out, labels_out,
01132                             markers_out, file_buf, file_len, 1)))
01133             {
01134                 snprintf(str, slen, "%s: %s", fname, err->msg);
01135                 return JWSC_EIO(str);
01136             }
01137             break;
01138         case HAPLO_INPUT_XML:
01139             if ((err = read_input_from_xml(ids_out, labels_out, markers_out,
01140                             file_buf, file_len, fname, 1)))
01141             {
01142                 snprintf(str, slen, "%s: %s", fname, err->msg);
01143                 return JWSC_EIO(str);
01144             }
01145             break;
01146     }
01147 
01148     free(file_buf);
01149 
01150     return err;
01151 }
01152 
01153 
01167 Error* impute_marker_from_parent_of_haplogroup_index(
01168         uint32_t haplo_group_index,
01169         uint32_t marker_no,
01170         uint32_t sample_no,
01171         Matrix_i32* marker_sums,
01172         Matrix_i32* imp_markers
01173         ){
01174                 
01175                 uint32_t parent_i;  // where the index of the parent will be stored
01176                 Error* e= lookup_haplo_group_ancestor_index_from_haplo_group_index(&parent_i,haplo_group_index);
01177                 if(e){
01178                         return e;
01179                 }
01180                 
01181         
01182                 if (marker_sums->elts[ parent_i ][ marker_no ] == 0)
01183             {
01184                         return impute_marker_from_parent_of_haplogroup_index(parent_i,marker_no,sample_no,marker_sums,imp_markers);
01185             }
01186             else
01187             {
01188                         
01189                 imp_markers->elts[ sample_no ][ marker_no ] = marker_sums->elts[ parent_i ][ marker_no];
01190             }
01191 
01192                 return NULL;
01193                 
01194         }
01195 
01196 
01207 Error* impute_missing_markers_from_avg
01208 (
01209     const Vector_u32* imp_labels, 
01210     Matrix_i32*       imp_markers,
01211     const Vector_u32* src_labels, 
01212     const Matrix_i32* src_markers
01213 )
01214 {
01215     uint32_t s, num_imp_samples;
01216     uint32_t num_src_samples;
01217     uint32_t m, num_markers;
01218     uint32_t l, ll, num_labels;
01219 
01220     Matrix_i32* marker_sums = NULL;
01221     Matrix_u32* marker_cnts = NULL;
01222     int  slen = 256;
01223     char str[slen];
01224 
01225     assert(imp_markers->num_cols == src_markers->num_cols);
01226 
01227     num_imp_samples = imp_markers->num_rows;
01228     num_src_samples = src_markers->num_rows;
01229     num_markers = imp_markers->num_cols;
01230     num_labels  = get_num_haplo_groups();
01231 
01232     create_zero_matrix_i32(&marker_sums, num_labels, num_markers);
01233     create_zero_matrix_u32(&marker_cnts, num_labels, num_markers);
01234 
01235     for (s = 0; s < num_src_samples; s++)
01236     {
01237         l = src_labels->elts[ s ];
01238         for (ll = 0; ll < num_labels; ll++)
01239         {
01240             if (is_ancestor(l, ll))
01241             {
01242                 for (m = 0; m < num_markers; m++)
01243                 {
01244                     if (src_markers->elts[ s ][ m ])
01245                     {
01246                         marker_sums->elts[ ll ][ m ] += 
01247                             src_markers->elts[ s ][ m ];
01248                         marker_cnts->elts[ ll ][ m ]++;
01249                     }
01250                 }
01251             }
01252         }
01253     }
01254 
01255     for (l = 0; l < num_labels; l++)
01256     {
01257         for (m = 0; m < num_markers; m++)
01258         {
01259             if (marker_cnts->elts[ l ][ m ])
01260             {
01261                 marker_sums->elts[ l ][ m ] /= marker_cnts->elts[ l ][ m ];
01262             }
01263         }       
01264     }
01265 
01266     for (s = 0; s < num_imp_samples; s++)
01267     {
01268         l = imp_labels->elts[ s ];
01269         for (m = 0; m < num_markers; m++)
01270         {
01271             if (!imp_markers->elts[ s ][ m ] && 
01272                     marker_cnts->elts[ l ][ m ] == 0)
01273             {
01274                 if(impute_marker_from_parent_of_haplogroup_index(l,m,s,marker_sums,imp_markers)){
01275                         snprintf(str, slen, "Sample %d: %s", s+1, 
01276                                 "No data to impute missing markers");
01277                         return JWSC_EARG(str);
01278                   }
01279             }
01280             else if (!imp_markers->elts[ s ][ m ])
01281             {
01282                 imp_markers->elts[ s ][ m ] = marker_sums->elts[ l ][ m ];
01283             }
01284         }
01285     }
01286 
01287     free_matrix_u32(marker_cnts);
01288     free_matrix_i32(marker_sums);
01289 
01290     return NULL;
01291 }
01292 
01293 
01300 Error* impute_missing_markers_from_nn
01301 (
01302     Matrix_i32*       imp_markers,
01303     const Matrix_i32* src_markers
01304 )
01305 {
01306     // TODO
01307     abort();
01308 
01309     return NULL;
01310 }