Haplo Prediction
predict haplogroups
haplo_groups.c
Go to the documentation of this file.
00001 /*
00002  * This work is licensed under a Creative Commons 
00003  * Attribution-Noncommercial-Share Alike 3.0 United States License.
00004  * 
00005  *    http://creativecommons.org/licenses/by-nc-sa/3.0/us/
00006  * 
00007  * You are free:
00008  * 
00009  *    to Share - to copy, distribute, display, and perform the work
00010  *    to Remix - to make derivative works
00011  * 
00012  * Under the following conditions:
00013  * 
00014  *    Attribution. You must attribute the work in the manner specified by the
00015  *    author or licensor (but not in any way that suggests that they endorse you
00016  *    or your use of the work).
00017  * 
00018  *    Noncommercial. You may not use this work for commercial purposes.
00019  * 
00020  *    Share Alike. If you alter, transform, or build upon this work, you may
00021  *    distribute the resulting work only under the same or similar license to
00022  *    this one.
00023  * 
00024  * For any reuse or distribution, you must make clear to others the license
00025  * terms of this work. The best way to do this is by including this header.
00026  * 
00027  * Any of the above conditions can be waived if you get permission from the
00028  * copyright holder.
00029  * 
00030  * Apart from the remix rights granted under this license, nothing in this
00031  * license impairs or restricts the author's moral rights.
00032  */
00033 
00034 
00046 #include <config.h>
00047 
00048 #include <stdlib.h>
00049 #include <inttypes.h>
00050 #include <string.h>
00051 #include <assert.h>
00052 
00053 #include <libxml/tree.h>
00054 #include <libxml/parser.h>
00055 
00056 #ifdef HAPLO_HAVE_DMALLOC
00057 #include <dmalloc.h>
00058 #endif
00059 
00060 #include <jwsc/base/error.h>
00061 #include <jwsc/vector/vector.h>
00062 
00063 #include "options.h"
00064 #include "xml.h"
00065 #include "haplo_groups.h"
00066 
00067 
00069 static uint32_t num_haplo_groups = 0;
00070 
00071 
00077 static const xmlNode** haplo_groups = NULL;
00078 
00079 
00081 static xmlDoc* haplo_groups_xml = NULL;
00082 
00083 
00089 static int compare_label_and_group_label
00090 (
00091     const char*     label_1,
00092     const xmlNode** group_2
00093 )
00094 {
00095     const char*    label_2;
00096     const xmlNode* it;
00097 
00098     for (it = (*group_2)->children; it; it = it->next)
00099     {
00100         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label"))
00101         {
00102             label_2 = (const char*) it->children->content;
00103             break;
00104         }
00105     }
00106 
00107     return strcmp(label_1, label_2);
00108 }
00109 
00110 
00116 static int compare_label_and_group_altlabels
00117 (
00118     const char*    label_1,
00119     const xmlNode** group_2
00120 )
00121 {
00122     const char*    altlabel_2;
00123     const xmlNode* it;
00124 
00125     for (it = (*group_2)->children; it; it = it->next)
00126     {
00127         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "altlabel"))
00128         {
00129             altlabel_2 = (const char*) it->children->content;
00130 
00131             if (strcmp(label_1, altlabel_2) == 0)
00132             {
00133                 return 0;
00134             }
00135         }
00136     }
00137 
00138     return 1;
00139 }
00140 
00141 
00147 static int compare_group_labels
00148 (
00149     const xmlNode** group_1,
00150     const xmlNode** group_2
00151 )
00152 {
00153     const char*    label_1;
00154     const xmlNode* it;
00155 
00156     for (it = (*group_1)->children; it; it = it->next)
00157     {
00158         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label"))
00159         {
00160             label_1 = (const char*) it->children->content;
00161             break;
00162         }
00163     }
00164 
00165     return compare_label_and_group_label(label_1, group_2);
00166 }
00167 
00168 
00175 static uint32_t count_num_haplo_groups(const xmlNode* group)
00176 {
00177     const xmlNode* it;
00178     uint32_t N = 0;
00179 
00180     for (it = group; it; it = it->next)
00181     {
00182         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "group"))
00183         {
00184             N += 1 + count_num_haplo_groups(it->children);
00185         }
00186     }
00187 
00188     return N;
00189 }
00190 
00191 
00198 static void enumerate_haplo_groups
00199 (
00200     xmlNode*         group, 
00201     const xmlNode*** groups_in_out
00202 )
00203 {
00204     xmlNode* it;
00205 
00206     for (it = group; it; it = it->next)
00207     {
00208         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "group"))
00209         {
00210             **groups_in_out = it;
00211             (*groups_in_out)++;
00212 
00213             enumerate_haplo_groups(it->children, groups_in_out);
00214         }
00215     }
00216 
00217     assert((*groups_in_out - haplo_groups) <= num_haplo_groups);
00218 }
00219 
00220 
00221 static void set_haplo_group_back_pointers()
00222 {
00223     uint32_t i;
00224     xmlNode* group;
00225 
00226     for (i = 0; i < num_haplo_groups; i++)
00227     {
00228         group = (xmlNode*)(haplo_groups[ i ]);
00229         group->_private = haplo_groups + i;
00230     }
00231 }
00232 
00233 
00238 static void build_haplo_groups_array_from_xml()
00239 {
00240     const xmlNode*  root;
00241     const xmlNode** groups;
00242 
00243     root = xmlDocGetRootElement(haplo_groups_xml);
00244 
00245     num_haplo_groups = count_num_haplo_groups(root->children);
00246     assert((haplo_groups = malloc(num_haplo_groups * sizeof(xmlNode*))));
00247 
00248     groups = haplo_groups;
00249     enumerate_haplo_groups(root->children, &groups);
00250 
00251     qsort(haplo_groups, num_haplo_groups, sizeof(xmlNode*), 
00252             (int (*)(const void*, const void*))compare_group_labels);
00253 
00254     set_haplo_group_back_pointers();
00255 }
00256 
00257 
00265 static const xmlNode* binary_search_for_haplo_group_label
00266 (
00267     uint32_t*   index_out, 
00268     const char* label
00269 )
00270 {
00271     const xmlNode** group;
00272 
00273     if ((group = bsearch(label, haplo_groups, num_haplo_groups, 
00274                     sizeof(xmlNode*), (int (*)(const void*, const void*))
00275                     compare_label_and_group_label)))
00276     {
00277         *index_out = group - haplo_groups;
00278         return *group;
00279     }
00280 
00281     return NULL;
00282 }
00283 
00284 
00293 static const xmlNode* linear_search_for_haplo_group_altlabel
00294 (
00295     uint32_t*   index_out, 
00296     const char* altlabel
00297 )
00298 {
00299     const xmlNode** group;
00300     uint32_t i;
00301 
00302     for (i = 0; i < num_haplo_groups; i++)
00303     {
00304         group = &(haplo_groups[ i ]);
00305 
00306         if (compare_label_and_group_altlabels(altlabel, group) == 0)
00307         {
00308             *index_out = group - haplo_groups;
00309             return *group;
00310         }
00311     }
00312 
00313     return NULL;
00314 }
00315 
00316 
00324 Error* read_haplo_groups(const char* fname)
00325 {
00326     xmlParserCtxt* xml_parse_ctxt;
00327     xmlValidCtxt*  xml_valid_ctxt;
00328     xmlDtd*        xml_dtd;
00329     int  slen = 256;
00330     char s[slen];
00331 
00332     assert(haplo_groups == NULL);
00333     assert(haplo_groups_xml == NULL);
00334 
00335     if (!fname)
00336     {
00337         return JWSC_EARG("No labels file specified");
00338     }
00339 
00340     assert(xml_parse_ctxt = xmlNewParserCtxt());
00341 
00342     if (!(haplo_groups_xml = xmlCtxtReadFile(xml_parse_ctxt, fname, NULL, 0)))
00343     {
00344         snprintf(s, slen, "%s: %s",fname, "Could not parse file");
00345         return JWSC_EARG(s);
00346     } 
00347 
00348     xmlFreeParserCtxt(xml_parse_ctxt);
00349 
00350     if (opts.labels_dtd_fname)
00351     {
00352         assert(xml_valid_ctxt = xmlNewValidCtxt());
00353 
00354         if (!(xml_dtd = xmlParseDTD(NULL, (xmlChar*)opts.labels_dtd_fname)))
00355         {
00356             snprintf(s, slen, "%s: %s", opts.labels_dtd_fname, 
00357                     "Could not parse file");
00358             return JWSC_EARG(s);
00359         }
00360 
00361         if (!xmlValidateDtd(xml_valid_ctxt, haplo_groups_xml, xml_dtd))
00362         {
00363             snprintf(s, slen, "%s: %s", fname, "XML file not valid");
00364             return JWSC_EARG(s);
00365         }
00366 
00367         xmlFreeValidCtxt(xml_valid_ctxt);
00368         xmlFreeDtd(xml_dtd);
00369     }
00370 
00371     build_haplo_groups_array_from_xml();
00372 
00373     return NULL;
00374 }
00375 
00376 
00383 Error* lookup_haplo_group_index_from_label
00384 (
00385     uint32_t*   index_out, 
00386     const char* label
00387 )
00388 {
00389     int slen = 256;
00390     char s[slen];
00391 
00392     if (binary_search_for_haplo_group_label(index_out, label))
00393     {
00394         return NULL;
00395     }
00396 
00397     if (linear_search_for_haplo_group_altlabel(index_out, label))
00398     {
00399         return NULL;
00400     }
00401 
00402     snprintf(s, slen, "%s: %s", label, "Could not find haplo label");
00403     return JWSC_EARG(s);
00404 }
00405 
00406 
00413 Error* lookup_haplo_group_label_from_index
00414 (
00415     const char** label_out, 
00416     uint32_t     index
00417 )
00418 {
00419     xmlNode* it;
00420     char s[256];
00421 
00422     if (index >= num_haplo_groups)
00423     {
00424         snprintf(s, 256, "Haplo label index '%u' too large", index);
00425         return JWSC_EARG(s);
00426     }
00427 
00428     for (it = haplo_groups[ index ]->children; it; it = it->next)
00429     {
00430         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label"))
00431         {
00432             *label_out = (const char*) it->children->content;
00433         }
00434     }
00435 
00436     return NULL;
00437 }
00438 
00439 Error* lookup_haplo_group_ancestor_index_from_haplo_group_index
00440 (
00441     uint32_t*   index_out, 
00442     uint32_t     index
00443 )
00444 {
00445     xmlNode* parent;
00446     int slen = 256;
00447     char s[slen];
00448     xmlNode* it;
00449     const char* label_in;
00450 
00451     if (index >= num_haplo_groups)
00452     {
00453         snprintf(s, slen, "Haplo label index '%u' too large", index);
00454         return JWSC_EARG(s);
00455     }
00456 
00457     parent = haplo_groups[ index ]->parent;
00458 
00459     if(!parent || XMLStrEqual(parent->name, "haplo-labels"))
00460     {
00461         snprintf(s, slen, "Haplo label index '%u' too large", index);
00462         return JWSC_EARG(s);
00463     }
00464 
00465     for (it = parent->children; it; it = it->next)
00466     {
00467         if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label"))
00468         {
00469             label_in = (const char*) it->children->content;
00470             break;
00471         }
00472     }
00473 
00474     if(lookup_haplo_group_index_from_label(index_out,label_in))
00475     {
00476         snprintf(s, slen, "Label %s from parent not recognized", label_in);
00477         return JWSC_EARG(s);
00478     }
00479 
00480     return NULL;
00481 }
00482 
00484 uint32_t get_num_haplo_groups()
00485 {
00486     return num_haplo_groups;
00487 }
00488 
00489 
00501 static uint8_t has_ancestor(const xmlNode* group_1, const xmlNode* group_2)
00502 {
00503     if (group_1 == group_2)
00504     {
00505         return 1;
00506     }
00507 
00508     if (group_1->parent)
00509     {
00510         return has_ancestor(group_1->parent, group_2);
00511     }
00512 
00513     return 0;
00514 }
00515 
00516 
00525 static Haplo_ancestor_type find_ancestor_group_2
00526 (
00527     const xmlNode** ancestor_out,
00528     const xmlNode*  group_1, 
00529     const xmlNode*  group_2
00530 )
00531 {
00532     if (group_1 == group_2)
00533     {
00534         *ancestor_out = group_1;
00535         return HAPLO_ANCESTOR_INDIRECT;
00536     }
00537 
00538     if (has_ancestor(group_1, group_2))
00539     {
00540         *ancestor_out = group_2;
00541         return HAPLO_ANCESTOR_INDIRECT;
00542     }
00543 
00544     assert(group_2->parent);
00545     return find_ancestor_group_2(ancestor_out, group_1, group_2->parent);
00546 }
00547 
00548 
00557 static Haplo_ancestor_type find_ancestor_group_1
00558 (
00559     const xmlNode** ancestor_out,
00560     const xmlNode*  group_1, 
00561     const xmlNode*  group_2
00562 )
00563 {
00564     if (group_1 == group_2)
00565     {
00566         *ancestor_out = group_1;
00567         return HAPLO_ANCESTOR_DIRECT;
00568     }
00569 
00570     if (has_ancestor(group_1, group_2))
00571     {
00572         *ancestor_out = group_2;
00573         return HAPLO_ANCESTOR_DIRECT;
00574     }
00575 
00576     if (has_ancestor(group_2, group_1))
00577     {
00578         *ancestor_out = group_1;
00579         return HAPLO_ANCESTOR_DIRECT;
00580     }
00581 
00582     assert(group_2->parent);
00583     return find_ancestor_group_2(ancestor_out, group_1, group_2->parent);
00584 }
00585 
00586 
00594 uint8_t is_ancestor(uint32_t index_1, uint32_t index_2)
00595 {
00596     return has_ancestor(haplo_groups[ index_1 ], haplo_groups[ index_2 ]);
00597 }
00598 
00599 
00607 Haplo_ancestor_type find_ancestor_index_of_pair
00608 (
00609     uint32_t* index_out, 
00610     uint32_t  index_1, 
00611     uint32_t  index_2
00612 )
00613 {
00614     Haplo_ancestor_type ancestor_type;
00615     const xmlNode* ancestor;
00616 
00617     ancestor_type = find_ancestor_group_1(&ancestor, haplo_groups[ index_1 ], 
00618             haplo_groups[ index_2 ]);
00619     if ((xmlDoc*)(ancestor->parent) == haplo_groups_xml)
00620     {
00621         assert(XMLStrEqual(ancestor->name, "haplo-labels"));
00622         return HAPLO_ANCESTOR_NONE;
00623     }
00624 
00625     *index_out = ((const xmlNode**)(ancestor->_private)) - haplo_groups;
00626     assert(*index_out < num_haplo_groups);
00627 
00628     return ancestor_type;
00629 }
00630 
00631 
00638 Haplo_ancestor_type find_ancestor_index_of_set
00639 (
00640     uint32_t*         index_out, 
00641     const Vector_u32* indices
00642 )
00643 {
00644     uint32_t i;
00645     Haplo_ancestor_type t1, t2;
00646 
00647     t1 = HAPLO_ANCESTOR_DIRECT;
00648     *index_out = indices->elts[0];
00649 
00650     for (i = 1; i < indices->num_elts; i++)
00651     {
00652         if (t1 == HAPLO_ANCESTOR_NONE)
00653         {
00654             *index_out = 0;
00655             break;
00656         }
00657 
00658         t2 = find_ancestor_index_of_pair(index_out, *index_out, 
00659                 indices->elts[ i ]);
00660         if (t1 == HAPLO_ANCESTOR_DIRECT && t2 != HAPLO_ANCESTOR_DIRECT)
00661         {
00662             t1 = t2;
00663         }
00664         else if (t1 == HAPLO_ANCESTOR_INDIRECT && t2 == HAPLO_ANCESTOR_NONE)
00665         {
00666             t1 = t2;
00667         }
00668     }
00669 
00670     return t1;
00671 }