Haplo Prediction
predict haplogroups
|
00001 /* 00002 * This work is licensed under a Creative Commons 00003 * Attribution-Noncommercial-Share Alike 3.0 United States License. 00004 * 00005 * http://creativecommons.org/licenses/by-nc-sa/3.0/us/ 00006 * 00007 * You are free: 00008 * 00009 * to Share - to copy, distribute, display, and perform the work 00010 * to Remix - to make derivative works 00011 * 00012 * Under the following conditions: 00013 * 00014 * Attribution. You must attribute the work in the manner specified by the 00015 * author or licensor (but not in any way that suggests that they endorse you 00016 * or your use of the work). 00017 * 00018 * Noncommercial. You may not use this work for commercial purposes. 00019 * 00020 * Share Alike. If you alter, transform, or build upon this work, you may 00021 * distribute the resulting work only under the same or similar license to 00022 * this one. 00023 * 00024 * For any reuse or distribution, you must make clear to others the license 00025 * terms of this work. The best way to do this is by including this header. 00026 * 00027 * Any of the above conditions can be waived if you get permission from the 00028 * copyright holder. 00029 * 00030 * Apart from the remix rights granted under this license, nothing in this 00031 * license impairs or restricts the author's moral rights. 00032 */ 00033 00034 00046 #include <config.h> 00047 00048 #include <stdlib.h> 00049 #include <inttypes.h> 00050 #include <string.h> 00051 #include <assert.h> 00052 00053 #include <libxml/tree.h> 00054 #include <libxml/parser.h> 00055 00056 #ifdef HAPLO_HAVE_DMALLOC 00057 #include <dmalloc.h> 00058 #endif 00059 00060 #include <jwsc/base/error.h> 00061 #include <jwsc/vector/vector.h> 00062 00063 #include "options.h" 00064 #include "xml.h" 00065 #include "haplo_groups.h" 00066 00067 00069 static uint32_t num_haplo_groups = 0; 00070 00071 00077 static const xmlNode** haplo_groups = NULL; 00078 00079 00081 static xmlDoc* haplo_groups_xml = NULL; 00082 00083 00089 static int compare_label_and_group_label 00090 ( 00091 const char* label_1, 00092 const xmlNode** group_2 00093 ) 00094 { 00095 const char* label_2; 00096 const xmlNode* it; 00097 00098 for (it = (*group_2)->children; it; it = it->next) 00099 { 00100 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label")) 00101 { 00102 label_2 = (const char*) it->children->content; 00103 break; 00104 } 00105 } 00106 00107 return strcmp(label_1, label_2); 00108 } 00109 00110 00116 static int compare_label_and_group_altlabels 00117 ( 00118 const char* label_1, 00119 const xmlNode** group_2 00120 ) 00121 { 00122 const char* altlabel_2; 00123 const xmlNode* it; 00124 00125 for (it = (*group_2)->children; it; it = it->next) 00126 { 00127 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "altlabel")) 00128 { 00129 altlabel_2 = (const char*) it->children->content; 00130 00131 if (strcmp(label_1, altlabel_2) == 0) 00132 { 00133 return 0; 00134 } 00135 } 00136 } 00137 00138 return 1; 00139 } 00140 00141 00147 static int compare_group_labels 00148 ( 00149 const xmlNode** group_1, 00150 const xmlNode** group_2 00151 ) 00152 { 00153 const char* label_1; 00154 const xmlNode* it; 00155 00156 for (it = (*group_1)->children; it; it = it->next) 00157 { 00158 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label")) 00159 { 00160 label_1 = (const char*) it->children->content; 00161 break; 00162 } 00163 } 00164 00165 return compare_label_and_group_label(label_1, group_2); 00166 } 00167 00168 00175 static uint32_t count_num_haplo_groups(const xmlNode* group) 00176 { 00177 const xmlNode* it; 00178 uint32_t N = 0; 00179 00180 for (it = group; it; it = it->next) 00181 { 00182 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "group")) 00183 { 00184 N += 1 + count_num_haplo_groups(it->children); 00185 } 00186 } 00187 00188 return N; 00189 } 00190 00191 00198 static void enumerate_haplo_groups 00199 ( 00200 xmlNode* group, 00201 const xmlNode*** groups_in_out 00202 ) 00203 { 00204 xmlNode* it; 00205 00206 for (it = group; it; it = it->next) 00207 { 00208 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "group")) 00209 { 00210 **groups_in_out = it; 00211 (*groups_in_out)++; 00212 00213 enumerate_haplo_groups(it->children, groups_in_out); 00214 } 00215 } 00216 00217 assert((*groups_in_out - haplo_groups) <= num_haplo_groups); 00218 } 00219 00220 00221 static void set_haplo_group_back_pointers() 00222 { 00223 uint32_t i; 00224 xmlNode* group; 00225 00226 for (i = 0; i < num_haplo_groups; i++) 00227 { 00228 group = (xmlNode*)(haplo_groups[ i ]); 00229 group->_private = haplo_groups + i; 00230 } 00231 } 00232 00233 00238 static void build_haplo_groups_array_from_xml() 00239 { 00240 const xmlNode* root; 00241 const xmlNode** groups; 00242 00243 root = xmlDocGetRootElement(haplo_groups_xml); 00244 00245 num_haplo_groups = count_num_haplo_groups(root->children); 00246 assert((haplo_groups = malloc(num_haplo_groups * sizeof(xmlNode*)))); 00247 00248 groups = haplo_groups; 00249 enumerate_haplo_groups(root->children, &groups); 00250 00251 qsort(haplo_groups, num_haplo_groups, sizeof(xmlNode*), 00252 (int (*)(const void*, const void*))compare_group_labels); 00253 00254 set_haplo_group_back_pointers(); 00255 } 00256 00257 00265 static const xmlNode* binary_search_for_haplo_group_label 00266 ( 00267 uint32_t* index_out, 00268 const char* label 00269 ) 00270 { 00271 const xmlNode** group; 00272 00273 if ((group = bsearch(label, haplo_groups, num_haplo_groups, 00274 sizeof(xmlNode*), (int (*)(const void*, const void*)) 00275 compare_label_and_group_label))) 00276 { 00277 *index_out = group - haplo_groups; 00278 return *group; 00279 } 00280 00281 return NULL; 00282 } 00283 00284 00293 static const xmlNode* linear_search_for_haplo_group_altlabel 00294 ( 00295 uint32_t* index_out, 00296 const char* altlabel 00297 ) 00298 { 00299 const xmlNode** group; 00300 uint32_t i; 00301 00302 for (i = 0; i < num_haplo_groups; i++) 00303 { 00304 group = &(haplo_groups[ i ]); 00305 00306 if (compare_label_and_group_altlabels(altlabel, group) == 0) 00307 { 00308 *index_out = group - haplo_groups; 00309 return *group; 00310 } 00311 } 00312 00313 return NULL; 00314 } 00315 00316 00324 Error* read_haplo_groups(const char* fname) 00325 { 00326 xmlParserCtxt* xml_parse_ctxt; 00327 xmlValidCtxt* xml_valid_ctxt; 00328 xmlDtd* xml_dtd; 00329 int slen = 256; 00330 char s[slen]; 00331 00332 assert(haplo_groups == NULL); 00333 assert(haplo_groups_xml == NULL); 00334 00335 if (!fname) 00336 { 00337 return JWSC_EARG("No labels file specified"); 00338 } 00339 00340 assert(xml_parse_ctxt = xmlNewParserCtxt()); 00341 00342 if (!(haplo_groups_xml = xmlCtxtReadFile(xml_parse_ctxt, fname, NULL, 0))) 00343 { 00344 snprintf(s, slen, "%s: %s",fname, "Could not parse file"); 00345 return JWSC_EARG(s); 00346 } 00347 00348 xmlFreeParserCtxt(xml_parse_ctxt); 00349 00350 if (opts.labels_dtd_fname) 00351 { 00352 assert(xml_valid_ctxt = xmlNewValidCtxt()); 00353 00354 if (!(xml_dtd = xmlParseDTD(NULL, (xmlChar*)opts.labels_dtd_fname))) 00355 { 00356 snprintf(s, slen, "%s: %s", opts.labels_dtd_fname, 00357 "Could not parse file"); 00358 return JWSC_EARG(s); 00359 } 00360 00361 if (!xmlValidateDtd(xml_valid_ctxt, haplo_groups_xml, xml_dtd)) 00362 { 00363 snprintf(s, slen, "%s: %s", fname, "XML file not valid"); 00364 return JWSC_EARG(s); 00365 } 00366 00367 xmlFreeValidCtxt(xml_valid_ctxt); 00368 xmlFreeDtd(xml_dtd); 00369 } 00370 00371 build_haplo_groups_array_from_xml(); 00372 00373 return NULL; 00374 } 00375 00376 00383 Error* lookup_haplo_group_index_from_label 00384 ( 00385 uint32_t* index_out, 00386 const char* label 00387 ) 00388 { 00389 int slen = 256; 00390 char s[slen]; 00391 00392 if (binary_search_for_haplo_group_label(index_out, label)) 00393 { 00394 return NULL; 00395 } 00396 00397 if (linear_search_for_haplo_group_altlabel(index_out, label)) 00398 { 00399 return NULL; 00400 } 00401 00402 snprintf(s, slen, "%s: %s", label, "Could not find haplo label"); 00403 return JWSC_EARG(s); 00404 } 00405 00406 00413 Error* lookup_haplo_group_label_from_index 00414 ( 00415 const char** label_out, 00416 uint32_t index 00417 ) 00418 { 00419 xmlNode* it; 00420 char s[256]; 00421 00422 if (index >= num_haplo_groups) 00423 { 00424 snprintf(s, 256, "Haplo label index '%u' too large", index); 00425 return JWSC_EARG(s); 00426 } 00427 00428 for (it = haplo_groups[ index ]->children; it; it = it->next) 00429 { 00430 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label")) 00431 { 00432 *label_out = (const char*) it->children->content; 00433 } 00434 } 00435 00436 return NULL; 00437 } 00438 00439 Error* lookup_haplo_group_ancestor_index_from_haplo_group_index 00440 ( 00441 uint32_t* index_out, 00442 uint32_t index 00443 ) 00444 { 00445 xmlNode* parent; 00446 int slen = 256; 00447 char s[slen]; 00448 xmlNode* it; 00449 const char* label_in; 00450 00451 if (index >= num_haplo_groups) 00452 { 00453 snprintf(s, slen, "Haplo label index '%u' too large", index); 00454 return JWSC_EARG(s); 00455 } 00456 00457 parent = haplo_groups[ index ]->parent; 00458 00459 if(!parent || XMLStrEqual(parent->name, "haplo-labels")) 00460 { 00461 snprintf(s, slen, "Haplo label index '%u' too large", index); 00462 return JWSC_EARG(s); 00463 } 00464 00465 for (it = parent->children; it; it = it->next) 00466 { 00467 if (it->type == XML_ELEMENT_NODE && XMLStrEqual(it->name, "label")) 00468 { 00469 label_in = (const char*) it->children->content; 00470 break; 00471 } 00472 } 00473 00474 if(lookup_haplo_group_index_from_label(index_out,label_in)) 00475 { 00476 snprintf(s, slen, "Label %s from parent not recognized", label_in); 00477 return JWSC_EARG(s); 00478 } 00479 00480 return NULL; 00481 } 00482 00484 uint32_t get_num_haplo_groups() 00485 { 00486 return num_haplo_groups; 00487 } 00488 00489 00501 static uint8_t has_ancestor(const xmlNode* group_1, const xmlNode* group_2) 00502 { 00503 if (group_1 == group_2) 00504 { 00505 return 1; 00506 } 00507 00508 if (group_1->parent) 00509 { 00510 return has_ancestor(group_1->parent, group_2); 00511 } 00512 00513 return 0; 00514 } 00515 00516 00525 static Haplo_ancestor_type find_ancestor_group_2 00526 ( 00527 const xmlNode** ancestor_out, 00528 const xmlNode* group_1, 00529 const xmlNode* group_2 00530 ) 00531 { 00532 if (group_1 == group_2) 00533 { 00534 *ancestor_out = group_1; 00535 return HAPLO_ANCESTOR_INDIRECT; 00536 } 00537 00538 if (has_ancestor(group_1, group_2)) 00539 { 00540 *ancestor_out = group_2; 00541 return HAPLO_ANCESTOR_INDIRECT; 00542 } 00543 00544 assert(group_2->parent); 00545 return find_ancestor_group_2(ancestor_out, group_1, group_2->parent); 00546 } 00547 00548 00557 static Haplo_ancestor_type find_ancestor_group_1 00558 ( 00559 const xmlNode** ancestor_out, 00560 const xmlNode* group_1, 00561 const xmlNode* group_2 00562 ) 00563 { 00564 if (group_1 == group_2) 00565 { 00566 *ancestor_out = group_1; 00567 return HAPLO_ANCESTOR_DIRECT; 00568 } 00569 00570 if (has_ancestor(group_1, group_2)) 00571 { 00572 *ancestor_out = group_2; 00573 return HAPLO_ANCESTOR_DIRECT; 00574 } 00575 00576 if (has_ancestor(group_2, group_1)) 00577 { 00578 *ancestor_out = group_1; 00579 return HAPLO_ANCESTOR_DIRECT; 00580 } 00581 00582 assert(group_2->parent); 00583 return find_ancestor_group_2(ancestor_out, group_1, group_2->parent); 00584 } 00585 00586 00594 uint8_t is_ancestor(uint32_t index_1, uint32_t index_2) 00595 { 00596 return has_ancestor(haplo_groups[ index_1 ], haplo_groups[ index_2 ]); 00597 } 00598 00599 00607 Haplo_ancestor_type find_ancestor_index_of_pair 00608 ( 00609 uint32_t* index_out, 00610 uint32_t index_1, 00611 uint32_t index_2 00612 ) 00613 { 00614 Haplo_ancestor_type ancestor_type; 00615 const xmlNode* ancestor; 00616 00617 ancestor_type = find_ancestor_group_1(&ancestor, haplo_groups[ index_1 ], 00618 haplo_groups[ index_2 ]); 00619 if ((xmlDoc*)(ancestor->parent) == haplo_groups_xml) 00620 { 00621 assert(XMLStrEqual(ancestor->name, "haplo-labels")); 00622 return HAPLO_ANCESTOR_NONE; 00623 } 00624 00625 *index_out = ((const xmlNode**)(ancestor->_private)) - haplo_groups; 00626 assert(*index_out < num_haplo_groups); 00627 00628 return ancestor_type; 00629 } 00630 00631 00638 Haplo_ancestor_type find_ancestor_index_of_set 00639 ( 00640 uint32_t* index_out, 00641 const Vector_u32* indices 00642 ) 00643 { 00644 uint32_t i; 00645 Haplo_ancestor_type t1, t2; 00646 00647 t1 = HAPLO_ANCESTOR_DIRECT; 00648 *index_out = indices->elts[0]; 00649 00650 for (i = 1; i < indices->num_elts; i++) 00651 { 00652 if (t1 == HAPLO_ANCESTOR_NONE) 00653 { 00654 *index_out = 0; 00655 break; 00656 } 00657 00658 t2 = find_ancestor_index_of_pair(index_out, *index_out, 00659 indices->elts[ i ]); 00660 if (t1 == HAPLO_ANCESTOR_DIRECT && t2 != HAPLO_ANCESTOR_DIRECT) 00661 { 00662 t1 = t2; 00663 } 00664 else if (t1 == HAPLO_ANCESTOR_INDIRECT && t2 == HAPLO_ANCESTOR_NONE) 00665 { 00666 t1 = t2; 00667 } 00668 } 00669 00670 return t1; 00671 }