Haplo Prediction
predict haplogroups
|
Reads a row-by-row collection of data samples from a file. More...
#include <config.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <inttypes.h>
#include <assert.h>
#include <errno.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/valid.h>
#include <jwsc/base/error.h>
#include <jwsc/vector/vector.h>
#include <jwsc/matrix/matrix.h>
#include <jwsc/matblock/matblock.h>
#include <xml.h>
#include <haplo_groups.h>
#include <options.h>
#include <input.h>
Go to the source code of this file.
Defines | |
#define | MAX_COL_SIZE 256 |
Functions | |
static uint32_t | read_block_into_buffer (char *buf, uint32_t size, FILE *fp) |
Reads a block of characters off a file stream into a buffer. | |
static Error * | read_file_into_buffer (char **buf_out, uint32_t *buf_len_out, const char *fname) |
Opens a file stream and reads in a file into a string buffer. | |
static uint8_t | skip_to_csv_column (const char **buf_in_out, uint32_t buf_len, uint32_t col) |
Skips a buffer pointer ahead to the specified CSV column. | |
static const char * | read_csv_column (const char *src, uint32_t src_len, char *dst, uint32_t dst_len) |
Reads the current CSV column from a source buffer into a destination and returns a pointer to the next column. | |
static uint8_t | skip_to_txt_column (const char **buf_in_out, uint32_t buf_len, uint32_t col) |
Skips a buffer pointer ahead to the specified text column. | |
static const char * | read_txt_column (const char *src, uint32_t src_len, char *dst, uint32_t dst_len) |
Reads the current text column from a source buffer into a destination and returns a pointer to the next column. | |
static uint8_t | skip_line (const char **buf_in_out, const char *buf, uint32_t buf_len) |
static Error * | get_num_samples_from_txt_csv (uint32_t *num_samples_out, const char *file_buf, uint32_t file_len) |
Counts the number of samples (rows) in a file buffer. | |
static Error * | read_ids_from_txt_csv (Matblock_u8 **ids_out, const char *file_buf, uint32_t file_len, uint32_t num_samples, uint8_t aux) |
Reads the sample IDs from a file buffer. | |
static Error * | read_labels_from_txt_csv (Vector_u32 **labels_out, const char *file_buf, uint32_t file_len, uint32_t num_samples, uint8_t aux) |
Reads the haplo group labels from a file buffer. | |
static Error * | read_markers_from_txt_csv (Matrix_i32 **markers_out, const char *file_buf, uint32_t file_len, uint32_t num_samples, uint8_t aux) |
Reads the marker data from a file. | |
static Error * | read_input_from_txt_csv (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *file_buf, uint32_t file_len, uint8_t aux) |
Reads file data from a text or csv file. | |
static uint32_t | get_num_samples_from_xml (xmlDoc *doc) |
Counts the number of samples (rows) in an xml document. | |
static Error * | read_ids_from_xml (Matblock_u8 **ids_out, xmlDoc *doc, uint32_t num_samples, uint8_t aux) |
Reads the sample ids from an xml document. | |
static Error * | read_labels_from_xml (Vector_u32 **labels_out, xmlDoc *doc, uint32_t num_samples, uint8_t aux) |
Reads the sample labels from an xml document. | |
static Error * | read_markers_from_xml (Matrix_i32 **markers_out, xmlDoc *doc, uint32_t num_samples, uint8_t aux) |
Reads the sample markers from an xml document. | |
static Error * | read_input_from_xml (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *file_buf, uint32_t file_len, const char *fname, uint8_t aux) |
Reads the input data structures from an xml file in a buffer. | |
Error * | read_input (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *fname) |
Reads the IDs, haplo labels and markers from a file. | |
Error * | read_aux_input (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *fname) |
Reads the IDs, haplo labels and markers from an auxiliary file. | |
Error * | impute_marker_from_parent_of_haplogroup_index (uint32_t haplo_group_index, uint32_t marker_no, uint32_t sample_no, Matrix_i32 *marker_sums, Matrix_i32 *imp_markers) |
Error * | impute_missing_markers_from_avg (const Vector_u32 *imp_labels, Matrix_i32 *imp_markers, const Vector_u32 *src_labels, const Matrix_i32 *src_markers) |
Imputes missing marker values (zero) with their label-dependent mean value. | |
Error * | impute_missing_markers_from_nn (Matrix_i32 *imp_markers, const Matrix_i32 *src_markers) |
Imputes missing marker values (zero) from nearest neighbor interpolation. |
Reads a row-by-row collection of data samples from a file.
Definition in file input.c.
static uint32_t read_block_into_buffer | ( | char * | buf, |
uint32_t | size, | ||
FILE * | fp | ||
) | [static] |
static Error* read_file_into_buffer | ( | char ** | buf_out, |
uint32_t * | buf_len_out, | ||
const char * | fname | ||
) | [static] |
static uint8_t skip_to_csv_column | ( | const char ** | buf_in_out, |
uint32_t | buf_len, | ||
uint32_t | col | ||
) | [static] |
Skips a buffer pointer ahead to the specified CSV column.
Skips ahead col columns from the current column. Stops if a newline character is encountered.
buf_in_out | Buffer pointer to skip to the specified column. |
buf_len | Number of buffer elements to potentially skip through. |
col | Column to skip to. |
static const char* read_csv_column | ( | const char * | src, |
uint32_t | src_len, | ||
char * | dst, | ||
uint32_t | dst_len | ||
) | [static] |
Reads the current CSV column from a source buffer into a destination and returns a pointer to the next column.
src | Buffer to read from. |
src_len | Length of the source buffer. |
dst | Buffer to read into. |
dst_len | Length of the destination buffer. |
static uint8_t skip_to_txt_column | ( | const char ** | buf_in_out, |
uint32_t | buf_len, | ||
uint32_t | col | ||
) | [static] |
Skips a buffer pointer ahead to the specified text column.
Skips ahead col columns from the current column. Stops if a newline character is encountered.
buf_in_out | Buffer pointer to skip to the specified column. |
buf_len | Number of buffer elements to potentially skip through. |
col | Column to skip to. |
static const char* read_txt_column | ( | const char * | src, |
uint32_t | src_len, | ||
char * | dst, | ||
uint32_t | dst_len | ||
) | [static] |
Reads the current text column from a source buffer into a destination and returns a pointer to the next column.
src | Buffer to read from. |
src_len | Length of the source buffer. |
dst | Buffer to read into. |
dst_len | Length of the destination buffer. |
static uint8_t skip_line | ( | const char ** | buf_in_out, |
const char * | buf, | ||
uint32_t | buf_len | ||
) | [static] |
static Error* get_num_samples_from_txt_csv | ( | uint32_t * | num_samples_out, |
const char * | file_buf, | ||
uint32_t | file_len | ||
) | [static] |
static Error* read_ids_from_txt_csv | ( | Matblock_u8 ** | ids_out, |
const char * | file_buf, | ||
uint32_t | file_len, | ||
uint32_t | num_samples, | ||
uint8_t | aux | ||
) | [static] |
Reads the sample IDs from a file buffer.
Column count begins with 1. If id_cols is zero, this function is a NOOP.
ids_out | Result parameter. |
file_buf | File buffer containing the data file. |
file_len | Length of the file buffer. |
num_samples | Number of samples (rows) in the file buffer. |
aux | Whether this is an auxiliary file. |
static Error* read_labels_from_txt_csv | ( | Vector_u32 ** | labels_out, |
const char * | file_buf, | ||
uint32_t | file_len, | ||
uint32_t | num_samples, | ||
uint8_t | aux | ||
) | [static] |
Reads the haplo group labels from a file buffer.
Column count begins with 1. If label_col is set to zero, this function is a NOOP.
labels_out | Result parameter. |
file_buf | File buffer containing the data file. |
file_len | Length of the file buffer. |
num_samples | Number of samples (rows) in the file buffer. |
aux | Whether this is an auxiliary file. |
static Error* read_markers_from_txt_csv | ( | Matrix_i32 ** | markers_out, |
const char * | file_buf, | ||
uint32_t | file_len, | ||
uint32_t | num_samples, | ||
uint8_t | aux | ||
) | [static] |
Reads the marker data from a file.
Column count begins with 1.
markers_out | Result parameter. |
file_buf | File buffer containing the data file. |
file_len | Length of the file buffer. |
num_samples | Number of samples (rows) in the file buffer. |
aux | Whether this is an auxiliary file. |
static Error* read_input_from_txt_csv | ( | Matblock_u8 ** | ids_out, |
Vector_u32 ** | labels_out, | ||
Matrix_i32 ** | markers_out, | ||
const char * | file_buf, | ||
uint32_t | file_len, | ||
uint8_t | aux | ||
) | [static] |
static uint32_t get_num_samples_from_xml | ( | xmlDoc * | doc | ) | [static] |
static Error* read_ids_from_xml | ( | Matblock_u8 ** | ids_out, |
xmlDoc * | doc, | ||
uint32_t | num_samples, | ||
uint8_t | aux | ||
) | [static] |
static Error* read_labels_from_xml | ( | Vector_u32 ** | labels_out, |
xmlDoc * | doc, | ||
uint32_t | num_samples, | ||
uint8_t | aux | ||
) | [static] |
static Error* read_markers_from_xml | ( | Matrix_i32 ** | markers_out, |
xmlDoc * | doc, | ||
uint32_t | num_samples, | ||
uint8_t | aux | ||
) | [static] |
static Error* read_input_from_xml | ( | Matblock_u8 ** | ids_out, |
Vector_u32 ** | labels_out, | ||
Matrix_i32 ** | markers_out, | ||
const char * | file_buf, | ||
uint32_t | file_len, | ||
const char * | fname, | ||
uint8_t | aux | ||
) | [static] |
Error* read_input | ( | Matblock_u8 ** | ids_out, |
Vector_u32 ** | labels_out, | ||
Matrix_i32 ** | markers_out, | ||
const char * | fname | ||
) |
Reads the IDs, haplo labels and markers from a file.
ids_out | Result parameter. If opts.id_cols is zero *ids_out will be NULL. |
labels_out | Result parameter. If opts.labels_col is zero *labels_out will be NULL. |
markers_out | Result parameter. Will always be defined, unless there was an error reading the file. |
fname | Data file to read. |
Error* read_aux_input | ( | Matblock_u8 ** | ids_out, |
Vector_u32 ** | labels_out, | ||
Matrix_i32 ** | markers_out, | ||
const char * | fname | ||
) |
Reads the IDs, haplo labels and markers from an auxiliary file.
ids_out | Result parameter. If opts.id_cols is zero *ids_out will be NULL. |
labels_out | Result parameter. If opts.labels_col is zero *labels_out will be NULL. |
markers_out | Result parameter. Will always be defined, unless there was an error reading the file. |
fname | Data file to read. |
Error* impute_marker_from_parent_of_haplogroup_index | ( | uint32_t | haplo_group_index, |
uint32_t | marker_no, | ||
uint32_t | sample_no, | ||
Matrix_i32 * | marker_sums, | ||
Matrix_i32 * | imp_markers | ||
) |
haplo_group_index | Index of the haplogroup we look to impute with ancestor marker average. |
marker_no | Index of the missing marker to impute. |
sample_no | Number of the sample being processed from the input file. |
marker_sums | Matrix containing the computed average for each of the haplogroups. |
imp_markers | vector of markers from the source sample to be imputed. |
Error* impute_missing_markers_from_avg | ( | const Vector_u32 * | imp_labels, |
Matrix_i32 * | imp_markers, | ||
const Vector_u32 * | src_labels, | ||
const Matrix_i32 * | src_markers | ||
) |
Imputes missing marker values (zero) with their label-dependent mean value.
imp_labels | Labels to use for selecting samples to impute markers from. |
imp_markers | Marker values of zero are imputed. |
src_labels | Labels to use for selecting samples to impute markers from. |
src_markers | Markers to use for imputing values. |
Error* impute_missing_markers_from_nn | ( | Matrix_i32 * | imp_markers, |
const Matrix_i32 * | src_markers | ||
) |