Haplo Prediction
predict haplogroups
Defines | Functions
input.c File Reference

Reads a row-by-row collection of data samples from a file. More...

#include <config.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <inttypes.h>
#include <assert.h>
#include <errno.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/valid.h>
#include <jwsc/base/error.h>
#include <jwsc/vector/vector.h>
#include <jwsc/matrix/matrix.h>
#include <jwsc/matblock/matblock.h>
#include <xml.h>
#include <haplo_groups.h>
#include <options.h>
#include <input.h>

Go to the source code of this file.

Defines

#define MAX_COL_SIZE   256

Functions

static uint32_t read_block_into_buffer (char *buf, uint32_t size, FILE *fp)
 Reads a block of characters off a file stream into a buffer.
static Error * read_file_into_buffer (char **buf_out, uint32_t *buf_len_out, const char *fname)
 Opens a file stream and reads in a file into a string buffer.
static uint8_t skip_to_csv_column (const char **buf_in_out, uint32_t buf_len, uint32_t col)
 Skips a buffer pointer ahead to the specified CSV column.
static const char * read_csv_column (const char *src, uint32_t src_len, char *dst, uint32_t dst_len)
 Reads the current CSV column from a source buffer into a destination and returns a pointer to the next column.
static uint8_t skip_to_txt_column (const char **buf_in_out, uint32_t buf_len, uint32_t col)
 Skips a buffer pointer ahead to the specified text column.
static const char * read_txt_column (const char *src, uint32_t src_len, char *dst, uint32_t dst_len)
 Reads the current text column from a source buffer into a destination and returns a pointer to the next column.
static uint8_t skip_line (const char **buf_in_out, const char *buf, uint32_t buf_len)
static Error * get_num_samples_from_txt_csv (uint32_t *num_samples_out, const char *file_buf, uint32_t file_len)
 Counts the number of samples (rows) in a file buffer.
static Error * read_ids_from_txt_csv (Matblock_u8 **ids_out, const char *file_buf, uint32_t file_len, uint32_t num_samples, uint8_t aux)
 Reads the sample IDs from a file buffer.
static Error * read_labels_from_txt_csv (Vector_u32 **labels_out, const char *file_buf, uint32_t file_len, uint32_t num_samples, uint8_t aux)
 Reads the haplo group labels from a file buffer.
static Error * read_markers_from_txt_csv (Matrix_i32 **markers_out, const char *file_buf, uint32_t file_len, uint32_t num_samples, uint8_t aux)
 Reads the marker data from a file.
static Error * read_input_from_txt_csv (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *file_buf, uint32_t file_len, uint8_t aux)
 Reads file data from a text or csv file.
static uint32_t get_num_samples_from_xml (xmlDoc *doc)
 Counts the number of samples (rows) in an xml document.
static Error * read_ids_from_xml (Matblock_u8 **ids_out, xmlDoc *doc, uint32_t num_samples, uint8_t aux)
 Reads the sample ids from an xml document.
static Error * read_labels_from_xml (Vector_u32 **labels_out, xmlDoc *doc, uint32_t num_samples, uint8_t aux)
 Reads the sample labels from an xml document.
static Error * read_markers_from_xml (Matrix_i32 **markers_out, xmlDoc *doc, uint32_t num_samples, uint8_t aux)
 Reads the sample markers from an xml document.
static Error * read_input_from_xml (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *file_buf, uint32_t file_len, const char *fname, uint8_t aux)
 Reads the input data structures from an xml file in a buffer.
Error * read_input (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *fname)
 Reads the IDs, haplo labels and markers from a file.
Error * read_aux_input (Matblock_u8 **ids_out, Vector_u32 **labels_out, Matrix_i32 **markers_out, const char *fname)
 Reads the IDs, haplo labels and markers from an auxiliary file.
Error * impute_marker_from_parent_of_haplogroup_index (uint32_t haplo_group_index, uint32_t marker_no, uint32_t sample_no, Matrix_i32 *marker_sums, Matrix_i32 *imp_markers)
Error * impute_missing_markers_from_avg (const Vector_u32 *imp_labels, Matrix_i32 *imp_markers, const Vector_u32 *src_labels, const Matrix_i32 *src_markers)
 Imputes missing marker values (zero) with their label-dependent mean value.
Error * impute_missing_markers_from_nn (Matrix_i32 *imp_markers, const Matrix_i32 *src_markers)
 Imputes missing marker values (zero) from nearest neighbor interpolation.

Detailed Description

Reads a row-by-row collection of data samples from a file.

Author:
Joseph Schlecht
License:
Creative Commons BY-NC-SA 3.0

Definition in file input.c.


Define Documentation

#define MAX_COL_SIZE   256

Definition at line 75 of file input.c.


Function Documentation

static uint32_t read_block_into_buffer ( char *  buf,
uint32_t  size,
FILE *  fp 
) [static]

Reads a block of characters off a file stream into a buffer.

Assumes that size space is available in buf.

Definition at line 83 of file input.c.

static Error* read_file_into_buffer ( char **  buf_out,
uint32_t *  buf_len_out,
const char *  fname 
) [static]

Opens a file stream and reads in a file into a string buffer.

Definition at line 104 of file input.c.

static uint8_t skip_to_csv_column ( const char **  buf_in_out,
uint32_t  buf_len,
uint32_t  col 
) [static]

Skips a buffer pointer ahead to the specified CSV column.

Skips ahead col columns from the current column. Stops if a newline character is encountered.

Parameters:
buf_in_out: Buffer pointer to skip to the specified column.
buf_len: Number of buffer elements to potentially skip through.
col: Column to skip to.
Returns:
0 if the buffer does not have enough columns to support the operation; 1 otherwise.

Definition at line 175 of file input.c.

static const char* read_csv_column ( const char *  src,
uint32_t  src_len,
char *  dst,
uint32_t  dst_len 
) [static]

Reads the current CSV column from a source buffer into a destination and returns a pointer to the next column.

Parameters:
src: Buffer to read from.
src_len: Length of the source buffer.
dst: Buffer to read into.
dst_len: Length of the destination buffer.
Returns:
A pointer to the next CSV column, or NULL if there isn't one.

Definition at line 224 of file input.c.

static uint8_t skip_to_txt_column ( const char **  buf_in_out,
uint32_t  buf_len,
uint32_t  col 
) [static]

Skips a buffer pointer ahead to the specified text column.

Skips ahead col columns from the current column. Stops if a newline character is encountered.

Parameters:
buf_in_out: Buffer pointer to skip to the specified column.
buf_len: Number of buffer elements to potentially skip through.
col: Column to skip to.
Returns:
0 if the buffer does not have enough columns to support the operation; 1 otherwise.

Definition at line 276 of file input.c.

static const char* read_txt_column ( const char *  src,
uint32_t  src_len,
char *  dst,
uint32_t  dst_len 
) [static]

Reads the current text column from a source buffer into a destination and returns a pointer to the next column.

Parameters:
src: Buffer to read from.
src_len: Length of the source buffer.
dst: Buffer to read into.
dst_len: Length of the destination buffer.
Returns:
A pointer to the next text column, or NULL if there isn't one.

Definition at line 330 of file input.c.

static uint8_t skip_line ( const char **  buf_in_out,
const char *  buf,
uint32_t  buf_len 
) [static]

Skips the file pointer ahead, one position past the newline character '\n'.

Returns:
0 if *buf_in_out points beyond the buffer after skipping a line; 1 otherwise.

Definition at line 375 of file input.c.

static Error* get_num_samples_from_txt_csv ( uint32_t *  num_samples_out,
const char *  file_buf,
uint32_t  file_len 
) [static]

Counts the number of samples (rows) in a file buffer.

Definition at line 399 of file input.c.

static Error* read_ids_from_txt_csv ( Matblock_u8 **  ids_out,
const char *  file_buf,
uint32_t  file_len,
uint32_t  num_samples,
uint8_t  aux 
) [static]

Reads the sample IDs from a file buffer.

Column count begins with 1. If id_cols is zero, this function is a NOOP.

Parameters:
ids_out: Result parameter.
file_buf: File buffer containing the data file.
file_len: Length of the file buffer.
num_samples: Number of samples (rows) in the file buffer.
aux: Whether this is an auxiliary file.

Definition at line 442 of file input.c.

static Error* read_labels_from_txt_csv ( Vector_u32 **  labels_out,
const char *  file_buf,
uint32_t  file_len,
uint32_t  num_samples,
uint8_t  aux 
) [static]

Reads the haplo group labels from a file buffer.

Column count begins with 1. If label_col is set to zero, this function is a NOOP.

Parameters:
labels_out: Result parameter.
file_buf: File buffer containing the data file.
file_len: Length of the file buffer.
num_samples: Number of samples (rows) in the file buffer.
aux: Whether this is an auxiliary file.

Definition at line 534 of file input.c.

static Error* read_markers_from_txt_csv ( Matrix_i32 **  markers_out,
const char *  file_buf,
uint32_t  file_len,
uint32_t  num_samples,
uint8_t  aux 
) [static]

Reads the marker data from a file.

Column count begins with 1.

Parameters:
markers_out: Result parameter.
file_buf: File buffer containing the data file.
file_len: Length of the file buffer.
num_samples: Number of samples (rows) in the file buffer.
aux: Whether this is an auxiliary file.

Definition at line 620 of file input.c.

static Error* read_input_from_txt_csv ( Matblock_u8 **  ids_out,
Vector_u32 **  labels_out,
Matrix_i32 **  markers_out,
const char *  file_buf,
uint32_t  file_len,
uint8_t  aux 
) [static]

Reads file data from a text or csv file.

Definition at line 716 of file input.c.

static uint32_t get_num_samples_from_xml ( xmlDoc *  doc ) [static]

Counts the number of samples (rows) in an xml document.

Definition at line 756 of file input.c.

static Error* read_ids_from_xml ( Matblock_u8 **  ids_out,
xmlDoc *  doc,
uint32_t  num_samples,
uint8_t  aux 
) [static]

Reads the sample ids from an xml document.

Definition at line 778 of file input.c.

static Error* read_labels_from_xml ( Vector_u32 **  labels_out,
xmlDoc *  doc,
uint32_t  num_samples,
uint8_t  aux 
) [static]

Reads the sample labels from an xml document.

Definition at line 841 of file input.c.

static Error* read_markers_from_xml ( Matrix_i32 **  markers_out,
xmlDoc *  doc,
uint32_t  num_samples,
uint8_t  aux 
) [static]

Reads the sample markers from an xml document.

Definition at line 903 of file input.c.

static Error* read_input_from_xml ( Matblock_u8 **  ids_out,
Vector_u32 **  labels_out,
Matrix_i32 **  markers_out,
const char *  file_buf,
uint32_t  file_len,
const char *  fname,
uint8_t  aux 
) [static]

Reads the input data structures from an xml file in a buffer.

Definition at line 974 of file input.c.

Error* read_input ( Matblock_u8 **  ids_out,
Vector_u32 **  labels_out,
Matrix_i32 **  markers_out,
const char *  fname 
)

Reads the IDs, haplo labels and markers from a file.

Parameters:
ids_out: Result parameter. If opts.id_cols is zero, *ids_out will be NULL.
labels_out: Result parameter. If opts.labels_col is zero, *labels_out will be NULL.
markers_out: Result parameter. Will always be defined, unless there was an error reading the file.
fname: Data file to read.

Definition at line 1054 of file input.c.

Error* read_aux_input ( Matblock_u8 **  ids_out,
Vector_u32 **  labels_out,
Matrix_i32 **  markers_out,
const char *  fname 
)

Reads the IDs, haplo labels and markers from an auxiliary file.

Parameters:
ids_out: Result parameter. If opts.id_cols is zero, *ids_out will be NULL.
labels_out: Result parameter. If opts.labels_col is zero, *labels_out will be NULL.
markers_out: Result parameter. Will always be defined, unless there was an error reading the file.
fname: Data file to read.

Definition at line 1109 of file input.c.

Error* impute_marker_from_parent_of_haplogroup_index ( uint32_t  haplo_group_index,
uint32_t  marker_no,
uint32_t  sample_no,
Matrix_i32 *  marker_sums,
Matrix_i32 *  imp_markers 
)
Parameters:
haplo_group_index: Index of the haplogroup whose missing marker is to be imputed with an ancestor's marker average.
marker_no: Index of the missing marker to impute.
sample_no: Number of the sample being processed from the input file.
marker_sums: Matrix containing the computed average for each of the haplogroups.
imp_markers: Vector of markers from the source sample to be imputed.
Note:
This function keeps looking for a parent with a value different from zero; if it doesn't find any, it fails, and this sample will have to be removed.

Definition at line 1167 of file input.c.

Error* impute_missing_markers_from_avg ( const Vector_u32 *  imp_labels,
Matrix_i32 *  imp_markers,
const Vector_u32 *  src_labels,
const Matrix_i32 *  src_markers 
)

Imputes missing marker values (zero) with their label-dependent mean value.

Parameters:
imp_labels: Labels to use for selecting samples to impute markers from.
imp_markers: Marker values of zero are imputed.
src_labels: Labels to use for selecting samples to impute markers from.
src_markers: Markers to use for imputing values.
Note:
The imputed and source sets can be the same.

Definition at line 1208 of file input.c.

Error* impute_missing_markers_from_nn ( Matrix_i32 *  imp_markers,
const Matrix_i32 *  src_markers 
)

Imputes missing marker values (zero) from nearest neighbor interpolation.

Parameters:
imp_markers: Marker values of zero are imputed.
src_markers: Markers to use for imputing values.
Note:
The imputed and source sets can be the same.

Definition at line 1301 of file input.c.