Skip to content


Mods to accommodate core changes to DB.[ch] utility. Small documentat…
Browse files Browse the repository at this point in the history
…ion correction.
  • Loading branch information
thegenemyers committed May 6, 2018
1 parent 2f51ccb commit 8bd4693
Show file tree
Hide file tree
Showing 8 changed files with 679 additions and 175 deletions.
498 changes: 421 additions & 77 deletions DB.c

Large diffs are not rendered by default.

230 changes: 195 additions & 35 deletions DB.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@

#ifndef _HITS_DB
#ifndef _DAZZ_DB

#define _HITS_DB
#define _DAZZ_DB

#include <stdio.h>

Expand Down Expand Up @@ -59,6 +59,8 @@ typedef signed long long int64;
typedef float float32;
typedef double float64;

#define LAST_READ_SYMBOL '$'
#define BLOCK_SYMBOL '@'

Expand All @@ -74,11 +76,6 @@ extern char Ebuffer[];


#define SYSTEM_ERROR \
{ EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \
exit (2); \

#define ARG_INIT(name) \
Prog_Name = Strdup(name,""); \
for (i = 0; i < 128; i++) \
Expand Down Expand Up @@ -125,6 +122,108 @@ extern char Ebuffer[];
exit (1); \


// Utilities

int Count_Args(char *arg);

{ fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \
exit (2); \

{ fprintf(stderr,"%s: System error, write failed!\n",Prog_Name); \
exit (2); \

{ fprintf(stderr,"%s: System error, file close failed!\n",Prog_Name); \
exit (2); \

// Output

#define FWRITE(v,s,n,file) \
{ if (fwrite(v,s,n,file) != (size_t) n) \

#define FPRINTF(file,...) \
{ if (fprintf(file,__VA_ARGS__) < 0) \

#define PRINTF(...) \
{ if (printf(__VA_ARGS__) < 0) \

#define FPUTS(x,file) \
{ if (fputs(x,file) == EOF) \

// Close

#define FCLOSE(file) \
{ if (fclose(file) != 0) \

// Input

#define FREAD(v,s,n,file) \
{ if (fread(v,s,n,file) != (size_t) n) \
{ if (ferror(file)) \
else \
{ fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
exit (1); \
} \
} \

#define FSCANF(file,...) \
{ if (fscanf(file,__VA_ARGS__) != Count_Args(#__VA_ARGS__)-1) \
{ if (ferror(file)) \
else \
{ fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
exit (1); \
} \
} \

#define FGETS(v,n,file) \
{ if (fgets(v,n,file) == NULL) \
{ if (ferror(file)) \
else \
{ fprintf(stderr,"%s: The file %s is corrupted\n",Prog_Name,file ## _name); \
exit (1); \
} \
} \

#define FSEEKO(file,p,d) \
{ if (fseeko(file,p,d) < 0) \

#define FTELLO(file) \
( { int x = ftello(file); \
if (x < 0) \
; x; \
} )

Expand Down Expand Up @@ -193,7 +292,7 @@ typedef struct
// Offset (in bytes) of scaffold header string in '.hdr' file (DAM)
// 4 compressed shorts containing snr info if an arrow DB.
int flags; // QV of read + flags above (DB only)

// A track can be of 3 types:
// data == NULL: there are nreads 'anno' records of size 'size'.
Expand All @@ -208,9 +307,31 @@ typedef struct _track
int size; // Size in bytes of anno records
void *anno; // over [0,nreads]: read i annotation: int, int64, or 'size' records
void *data; // data[anno[i] .. anno[i+1]-1] is data if data != NULL

// The trailing part of a .anno track file can contain meta-information produced by the
// command that produced the track. For example, the coverage, or good/bad parameters
// for trimming, or even say a histogram of QV values. Each item is an array of 'nelem'
// 64-bit ints or floats ('vtype' = DB_INT or DB_REAL), has a 'name' string that
// describes it, and an indicator as to whether the values should be equal across all
// block tracks, or summed across all block tracks (by Catrack). 'value' points at the
// array of values.

// The information for accessing QV streams is in a HITS_QV record that is a "pseudo-track"
#define DB_INT 0
#define DB_REAL 1

#define DB_EXACT 0
#define DB_SUM 1

typedef struct
{ int vtype; // INT64 or FLOAT64
int nelem; // >= 1
int accum; // EXACT, SUM
char *name;
void *value;

// The information for accessing QV streams is in a DAZZ_QV record that is a "pseudo-track"
// named ".@qvs" and is always the first track record in the list (if present). Since normal
// track names cannot begin with a . (this is enforced), this pseudo-track is never confused
// with a normal track.
Expand All @@ -223,11 +344,11 @@ typedef struct
uint16 *table; // for i in [0,db->nreads-1]: read i should be decompressed with
// scheme coding[table[i]]
FILE *quiva; // the open file pointer to the .qvs file

// The DB record holds all information about the current state of an active DB including an
// array of HITS_READS, one per read, and a linked list of HITS_TRACKs the first of which
// is always a HITS_QV pseudo-track (if the QVs have been loaded).
// array of DAZZ_READS, one per read, and a linked list of DAZZ_TRACKs the first of which
// is always a DAZZ_QV pseudo-track (if the QVs have been loaded).

typedef struct
{ int ureads; // Total number of reads in untrimmed DB
Expand Down Expand Up @@ -257,9 +378,9 @@ typedef struct
int loaded; // Are reads loaded in memory?
void *bases; // file pointer for bases file (to fetch reads from),
// or memory pointer to uncompressed block of all sequences.
HITS_READ *reads; // Array [-1..nreads] of HITS_READ
HITS_TRACK *tracks; // Linked list of loaded tracks
DAZZ_READ *reads; // Array [-1..nreads] of DAZZ_READ
DAZZ_TRACK *tracks; // Linked list of loaded tracks

Expand Down Expand Up @@ -294,42 +415,42 @@ typedef struct
// contain N-separated contigs), and .fpulse the first base of the contig in the
// fasta entry

// Open the given database or dam, "path" into the supplied HITS_DB record "db". If the name has
// Open the given database or dam, "path" into the supplied DAZZ_DB record "db". If the name has
// a part # in it then just the part is opened. The index array is allocated (for all or
// just the part) and read in.
// Return status of routine:
// -1: The DB could not be opened for a reason reported by the routine to EPLACE
// 0: Open of DB proceeded without mishap
// 1: Open of DAM proceeded without mishap

int Open_DB(char *path, HITS_DB *db);
int Open_DB(char *path, DAZZ_DB *db);

// Trim the DB or part thereof and all loaded tracks according to the cutoff and all settings
// of the current DB partition. Reallocate smaller memory blocks for the information kept
// for the retained reads.

void Trim_DB(HITS_DB *db);
void Trim_DB(DAZZ_DB *db);

// Shut down an open 'db' by freeing all associated space, including tracks and QV structures,
// and any open file pointers. The record pointed at by db however remains (the user
// supplied it and so should free it).

void Close_DB(HITS_DB *db);
void Close_DB(DAZZ_DB *db);

// Return the size in bytes of the given DB

int64 sizeof_DB(HITS_DB *db);
int64 sizeof_DB(DAZZ_DB *db);

// If QV pseudo track is not already in db's track list, then load it and set it up.
// The database must not have been trimmed yet. -1 is returned if a .qvs file is not
// present, and 1 is returned if an error (reported to EPLACE) occurred and INTERACTIVE
// is defined. Otherwise a 0 is returned.

int Load_QVs(HITS_DB *db);
int Load_QVs(DAZZ_DB *db);

// Remove the QV pseudo track, all space associated with it, and close the .qvs file.

void Close_QVs(HITS_DB *db);
void Close_QVs(DAZZ_DB *db);

// Look up the file and header in the file of the indicated track. Return:
// 1: Track is for trimmed DB
Expand All @@ -344,41 +465,60 @@ void Close_QVs(HITS_DB *db);
#define CUSTOM_TRACK 0
#define MASK_TRACK 1

int Check_Track(HITS_DB *db, char *track, int *kind);
int Check_Track(DAZZ_DB *db, char *track, int *kind);

// If track is not already in the db's track list, then allocate all the storage for it,
// read it in from the appropriate file, add it to the track list, and return a pointer
// to the newly created HITS_TRACK record. If the track does not exist or cannot be
// to the newly created DAZZ_TRACK record. If the track does not exist or cannot be
// opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise
// the routine prints an error message to stderr and exits if an error occurs, and returns
// with NULL only if the track does not exist.

HITS_TRACK *Load_Track(HITS_DB *db, char *track);
DAZZ_TRACK *Load_Track(DAZZ_DB *db, char *track);

// Assuming the file pointer for afile is correctly positioned at the start of an extra item,
// and aname is the name of the .anno file, decode the value present and place it in
// extra if extra->nelem == 0, otherwise reduce the value just read into extra
// according to the directive given by 'accum'. Leave the read pointer at the next
// extra or end-of-file.
// Returns:
// 1 if at the end of file,
// 0 if item was read and folded correctly,
// -1 if there was a system IO or allocation error (if interactive), and
// -2 if the new value could not be reduced into the current value of extra (interactive)

int Read_Extra(FILE *afile, char *aname, DAZZ_EXTRA *extra);

// Write extra record to end of file afile and advance write pointer.
// If interactive, then return non-zero on error; if batch, then print
// and halt if an error occurs.

int Write_Extra(FILE *afile, DAZZ_EXTRA *extra);

// If track is on the db's track list, then it is removed and all storage associated with it
// is freed.

void Close_Track(HITS_DB *db, char *track);
void Close_Track(DAZZ_DB *db, char *track);

// Allocate and return a buffer big enough for the largest read in 'db'.
// **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte
// are needed by the alignment algorithms. If cannot allocate memory then return NULL
// if INTERACTIVE is defined, or print error to stderr and exit otherwise.

char *New_Read_Buffer(HITS_DB *db);
char *New_Read_Buffer(DAZZ_DB *db);

// Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an
// upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T)
// otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimiter
// for traversals in either direction. A non-zero value is returned if an error occurred
// and INTERACTIVE is defined.

int Load_Read(HITS_DB *db, int i, char *read, int ascii);
int Load_Read(DAZZ_DB *db, int i, char *read, int ascii);

// Exactly the same as Load_Read, save the arrow information is loaded, not the DNA sequence,
// and there is only a choice between numeric (0) or ascii (1);

int Load_Arrow(HITS_DB *db, int i, char *read, int ascii);
int Load_Arrow(DAZZ_DB *db, int i, char *read, int ascii);

// Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the
// start of the subinterval (not necessarily = to read !!! ). As a lower case ascii
Expand All @@ -387,7 +527,7 @@ int Load_Arrow(HITS_DB *db, int i, char *read, int ascii);
// the string holding the substring so it has a delimiter for traversals in either direction.
// A NULL pointer is returned if an error occurred and INTERACTIVE is defined.

char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);
char *Load_Subread(DAZZ_DB *db, int i, int beg, int end, char *read, int ascii);

// Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur
// in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined,
Expand All @@ -399,13 +539,13 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii);
#define SUB_QV 3 // The substitution QVs
#define MRG_QV 4 // The merge QVs

char **New_QV_Buffer(HITS_DB *db);
char **New_QV_Buffer(DAZZ_DB *db);

// Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters
// are converted to a numeric or upper/lower case ascii string as per ascii. Return with
// a zero, except when an error occurs and INTERACTIVE is defined in which case return with 1.

int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);
int Load_QVentry(DAZZ_DB *db, int i, char **entry, int ascii);

// Allocate a block big enough for all the uncompressed sequences, read them into it,
// reset the 'off' in each read record to be its in-memory offset, and set the
Expand All @@ -415,7 +555,7 @@ int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii);
// Return with a zero, except when an error occurs and INTERACTIVE is defined in which
// case return with 1.

int Read_All_Sequences(HITS_DB *db, int ascii);
int Read_All_Sequences(DAZZ_DB *db, int ascii);

// For the DB or DAM "path" = "prefix/root.[db|dam]", find all the files for that DB, i.e. all
// those of the form "prefix/[.]root.part" and call actor with the complete path to each file
Expand All @@ -429,4 +569,24 @@ int Read_All_Sequences(HITS_DB *db, int ascii);

int List_DB_Files(char *path, void actor(char *path, char *extension));

#endif // _HITS_DB
// Take a command line argument and interpret the '@' block number ranges.
// Parse_Block_Arg produces a Block_Looper iterator object that can then
// be invoked multiple times to iterate through all the files implied by
// the @ pattern/range. Next_Block_Slice returns a string encoding the next
// slice files represented by an @-notation, and advances the iterator by
// that many files.

// Opaque iterator handle: the type is void, so callers can only hold a pointer to it.
typedef void Block_Looper;

// Build a Block_Looper iterator from the command line argument 'arg'.
Block_Looper *Parse_Block_Arg(char *arg);

// NOTE(review): presumably advances to the next file and returns it opened -- the
//   end-of-iteration return value is not visible here, confirm against DB.c.
FILE *Next_Block_Arg(Block_Looper *e_parse);

// Return a string for the next 'slice' files of the iterator and advance past them.
char *Next_Block_Slice(Block_Looper *e_parse,int slice);

void Reset_Block_Arg(Block_Looper *e_parse); // Reset iterator to first file
char *Block_Arg_Path(Block_Looper *e_parse); // Path of current file
char *Block_Arg_Root(Block_Looper *e_parse); // Root name of current file
void Free_Block_Arg(Block_Looper *e_parse); // Free the iterator

#endif // _DAZZ_DB
2 changes: 1 addition & 1 deletion
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ a header that contains the movie name and the 4 channel SNR values.
3. (-q) a FASTQ-like .quiva file containing for each subread the same header as the
.fasta file above, save that it starts with an @-sign, followed by the 5 quality
value streams used by Quiver, one per line, where the order of the streams is:
deletion QVs, deletion Tags, insertion QVs, substitution QVs, and last merge QVs.
deletion QVs, deletion Tags, insertion QVs, merge QVs, and last substitution QVs.

If the -v option is set then the program reports the processing of each PacBio input
file, otherwise it runs silently. If none of the -f, -a, or -q flags is set, then by
Expand Down

0 comments on commit 8bd4693

Please sign in to comment.