-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathONElib.h
410 lines (331 loc) · 18.7 KB
/
ONElib.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
/******************************************************************************************
*
* File: ONElib.h
* Header for ONE file reading and writing
*
* Authors: Richard Durbin ([email protected]), Gene Myers ([email protected])
* Copyright (C) Richard Durbin, Gene Myers, 2019-
*
* HISTORY:
* Last edited: Dec 3 06:08 2022 (rd109)
* * Dec 3 06:01 2022 (rd109): remove oneWriteHeader(), switch to stdarg for oneWriteComment etc.
* * Dec 27 09:46 2019 (gene): style edits
* * Created: Sat Feb 23 10:12:43 2019 (rd109)
*
*****************************************************************************************/
#ifndef ONE_DEFINED
#define ONE_DEFINED
#include <stdio.h> // for FILE etc.
#include <stdarg.h> // for formatted writing in oneWriteComment(), oneAddProvenance()
#include <string.h> // for strlen(), used by the oneNextString() macro
#include <inttypes.h> // for standard size int types and their PRI print macros
#include <stdbool.h> // for standard bool types
#include <limits.h> // for INT_MAX etc.
#include <pthread.h> // for pthread_mutex_t, used in OneFile
/***********************************************************************************
*
* DATA TYPES
*
**********************************************************************************/
// Basic Types
#ifndef U8_DEFINED
#define U8_DEFINED
// Short aliases for the fixed-width integer types used throughout the package.
// The U8_DEFINED guard lets another header supply the same aliases without a clash.
typedef unsigned char U8;   // raw unsigned byte
typedef int8_t        I8;   // signed  8-bit
typedef int16_t       I16;  // signed 16-bit
typedef int32_t       I32;  // signed 32-bit
typedef int64_t       I64;  // signed 64-bit
#endif /* U8_DEFINED */
// The field types a line definition may contain. Numbering starts at 1, so 0 never
// names a valid type.
typedef enum
{
  oneINT         = 1,
  oneREAL        = 2,
  oneCHAR        = 3,
  oneSTRING      = 4,
  oneINT_LIST    = 5,
  oneREAL_LIST   = 6,
  oneSTRING_LIST = 7,
  oneDNA         = 8
} OneType;

// Printable name for each OneType, indexed by the enum value (slot 0 is empty):
//   { 0, "INT", "REAL", "CHAR", "STRING", "INT_LIST", "REAL_LIST", "STRING_LIST", "DNA" }
extern char *oneTypeString[];
// Storage for one field of a line; which member is meaningful depends on the field's type.
typedef union
{
  I64    i;    // integer value (read/written through oneInt())
  double r;    // real value (read/written through oneReal())
  char   c;    // character value (read/written through oneChar())
  I64    len;  // for lists: top 8 bits encode excess bytes, low 56 bits the length
} OneField;
// One provenance entry of a file header (OneFile.provenance points at an array of these).
typedef struct
{
  char *program;  // name of the program
  char *version;  // version of that program
  char *command;  // the command recorded for it
  char *date;     // date string
} OneProvenance;
// One reference or deferred entry of a file header (see OneFile.reference / OneFile.deferred).
typedef struct
{
  char *filename;  // name of the file referred to
  I64   count;     // count recorded with it (cf. the 'count' argument of oneAddReference())
} OneReference;
// Count statistics kept for a line type (OneInfo holds one set accumulated while
// reading/writing and one set taken from the header).
// NOTE(review): the per-member meanings below are inferred from the names - confirm in ONElib.c.
typedef struct
{
  I64 count;       // presumably the number of lines of this type
  I64 max;         // presumably the largest list length seen
  I64 total;       // presumably the sum of list lengths
  I64 groupCount;  // per-group counterpart of count (derived via OneInfo.gCount)
  I64 groupTotal;  // per-group counterpart of total (derived via OneInfo.gTotal)
} OneCounts;
// OneCodecs are a private package for compressing binary ONE files. Users only ever
// hold pointers to codecs; the concrete type is not exposed by this header.
typedef void OneCodec; // forward declaration of opaque type for compression codecs
// DNAcodec is a special pre-existing compressor one should use for DNA.
// It compresses every base to 2 bits, where any non-ACGT letter is
// effectively converted to an A. Compression is case insensitive,
// but decompression always delivers lower-case.
extern OneCodec *DNAcodec;
// Everything the package tracks about one line type. A line type has at most one
// list field, so a single buffer / codec per record is enough.
typedef struct
{
  OneCounts  accum;           // counts read or written up to this moment
  OneCounts  given;           // counts as read from the header

  I64        gCount;          // internal: used to calculate groupCount and groupTotal
  I64        gTotal;
  I64        oCount;          // # of objects in prefix before first group (if any)
  I64        oTotal;          // companion total for that prefix; oCount and oTotal serve
                              //   thread-parallel apps. NOTE(review): original comment read
                              //   "+ of objects in prefix" - exact meaning to be confirmed.

  int        nField;          // number of fields
  OneType   *fieldType;       // type of each field
  int        listEltSize;     // size of list field elements (0 if there is no list)
  int        listField;       // field index of the list
  char      *comment;         // comment from the definition line in the schema

  bool       isUserBuf;       // true when the buffer is owned by the user
  I64        bufSize;         // size of the system buffer (when not user supplied)
  void      *buffer;          // the list buffer itself

  OneCodec  *listCodec;       // compression codec for the list
  bool       isUseListCodec;  // turned on once enough data was collected to train the codec
  char       binaryTypePack;  // binary code for the line type, high bit set (original text:
                              //   "bit 8 set"); bit 0: list compressed
  I64        listTack;        // accumulated training data for this thread's codec (master).
                              //   NOTE(review): original says "codeCodec" - presumably listCodec.
} OneInfo;
// The schema type. A schema is a linked list of these records: the first one is the
// header spec, each following one describes a primary class (file type).
typedef struct OneSchema
{
  char              *primary;     // primary file type (cf. the 'P' schema line)
  int                nSecondary;  // number of entries in 'secondary'
  char             **secondary;   // secondary file types (cf. the 'S' schema lines)
  OneInfo           *info[128];   // per-line-type records; NOTE(review): presumably indexed
                                  //   by the line's ASCII character, as in OneFile - confirm
  int                nFieldMax;   // NOTE(review): presumably max nField over info[] - confirm
  char               objectType;  // line character of the object type (cf. the 'O' line)
  char               groupType;   // line character of the group type (cf. the 'G' line)
  struct OneSchema  *nxt;         // next schema record in the list
} OneSchema;
// Node of a singly linked list of text blocks (OneFile.headerText is the list head).
typedef struct OneHeaderText
{
  char                 *text;  // the text held by this node
  struct OneHeaderText *nxt;   // following node
} OneHeaderText;
// The main OneFile type: the handle an end user holds for a file open for reading
// or writing. Members come in three groups: one user-settable flag, read-only state
// the user may inspect, and fields private to the package.
typedef struct
{
  // --- may be set by the user ---
  bool            isCheckString;          // set to validate strings char by char

  // --- may be read by the user, but must not be changed ---
  char           *fileType;
  char           *subType;
  char            lineType;               // current lineType
  char            objectType;             // line designation character for primary objects
  char            groupType;              // line designation character for groups (optional)
  I64             line;                   // current line number
  I64             byte;                   // current byte position when writing binary
  I64             object;                 // current object - incremented when an object line is read
  I64             group;                  // current group - incremented when a group line is read
  OneProvenance  *provenance;             // if non-zero then count['!'] entries
  OneReference   *reference;              // if non-zero then count['<'] entries
  OneReference   *deferred;               // if non-zero then count['>'] entries
  OneField       *field;                  // holds the current line - accessed by the macros
  OneInfo        *info[128];              // all the per-linetype information
  I64             codecTrainingSize;      // amount of data to see before building a codec

  // --- private to the package from here on ---
  FILE           *f;
  bool            isWrite;                // true if open for writing
  bool            isHeaderOut;            // true if header already written
  bool            isBinary;               // true if writing a binary file
  bool            inGroup;                // set once inside a group
  bool            isLastLineBinary;       // needed to deal with newlines on ascii files
  bool            isIndexIn;              // index read in
  bool            isBig;                  // are we on a big-endian machine?
  bool            isNoAsciiHeader;        // backdoor for ONEview to avoid writing header in ascii
  char            lineBuf[128];           // working buffers
  char            numberBuf[32];
  int             nFieldMax;
  I64             codecBufSize;
  char           *codecBuf;
  I64             nBits;                  // number of bits of list currently in codecBuf
  I64             intListBytes;           // number of bytes per integer in the compacted INT_LIST
  I64             linePos;                // current line position
  OneHeaderText  *headerText;             // arbitrary descriptive text that goes with the header
  char            binaryTypeUnpack[256];  // inverts a binary line code to its ASCII line character
  int             share;                  // index if slave of threaded write, +nthreads > 0 if master
  int             isFinal;                // oneFinalizeCounts has been called on file
  pthread_mutex_t fieldLock;              // mutexes protecting the training accumulation stats
  pthread_mutex_t listLock;               //   when threaded
} OneFile;  // NOTE(review): the original carried a dangling remark here, "the footer will
            //   be in the concatenated result" - it appears to belong to the threaded-write
            //   description; confirm and relocate.
/***********************************************************************************
*
* ROUTINES FOR READING & WRITING ONE FILES IN BOTH ASCII & BINARY (TRANSPARENTLY)
*
**********************************************************************************/
// CREATING AND DESTROYING SCHEMAS
OneSchema *oneSchemaCreateFromFile (char *path) ;
OneSchema *oneSchemaCreateFromText (char *text) ;
// These functions create a schema handle that can be used to open One-code data files
// for reading and writing. 'path' names a schema file; 'text' is the same content
// supplied as an in-memory string. A schema file is itself a One-code file, consisting of
// a set of objects, one per primary file type. Valid lines in this file are:
// P <primary file type> // a short string
// S <secondary file type> // a short string - any number of these
// O <char> <field_list> // definition of object type
// G <char> <field_list> // definition of group type - first field must be an int
// D <char> <field_list> // definition of line
// <char> must be a lower or upper case letter.
// <field_list> is a list of field types from:
// CHAR, INT, REAL, STRING, INT_LIST, REAL_LIST, STRING_LIST, DNA
// Only one list type (STRING, *_LIST or DNA) is allowed per line type.
// All the D lines following an O line apply to that object type.
// By convention comments on each line explain the definition.
// Example, with lists and strings preceded by their length in OneCode style
// P 3 seq this is a sequence file
// O S 1 3 DNA the DNA sequence - each S line starts an object
// D Q 1 6 STRING the phred encoded quality score + ASCII 33
// D N 4 4 REAL 4 REAL 4 REAL 4 REAL signal to noise ratio in A, C, G, T channels
// G g 2 3 INT 6 STRING group designator: number of objects, name
// The ...FromText() alternative writes the text to a temp file and reads it with
// oneSchemaCreateFromFile(). This allows code to set the schema.
// Internally a schema is a linked list of OneSchema objects, with the first holding
// the (hard-coded) schema for the header and footer, and the remainder each
// corresponding to one primary file type.
void oneSchemaDestroy (OneSchema *schema) ;
// Destroys a schema handle obtained from one of the create functions above.
// NOTE(review): presumably releases the whole linked list - confirm in ONElib.c.
// READING ONE FILES:
OneFile *oneFileOpenRead (const char *path, OneSchema *schema, char *type, int nthreads) ;
// Open ONE file 'path', either binary or ascii encoded, for reading.
// If the file doesn't have a header, then 'type' must be specified,
// otherwise, if 'type' is non-zero it must match the header type.
// All header information (if present) is read.
// 'schema' is also optional. If it is NULL then the file must contain its own schema.
// If 'schema' is present then it must support 'type', and if the file contains its
// own schema, then that must be a subset of the one for this type in 'schema'.
// If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
// to the first, called the master, is returned. The other nthreads-1 files are
// called slaves. The package routines are aware of when a OneFile argument is a
// slave or master in a parallel group. The master receives provenance, counts, etc.
// The slaves only read data and have the virtue of sharing indices and codecs with
// the master if relevant.
bool oneFileCheckSchema (OneFile *vf, char *textSchema) ;
// Checks if the schema of file 'vf' is consistent with the text schema 'textSchema'.
// Mismatches are reported to stderr.
// Filetype and all linetypes in text must match. File schema can contain additional linetypes.
// e.g. if (! oneFileCheckSchema (vf, "P 3 seq\nD S 1 3 DNA\nD Q 1 6 STRING\nD P 0\n")) die () ;
// As the example shows, a false result signals that the check failed.
// This is provided to enable a program to ensure that its assumptions about data layout
// are satisfied.
char oneReadLine (OneFile *vf) ;
// Read the next ONE formatted line from 'vf', returning the line type of the line, or 0
// if at the end of the data section. The content macros immediately below (oneInt,
// oneReal, oneChar, oneLen, oneString, ...) are used to access the information of the
// line most recently read.
// Helpers behind the list access macros (oneString, oneDNAchar, oneIntList, oneRealList
// expand to _oneList; oneDNA2bit expands to _oneCompressedList). Each returns the list
// of the current line as an untyped pointer that the macros cast to the element type.
void *_oneList (OneFile *vf) ; // lazy codec decompression if required
void *_oneCompressedList (OneFile *vf) ; // lazy codec compression if required
// Content macros: access the fields of the line most recently read from, or being
// assembled for writing to, OneFile *vf.
//   oneInt / oneReal / oneChar  - field x as an lvalue of the matching type
//   oneLen                      - length of the line's list (low 56 bits of the list field)
//   oneString ... oneRealList   - the line's list, cast to the element type
// Every expansion is fully parenthesized so the macros compose with postfix and binary
// operators, e.g. oneIntList(vf)[i] or oneString(vf)[0]; without the outer parentheses
// the subscript would bind to the void* call result instead of to the cast.
#define oneInt(vf,x) ((vf)->field[x].i)
#define oneReal(vf,x) ((vf)->field[x].r)
#define oneChar(vf,x) ((vf)->field[x].c)
#define _LF(vf) ((vf)->info[(int)(vf)->lineType]->listField)
#define oneLen(vf) ((vf)->field[_LF(vf)].len & 0xffffffffffffffll)
#define oneString(vf) ((char *) _oneList(vf))
#define oneDNAchar(vf) ((char *) _oneList(vf))
#define oneDNA2bit(vf) ((U8 *) _oneCompressedList(vf))
#define oneIntList(vf) ((I64 *) _oneList(vf))
#define oneRealList(vf) ((double *) _oneList(vf))
// 's' is parenthesized so that any pointer expression may be passed; note that it is
// evaluated twice, so it must not have side effects. Requires strlen() from <string.h>.
#define oneNextString(vf,s) ((s) + strlen(s) + 1)
// Access field information. The index x of a list object is not required as there is
// only one list per line, stored in ->buffer.
// A "string list" is implicitly supported, get the first string with oneString, and
// subsequent strings sequentially with oneNextString, e.g.:
//
// char *s = oneString(vf);
// for (i = 0; i < oneLen(vf); i++)
// { // do something with i'th string
// s = oneNextString(vf,s);
// }
char *oneReadComment (OneFile *vf);
// Can be called after oneReadLine() to read any optional comment text after the fixed fields.
// Returns NULL if there is no comment.
// NOTE(review): ownership and lifetime of the returned string are not stated here -
// confirm in ONElib.c before storing the pointer.
// WRITING ONE FILES:
OneFile *oneFileOpenWriteNew (const char *path, OneSchema *schema, char *type,
bool isBinary, int nthreads);
OneFile *oneFileOpenWriteFrom (const char *path, OneFile *vfIn,
bool isBinary, int nthreads);
// Create a new oneFile that will be written to 'path'. For the 'New' variant supply
// the schema, the file type in 'type', and whether it should be binary or ASCII.
// For the 'From' variant, specify binary or ASCII; schema and all other header
// information is inherited from 'vfIn'.
// NOTE(review): the original text mentioned a subtype argument and said the count stats
// come from vfIn's accumulation (assuming vfIn has been fully read or written) if
// 'useAccum' is true, and from vfIn's header otherwise. Neither prototype has a subtype
// or 'useAccum' parameter any more - confirm the current behaviour in ONElib.c.
// If nthreads > 1 then nthreads OneFiles are generated as an array and the pointer
// to the first, called the master, is returned. The other nthreads-1 files are
// called slaves. The package routines are aware of when a OneFile argument is a
// slave or master in a parallel group. The slaves are expected to only write data
// lines, with the master adding provenance, producing the header, and then some
// segment of the initial data lines. Upon close the final result is effectively
// the concatenation of the master, followed by the output of each slave in sequence.
bool oneInheritProvenance (OneFile *vf, OneFile *source);
bool oneInheritReference (OneFile *vf, OneFile *source);
bool oneInheritDeferred (OneFile *vf, OneFile *source);
// Add all provenance/reference/deferred entries in 'source' to the header of 'vf'. Must be
// called before first call to oneWriteLine.
// NOTE(review): the meaning of the bool result is not documented here - presumably
// success/failure; confirm in ONElib.c.
bool oneAddProvenance (OneFile *vf, char *prog, char *version, char *format, ...);
bool oneAddReference (OneFile *vf, char *filename, I64 count);
bool oneAddDeferred (OneFile *vf, char *filename);
// Append provenance/reference/deferred to header information. Must be called before
// first call to oneWriteLine.
// oneAddProvenance takes a printf-style 'format' plus arguments (handled via stdarg).
// NOTE(review): presumably the formatted text becomes the entry's command string - confirm.
// For ASCII output, if you want the header to contain count information then you must
// create and fill the relevant OneCounts objects before the first call to oneWriteLine.
// For BINARY output, the OneCounts information is accumulated and written automatically.
void oneWriteLine (OneFile *vf, char lineType, I64 listLen, void *listBuf);
// Set up a line for output just as it would be returned by oneReadLine and then call
// this routine to output the line (ASCII or binary).
// Use the macros above on the l.h.s. of assignments to fill fields (e.g. oneInt(vf,2) = 3).
// For lists, give the length in the listLen argument, and either place the list data in your
// own buffer and give it as listBuf, or put in the line's buffer and set listBuf == NULL.
//   vf       - file opened for writing
//   lineType - line type character of the line to write
//   listLen  - length of the list, for line types that have one
//   listBuf  - caller's list data, or NULL to use the line's own buffer
void oneWriteLineFrom (OneFile *vf, OneFile *source) ; // copies a line from source into vf
void oneWriteLineDNA2bit (OneFile *vf, char lineType, I64 listLen, U8 *dnaBuf);
// Minor variants of oneWriteLine().
// Use oneWriteLineDNA2bit for DNA lists if your DNA is already 2-bit compressed; 'dnaBuf'
// then holds the packed bases.
// NOTE(review): presumably 'listLen' is still the number of bases, not bytes - confirm.
void oneWriteComment (OneFile *vf, char *format, ...); // can not include newline \n chars
// Adds a comment to the current line, built from the printf-style 'format' and its arguments.
// Extends line in ascii, adds special line type in binary.
// CLOSING FILES (FOR BOTH READ & WRITE)
void oneFileClose (OneFile *vf);
// Close vf (opened either for reading or writing). Finalizes counts, merges threaded files,
// and writes footer if binary. Frees all non-user memory associated with vf.
// GOTO & BUFFER MANAGEMENT
void oneUserBuffer (OneFile *vf, char lineType, void *buffer);
// A buffer is used to capture the list element of each line type that has one.
// This routine allows you to reassign the buffer for 'lineType' to one you've allocated,
// or to revert to a default system buffer if 'buffer' = NULL. The previous buffer
// (if any) is freed. The user must ensure that a buffer they supply is large
// enough. BTW, this buffer is overwritten with each new line read of the given type.
bool oneGotoObject (OneFile *vf, I64 i);
// Goto i'th object in the file. This only works on binary files, which have an index.
// NOTE(review): presumably returns false when the move cannot be made - confirm in ONElib.c.
I64 oneGotoGroup (OneFile *vf, I64 i);
// Goto the first object in group i. Return the size (in objects) of the group, or 0
// on error (i out of range or vf has no group type). Only works for binary files.
/***********************************************************************************
*
* A BIT ABOUT THE FORMAT OF BINARY FILES
*
**********************************************************************************/
// <bin file> <- <ASCII Prolog> <$-line> <binary data> <footer> <^-line> <footer-size:int64>
//
// '$'-line flags file is binary and gives endian
// The data block ends with a blank line consisting of '\n'
//
// EWM: Removed '-' line, simply write off_t to footer start
//
// <ASCII Prolog> <- <'1'-line> [<'2'-line>] ( <'!'-line> | <'<'-line> | <'>'-line> )*
//
// The ASCII prolog contains the type, subtype, provenance, reference, and deferred lines
// in the ASCII format. The ONE count statistic lines for each data line type are found
// in the footer along with binary ';' and ':' lines that encode their compressors as
// needed. The footer also contains binary '&' and '*' lines that encode the object index
// and group indices, respectively.
//
// <Binary line> <- <Binary line code + tags> <fields> [<list data>]
//
// Line codes are >= 128 for binary encoded lines. The two low-order bits of these are flags,
// so each binary-encoded line type has 4 codes and a table maps these to the ASCII code.
// Bit 0 indicates if the fields of the line type are compressed, and Bit 1 indicates if
// the list data (if present) is compressed.
//
// If a field is a list, then the field array element for that field is the list's length
// where the low 56 bits encode length, and the high 8 bits encode the # of high-order
// 0-bytes in every list element if an INT_LIST (0 otherwise).
#endif // ONE_DEFINED
/******************* end of file **************/