Skip to content

Commit

Permalink
implementation to vcf/bed parser
Browse files Browse the repository at this point in the history
  • Loading branch information
zwdzwd committed Apr 26, 2017
1 parent f22fe93 commit 1ff8662
Show file tree
Hide file tree
Showing 9 changed files with 252 additions and 191 deletions.
20 changes: 0 additions & 20 deletions wstr.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

#include <stdio.h>
#include <stdarg.h>
#include <ctype.h>
#include "kstring.h"

#define kstring_init(s) kstring_t (s); (s).s=0; (s).m=(s).l=0;
Expand All @@ -29,23 +28,4 @@ wasprintf(const char *fmt, ...) {
return s;
}

/* Count how many bytes of the NUL-terminated string `s` are equal to `c`.
 * The terminating NUL itself is never counted. */
static inline int strcount_char(char *s, char c) {
  int hits = 0;
  char *p;
  for (p = s; *p != '\0'; ++p)
    hits += (*p == c);
  return hits;
}

/* Abort the program unless every byte of `s` is a decimal digit or '.'.
 *
 * An empty string passes. Signs, exponents and whitespace are rejected, and
 * the number of '.' characters is not checked. On the first offending byte a
 * message is written to stderr and the process exits with status 1. */
static inline void ensure_number(char *s) {
  int i;
  for (i = 0; s[i]; ++i) {
    /* <ctype.h> classifiers require a value representable as unsigned char
     * (or EOF); a plain char may be negative for bytes >= 0x80. */
    if (!isdigit((unsigned char) s[i]) && s[i] != '.') {
      fprintf(stderr, "[%s:%d] Trying to convert nondigit string to number: %s\n", __func__, __LINE__, s);
      fflush(stderr);
      exit(1);
    }
  }
}

#endif /* _WZ_STR_H_ */
2 changes: 1 addition & 1 deletion wstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#define swap_tmp(a, b, t) { (t) = (a); (a) = (b); (b) = (t); }
#endif

/* Heap-backed C string with a recorded size.
 * The struct carries a tag so it can be forward-declared as
 * `struct wstring_t` by code that only needs a pointer to it. */
typedef struct wstring_t {
  char *s;                      /* \0-ended string */
  size_t cap;                   /* presumably the allocated size of `s` — TODO confirm */
} wstring_t;
Expand Down
2 changes: 1 addition & 1 deletion wvec.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,6 @@ DEFINE_NATIVE_VECTOR(int32_v, int32_t);
DEFINE_NATIVE_VECTOR(int64_v, int64_t);
#define int32_v int_v

DEFINE_VECTOR(vpvector, void*);
/* DEFINE_VECTOR(vpvector, void*); */

#endif
257 changes: 133 additions & 124 deletions wzbed.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,23 @@
/* This file defines several bed file parsers */

/* Bed file for methylation data */
/* Reader state for a methylation bed file.
 * bed_close() closes FH; free_bed() frees chrm, nextchrom.s and the struct
 * itself. */
typedef struct {
char *bed; /* presumably the bed file path — TODO confirm; not freed by free_bed() */
gzFile FH; /* zlib stream handle, closed by bed_close() */
char *chrm; /* heap string freed by free_bed(); presumably the current chromosome name — confirm */
meth_obs1_t next; /* next object */
kstring_t nextchrom; /* chromosome of next object */
} methbed_t;

/* Close the gzip stream held by `m`. The result of gzclose() is discarded
 * and the struct itself is left untouched. */
static inline void bed_close(methbed_t *m) {
  gzFile handle = m->FH;
  gzclose(handle);
}
/* typedef struct { */
/* char *bed; */
/* gzFile FH; */
/* char *chrm; */
/* meth_obs1_t next; /\* next object *\/ */
/* wstring_t nextchrom; /\* chromosome of next object *\/ */
/* } methbed_t; */

/* static inline void bed_close(methbed_t *m) { */
/* gzclose(m->FH); */
/* } */

/* Release a methbed_t: its chromosome name, the buffer behind `nextchrom`,
 * and finally the struct itself. The gzip handle is not closed here. */
static inline void free_bed(methbed_t *m) {
  char *chrom = m->chrm;
  char *next_name = m->nextchrom.s;
  free(chrom);
  free(next_name);
  free(m);
}
/* static inline void free_bed(methbed_t *m) { */
/* free(m->chrm); */
/* free(m->nextchrom.s); */
/* free(m); */
/* } */


/*************************************
Expand All @@ -62,17 +62,17 @@ static inline void free_bed(methbed_t *m) {
** Bed Record **
****************/
/* One bed interval.
 * `tid` is signed so that -1 can serve as an "unset" sentinel, and the
 * coordinates are 64-bit so they can hold values that do not fit in an int. */
typedef struct bed1_t {
  int tid;                      /* target (sequence) index */
  int64_t beg;                  /* interval start; presumably 0-based — TODO confirm */
  int64_t end;                  /* interval end */
  void *data;                   /* caller-defined payload, managed through the data callbacks */
} bed1_t;

DEFINE_VECTOR(bed1_v, bed1_t)

#define init_data_f (void (*init_bed_data)(bed1_t*))
#define parse_data_f (void (*parse_data)(bed1_t*, char**, int))
#define free_data_f (void (*free_bed_data)(void*))
typedef void (*init_data_f)(bed1_t *b);
typedef void (*parse_data_f)(bed1_t *b, char **fields, int nfields);
typedef void (*free_data_f)(void *data);

static inline bed1_t *init_bed1_core(init_data_f init_data) {
bed1_t *b = calloc(1, sizeof(bed1_t));
Expand All @@ -94,21 +94,30 @@ static inline void free_bed1_core(bed1_t *b, free_data_f free_data) {
/* State of an open bed file being read record by record. */
typedef struct bed_file_t {
  char *file_path;              /* heap string, freed by free_bed_file() */
  gzFile fh;                    /* zlib handle; NULL means nothing to read */
  char *line;                   /* reusable line buffer filled by gzFile_read_line(); freed by free_bed_file() */
  target_v *targets;            /* target table, created in init_bed_file() and destroyed in free_bed_file() */
} bed_file_t;

/* Allocate a zero-filled bed_file_t with an empty line buffer and a fresh
 * target vector (initial capacity argument 2). `file_path` and `fh` are left
 * zeroed for the caller to fill in. Release with free_bed_file(). */
static inline bed_file_t *init_bed_file() {
  bed_file_t *bf = calloc(1, sizeof *bf);
  bf->line = NULL;
  bf->targets = init_target_v(2);
  return bf;
}

/* Release a bed_file_t: its target vector, path string, line buffer and the
 * struct itself. Passing NULL is a no-op, mirroring free(NULL).
 * NOTE(review): bed->fh is not closed here — confirm the caller closes it. */
static inline void free_bed_file(bed_file_t *bed) {
  if (bed == NULL) return;
  destroy_target_v(bed->targets);
  free(bed->file_path);
  free(bed->line);
  free(bed);
}

static inline int bed_read1(bed_file_t *bed, bed1_t *b, parse_data_f parse_data, free_data_f free_data) {
if (bed->fh == NULL) return 0;
if (gzFile_read_line(bed->fh, &bed->line) == 0) return 0;

char **fields; int nfields;
line_get_fields(bed->line.s, "\t", &fields, &nfields);
line_get_fields(bed->line, "\t", &fields, &nfields);
if (nfields < 3)
wzfatal("[%s:%d] Bed file has fewer than 3 columns.\n", __func__, __LINE__);

Expand Down Expand Up @@ -155,109 +164,109 @@ static inline int bed_read1(bed_file_t *bed, bed1_t *b, parse_data_f parse_data,
/* return 1; */
/* } */

/* Read every non-empty line of a (possibly gzip-compressed) bed file and
 * parse each one into a new element of the returned vector.
 *
 * bedfn: path handed to gzopen().
 * Returns the vector of parsed records; the caller owns it.
 * Exits with status 1 if the file cannot be opened.
 *
 * NOTE(review): bed_parse1() is not defined in the visible part of this
 * header — confirm it is available before enabling this function.
 * NOTE(review): `targets` is filled by the parser but is neither returned nor
 * destroyed here; confirm who should own it. */
static inline bed1_v *bed_read_all(char *bedfn) {

  target_v *targets = init_target_v(2);
  bed1_v *beds = init_bed1_v(2);

  gzFile FH = gzopen(bedfn, "r");
  if (FH == NULL) {
    fprintf(stderr, "[%s:%d] Cannot open bed file: %s\n", __func__, __LINE__, bedfn);
    fflush(stderr);
    exit(1);
  }

  kstring_t line;
  line.l = line.m = 0; line.s = 0;
  while (1) {
    int c = gzgetc(FH);
    if (c == '\n' || c == EOF) {
      /* Skip empty buffers: resetting line.l does not clear line.s, so at
       * EOF after a trailing newline the buffer still holds the previous
       * line (or is NULL for an empty file). */
      if (line.l > 0) {
        bed1_t *b = next_ref_bed1_v(beds);
        bed_parse1(line.s, targets, b, NULL);
        line.l = 0;
      }
      if (c == EOF) break;
    } else {
      kputc(c, &line);
    }
  }
  /* The buffer is reused across lines, so release it only once the loop is done. */
  free(line.s);
  gzclose(FH);
  return beds;
}
/* static inline bed1_v *bed_read_all(char *bedfn) { */

/* target_v *targets = init_target_v(2); */
/* bed1_v *beds = init_bed_v(2); */

/* gzFile FH = gzopen(bedfn); */
/* kstring_t line; */
/* line.l = line.m = 0; line.s = 0; */
/* bed1_t *b; */
/* FILE *fh = open(argv[1],"r"); */
/* while (1) { */
/* int c=gzgetc(FH); */
/* if (c=='\n' || c==EOF) { */
/* b = next_ref_bed_v(beds); */
/* bed_parse1(line.s, targets, b, NULL); */
/* line.l = 0; */
/* if (c==EOF) { */
/* break; */
/* } */
/* } else { */
/* kputc(c, &line); */
/* } */
/* free(line.s); */
/* } */
/* return beds; */
/* } */

/* Build one bed record per entry of `targets`, in order: tid is the entry's
 * index, beg is 0, end is the target's `len`, and data is NULL.
 * Returns a newly created vector owned by the caller.
 *
 * Declared `static inline` like the other functions in this header, so that
 * including the header from several translation units does not produce
 * duplicate definitions of an external symbol. */
static inline bed1_v *target2bed(target_v *targets) {
  bed1_v *beds = init_bed1_v(2);
  unsigned i;
  for (i = 0; i < targets->size; ++i) {
    target_t *t = ref_target_v(targets, i);
    bed1_t *b = next_ref_bed1_v(beds);
    b->tid = i;
    b->beg = 0;
    b->end = t->len;
    b->data = NULL;
  }
  return beds;
}
/* bed1_v *target2bed(target_v *targets) { */
/* unsigned i; */
/* bed1_v *beds = init_bed1_v(2); */
/* for (i=0; i<targets->size; ++i) { */
/* target_t *t = ref_target_v(targets, i); */
/* bed1_t *b = next_ref_bed1_v(beds); */
/* b->tid = i; */
/* b->beg = 0; */
/* b->end = t->len; */
/* b->data = NULL; */
/* } */
/* return beds; */
/* } */

/* Parse a region string into a bed record.
 *
 * Accepted forms are "name", "name:beg" and "name:beg-end". Whitespace is
 * removed first and commas inside the numeric part are ignored. The sequence
 * name is taken up to the LAST ':'; if what follows is not a well-formed
 * interval, or the prefix is not a known target, the whole string is tried as
 * the name instead.
 *
 * bed:     record to fill. tid/beg/end are set to -1 before parsing.
 * targets: table used to resolve the sequence name through get_target().
 * str:     region string; not modified.
 *
 * On success bed->tid is the target's tid, bed->beg is the parsed start
 * decremented by one when positive, and bed->end is the parsed end, or 1<<29
 * when no end (or no interval at all) is given.
 *
 * Returns 0 on success, -1 if the name cannot be resolved, memory cannot be
 * allocated, or beg > end. */
static inline int bamregion2bed(bed1_t *bed, target_v *targets, char *str) {

  char *s;
  int i, l, k, name_end;
  target_t *t;

  bed->tid = -1;
  bed->beg = bed->end = -1;
  name_end = l = (int) strlen(str);
  s = malloc(l + 1);
  if (s == NULL) return -1;
  // remove space
  for (i = k = 0; i < l; ++i)
    if (!isspace((unsigned char) str[i])) s[k++] = str[i];
  s[k] = 0; l = k;
  // determine the sequence name
  for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
  if (i >= 0) name_end = i;
  if (name_end < l) { // check if this is really the end
    int n_hyphen = 0;
    for (i = name_end + 1; i < l; ++i) {
      if (s[i] == '-') ++n_hyphen;
      else if (!isdigit((unsigned char) s[i]) && s[i] != ',') break;
    }
    if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
    s[name_end] = 0;
    t = get_target(targets, s);
    if (!t) { // cannot find the sequence name
      t = get_target(targets, str); // try str as the name
      if (!t) {
        fprintf(stderr, "[%s:%d] fail to determine sequence name.\n", __func__, __LINE__);
        fflush(stderr);
        free(s); return -1;
      } else s[name_end] = ':', name_end = l;
    }
  } else t = get_target(targets, str);
  if (!t) {
    free(s);
    return -1;
  }
  bed->tid = t->tid;
  // parse the interval
  if (name_end < l) {
    for (i = k = name_end + 1; i < l; ++i)
      if (s[i] != ',') s[k++] = s[i];
    s[k] = 0;
    bed->beg = atoi(s + name_end + 1);
    for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
    bed->end = i < k ? atoi(s + i + 1) : 1<<29;
    if (bed->beg > 0) bed->beg--;
  } else bed->beg = 0, bed->end = 1<<29;
  free(s);
  return bed->beg <= bed->end ? 0 : -1;
}

/* Parse a single region string into a one-element bed vector.
 * Returns the new vector (owned by the caller); if the region cannot be
 * parsed, reports the failure on stderr and exits with status 1.
 * NOTE(review): this calls bam_region2bed(), while the parser visible in this
 * header is spelled bamregion2bed() — confirm which name is intended. */
static inline bed1_v *bamregion2bedlist(target_v *targets, char *region) {
  bed1_v *beds = init_bed1_v(2);
  bed1_t *slot = next_ref_bed1_v(beds);
  int status = bam_region2bed(slot, targets, region);
  if (status >= 0)
    return beds;

  fprintf(stderr, "[%s:%d] failed to parse region\n", __func__, __LINE__);
  fflush(stderr);
  exit(1);
}
/* static inline bed1_v *bamregion2bedlist(target_v *targets, char *region) { */
/* bed1_v *beds = init_bed1_v(2); */
/* bed1_t *b = next_ref_bed1_v(beds); */
/* if (bam_region2bed(b, targets, region) < 0) { */
/* fprintf(stderr, "[%s:%d] failed to parse region\n", __func__, __LINE__); */
/* fflush(stderr); */
/* exit(1); */
/* } */
/* return beds; */
/* } */

#endif /* _WZBED_H */
Loading

0 comments on commit 1ff8662

Please sign in to comment.