Skip to content

Commit

Permalink
implementation and test to epiread format in nome-seq mode
Browse files Browse the repository at this point in the history
  • Loading branch information
zwdzwd committed Mar 30, 2016
1 parent 9872e6a commit 35121ff
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 46 deletions.
16 changes: 11 additions & 5 deletions lib/utils/wzcbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ void refine_tmax_fix_alen(int n, int *ps, block_pair_t *bp, int i2j, int chnpnts
}

/* ternary segment (once)
* number of change points can be 0,1,2 */
* number of change points can be 0,1,2
* Assume data is centered! sum(dat) = 0 */
void ternary_segmentation(int *dat, int n, int *n_chnpnts, int chnpnts[2], double *t_gmax) {

block_t *bk = init_blocks(n);
Expand All @@ -92,7 +93,7 @@ void ternary_segmentation(int *dat, int n, int *n_chnpnts, int chnpnts[2], doubl
block1_t *b = bk->a+bi;
block1_t *d = bk->a+bj;
int alenhi = d->end - b->beg;
int alenlo = d->beg - b->end;
int alenlo = bi==bj ? 1 : d->beg - b->end;
int dpsum_bmd = b->psum.max - d->psum.min;
int dpsum_dmb = d->psum.max - b->psum.min;
double t_max = t_stats0((double) n/(double) MIN(alenhi*(n-alenhi), alenlo*(n-alenlo)),
Expand Down Expand Up @@ -123,7 +124,7 @@ void ternary_segmentation(int *dat, int n, int *n_chnpnts, int chnpnts[2], doubl
for (bpi=0; bpi<block_pairs->size; ++bpi) {
block_pair_t *bp = ref_block_pair_v(block_pairs, bpi);
int alenhi = bp->d->end - bp->b->beg;
int alenlo = bp->d->beg - bp->b->end;
int alenlo = bp->b==bp->d ? 1 : bp->d->beg-bp->b->end;

/* when alenlo < n/2, there is chance of minimizing alen*(n-alen) by decreasing alen */
if (alenlo < n/2)
Expand Down Expand Up @@ -194,12 +195,17 @@ int recursive_segmentation(int *dat, int n, int *segends) {

int main(int argc, char *argv[])
{
int dat[20] = {4,4,4,4,4,4,4,4,4,4,
6,6,6,6,6,6,6,6,6,6};
/* int dat[20] = {4,4,4,4,4,4,4,4,4,4, */
/* 6,6,6,6,6,6,6,6,6,6}; */

int dat[20] = {1,1,1,1,1,1,1,1,1,1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};

int n_chnpnts;
int chnpnts[2];
double t_gmax;
ternary_segmentation(dat, 20, &n_chnpnts, chnpnts, &t_gmax);

/* recursive_segmentation() */
return 0;
}
176 changes: 135 additions & 41 deletions src/epiread.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,59 +113,118 @@ static void *epiread_write_func(void *data) {
#define episnp_test(snps, i) snps[(i)>>3]&(1<<((i)&0x7))
#define episnp_set(snps, i) snps[(i)>>3] |= 1<<((i)&0x7)

static void format_epiread(kstring_t *epi, bam1_t *b, refseq_t *rs, uint8_t bsstrand, char *chrm, window_t *w, uint8_t *snps, uint32_t snp_beg) {
static void format_epiread(kstring_t *epi, bam1_t *b, refseq_t *rs, uint8_t bsstrand, char *chrm, window_t *w, uint8_t *snps, uint32_t snp_beg, conf_t *conf) {

kstring_t es; int first_snp_loc = -1;
kstring_t es; int first_snp = -1;
es.s = 0; es.l = es.m = 0;


kstring_t ecg; int first_cg = -1; /* this is hcg in the nome-seq mode */
ecg.s = 0; ecg.l = ecg.m = 0;

kstring_t egc; int first_gc = -1;
egc.s = 0; egc.l = egc.m = 0;

int i; uint32_t j;
bam1_core_t *c = &b->core;
uint32_t rpos = c->pos+1, qpos = 0;
int first_cpg_loc = -1;

char qb, rb;
for (i=0; i<c->n_cigar; ++i) {
uint32_t op = bam_cigar_op(bam1_cigar(b)[i]);
uint32_t oplen = bam_cigar_oplen(bam1_cigar(b)[i]);
switch(op) {
case BAM_CMATCH:
for (j=0; j<oplen; ++j) {

rb = toupper(getbase_refseq(rs, rpos+j));
qb = bscall(b, qpos+j);

if (bsstrand && rb == 'G' && rpos+j-1 >= rs->beg) {
char rb0 = toupper(getbase_refseq(rs, rpos+j-1));
if (rb0 == 'C') { /* CpG context */
if (first_cpg_loc < 0) {
first_cpg_loc = (int) rpos+j-1; /* C in CpG, 1-based */
if ((unsigned) first_cpg_loc < w->beg || (unsigned) first_cpg_loc >= w->end)
return;
ksprintf(epi, "%s\t%s\t%c\t-\t%d\t", chrm, bam1_qname(b),
(b->core.flag&BAM_FREAD2)?'2':'1', first_cpg_loc-1); /* 0-based */

if (conf->is_nome) { /* NOMe-seq */

if (rpos+j+1 <= rs->end) { /* to prevent overflow */
char rb0 = toupper(getbase_refseq(rs, rpos+j-1));
char rb1 = toupper(getbase_refseq(rs, rpos+j+1));
if (rb0 == 'C' && rb1 != 'C') { /* HCG context */
/* Note: measure G in CpG context, record location of C */
if (first_cg < 0) first_cg = (int) rpos+j-1;
if (qb == 'A') {
kputc('T', &ecg);
} else if (qb == 'G') {
kputc('C', &ecg);
} else {
kputc('N', &ecg);
}
} else if (rb0 != 'C' && rb1 == 'C') { /* GCH context */
if (first_gc < 0) first_gc = (int) rpos+j;
if (qb == 'A') {
kputc('T', &egc);
} else if (qb == 'G') {
kputc('C', &egc);
} else {
kputc('N', &egc);
}
}
}
if (qb == 'A') {
kputc('T', epi);
} else if (qb == 'G') {
kputc('C', epi);
} else {
kputc('N', epi);

} else { /* BS-seq */

char rb0 = toupper(getbase_refseq(rs, rpos+j-1));
if (rb0 == 'C') { /* CpG context */
/* Note: measure G in CpG context, record location of C */
if (first_cg < 0) first_cg = (int) rpos+j-1;
if (qb == 'A') {
kputc('T', &ecg);
} else if (qb == 'G') {
kputc('C', &ecg);
} else {
kputc('N', &ecg);
}
}
}
}

if (!bsstrand && rb == 'C' && rpos+j+1 <= rs->end) {
char rb1 = toupper(getbase_refseq(rs, rpos+j+1));
if (rb1 == 'G') { /* CpG context */
if (first_cpg_loc < 0) {
first_cpg_loc = (int) rpos+j; /* C in CpG, 1-based */
if ((unsigned) first_cpg_loc < w->beg || (unsigned) first_cpg_loc >= w->end)
return;
ksprintf(epi, "%s\t%s\t%c\t+\t%d\t", chrm, bam1_qname(b),
(b->core.flag&BAM_FREAD2)?'2':'1', first_cpg_loc-1); /* 0-based */

if (conf->is_nome) { /* NOMe-seq */

if (rpos+j+1 <= rs->end) { /* to prevent overflow */
char rb0 = toupper(getbase_refseq(rs, rpos+j-1));
char rb1 = toupper(getbase_refseq(rs, rpos+j+1));
if (rb0 != 'G' && rb1 == 'G') { /* HCG context */
/* measure C in CpG context */
if (first_cg < 0) first_cg = (int) rpos+j;
if (qb == 'T') {
kputc('T', &ecg);
} else if (qb == 'C') {
kputc('C', &ecg);
} else {
kputc('N', &ecg);
}
} else if (rb0 == 'G' && rb1 != 'G') { /* GCH context */
if (first_gc < 0) first_gc = (int) rpos+j;
if (qb == 'T') {
kputc('T', &egc);
} else if (qb == 'C') {
kputc('C', &egc);
} else {
kputc('N', &egc);
}
}
}
if (qb == 'T') {
kputc('T', epi);
} else if (qb == 'C') {
kputc('C', epi);
} else {
kputc('N', epi);

} else { /* BS-seq */
char rb1 = toupper(getbase_refseq(rs, rpos+j+1));
if (rb1 == 'G') { /* CpG context */
if (first_cg < 0) first_cg = (int) rpos+j;
if (qb == 'T') {
kputc('T', &ecg);
} else if (qb == 'C') {
kputc('C', &ecg);
} else {
kputc('N', &ecg);
}
}
}
}
Expand All @@ -174,8 +233,7 @@ static void format_epiread(kstring_t *epi, bam1_t *b, refseq_t *rs, uint8_t bsst
uint32_t snp_ind = rpos+j-snp_beg;
if (snps && episnp_test(snps, snp_ind)) {
kputc(qb, &es);
if (first_snp_loc < 0)
first_snp_loc = rpos+j;
if (first_snp < 0) first_snp = rpos+j;
}
}
rpos += oplen;
Expand All @@ -198,15 +256,51 @@ static void format_epiread(kstring_t *epi, bam1_t *b, refseq_t *rs, uint8_t bsst
abort();
}
}
if (first_cpg_loc >= 0) {
if (first_snp_loc >= 0)
ksprintf(epi, "\t%d\t%s", first_snp_loc-1, es.s); /* 0-based */
else if (snps)
kputs("\t.\t.", epi);
kputc('\n', epi);

if (conf->is_nome) {
int first_epi = first_cg < first_gc ? first_cg : first_gc;
if (first_epi > 0 && (unsigned) first_epi >= w->beg && (unsigned) first_epi < w->end) {
ksprintf(epi, "%s\t%s\t%c\t%c", chrm, bam1_qname(b), (b->core.flag&BAM_FREAD2)?'2':'1', bsstrand?'-':'+');
/* CpG */
if (first_cg >= 0)
ksprintf(epi, "\t%d\t%s", first_cg-1, ecg.s); /* 0-based */
else
kputs("\t.\t.", epi);

/* GpC */
if (first_gc >= 0)
ksprintf(epi, "\t%d\t%s", first_gc-1, egc.s); /* 0-based */
else
kputs("\t.\t.", epi);

/* SNP */
if (first_snp >= 0)
ksprintf(epi, "\t%d\t%s", first_snp-1, es.s); /* 0-based */
else if (snps)
kputs("\t.\t.", epi);
kputc('\n', epi);
}
} else {
if (first_cg > 0 && (unsigned) first_cg >= w->beg && (unsigned) first_cg < w->end) {
ksprintf(epi, "%s\t%s\t%c\t%c", chrm, bam1_qname(b), (b->core.flag&BAM_FREAD2)?'2':'1', bsstrand?'-':'+');
/* CpG */
if (first_cg >= 0)
ksprintf(epi, "\t%d\t%s", first_cg-1, ecg.s); /* 0-based */
else
kputs("\t.\t.", epi);

/* SNP */
if (first_snp >= 0)
ksprintf(epi, "\t%d\t%s", first_snp-1, es.s); /* 0-based */
else if (snps)
kputs("\t.\t.", epi);
kputc('\n', epi);
}
}

free(es.s);
free(ecg.s);
free(egc.s);
}

static void *process_func(void *data) {
Expand Down Expand Up @@ -273,7 +367,7 @@ static void *process_func(void *data) {
if (cnt_ret > conf->max_retention) continue;

/* produce epiread */
format_epiread(&rec.s, b, rs, bsstrand, chrm, &w, snps, snp_beg);
format_epiread(&rec.s, b, rs, bsstrand, chrm, &w, snps, snp_beg, conf);
}

/* run through cytosines */
Expand Down

0 comments on commit 35121ff

Please sign in to comment.