Skip to content

Commit

Permalink
Fix some issues in subexonInfo. Use FPKM to filter low abundance tran…
Browse files Browse the repository at this point in the history
…scripts.
  • Loading branch information
mourisl committed Sep 3, 2017
1 parent dfd1266 commit 74985c8
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 28 deletions.
17 changes: 17 additions & 0 deletions CombineSubexons.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,23 @@ int main( int argc, char *argv[] )
++i ;
continue ;
}
else if ( !strcmp( argv[i], "--ls" ) )
{
FILE *fpLs = fopen( argv[i + 1], "r" ) ;
char buffer[1024] ;
while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL )
{
int len = strlen( buffer ) ;
if ( buffer[len - 1] == '\n' )
{
buffer[len - 1] = '\0' ;
--len ;

}
char *fileName = strdup( buffer ) ;
files.push_back( fileName ) ;
}
}
}
int fileCnt = files.size() ;
// Obtain the chromosome ids through bam file.
Expand Down
4 changes: 4 additions & 0 deletions Constraints.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ struct _constraint
double normAbund ;
double abundance ;
int support ;
int uniqSupport ;
int maxReadLen ; // the longest read length support this constraint.

int info ; // other usages.
int first, last ; // indicate the first and last index of the subexons.
Expand All @@ -28,6 +30,8 @@ struct _matePairConstraint
int i, j ;

int support ;
int uniqSupport ;

double abundance ;
double normAbund ;
int effectiveCount ;
Expand Down
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ LINKFLAGS = -lbam -lz -lm -lpthread
DEBUG=
OBJECTS = stats.o subexon-graph.o

all: subexon-info combine-subexons classes
all: subexon-info combine-subexons classes grader

subexon-info: subexon-info.o $(OBJECTS)
$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) subexon-info.o $(LINKFLAGS)
Expand All @@ -15,6 +15,9 @@ combine-subexons: combine-subexons.o $(OBJECTS)

classes: classes.o constraints.o transcript-decider.o $(OBJECTS)
$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) constraints.o transcript-decider.o classes.o $(LINKFLAGS)

grader: grader.o
$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) grader.o $(LINKFLAGS)


subexon-info.o: SubexonInfo.cpp alignments.hpp blocks.hpp support.hpp defs.h stats.hpp
Expand All @@ -31,6 +34,8 @@ transcript-decider.o: TranscriptDecider.cpp TranscriptDecider.hpp Constraints.hp
$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS)
classes.o: classes.cpp SubexonGraph.hpp SubexonCorrelation.hpp BitTable.hpp Constraints.hpp alignments.hpp TranscriptDecider.hpp
$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS)
grader.o: grader.cpp
$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS)

clean:
rm -f *.o *.gch subexon-info combine-subexons
2 changes: 1 addition & 1 deletion SubexonCorrelation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ class SubexonCorrelation
if ( fileList.size() > 1 )
return correlation[i][j] ;
else
return 1.0 ;
return 0 ;
}
} ;

Expand Down
30 changes: 17 additions & 13 deletions SubexonInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,6 @@ double MixtureGammaEM( double *x, int n, double &pi, double *k, double *theta, i
int i ;
double *z = new double[n] ; // the expectation that it assigned to model 0.
double *oneMinusZ = new double[n] ;

int t = 0 ;
while ( 1 )
{
Expand Down Expand Up @@ -221,7 +220,9 @@ double MixtureGammaEM( double *x, int n, double &pi, double *k, double *theta, i
// Use gradient descent to compute new k and theta.
if ( 1 ) //pi > 0 )
{
GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], -k[1], theta[1] * k[1], x, z, n ) ;
double bound ;
bound = ( theta[1] * k[1] > 1 ) ? 1 : theta[1] * k[1] ;
GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], -k[1], bound, x, z, n ) ;
GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], k[0], -theta[0] * k[0], x, oneMinusZ, n ) ;
}
else
Expand Down Expand Up @@ -347,6 +348,9 @@ int main( int argc, char *argv[] )
{
if ( support <= 0 )
continue ;
if ( !( uniqSupport > 1 ||
( secondarySupport > 10 ) ) )
continue ;
int chrId = alignments.GetChromIdFromName( chrom ) ;
struct _splitSite ss ;
--start ;
Expand Down Expand Up @@ -389,6 +393,7 @@ int main( int argc, char *argv[] )
// Recompute the coverage for each block.
alignments.Rewind() ;
//printf( "Before computeDepth: %d\n", regions.exonBlocks.size() ) ;

regions.ComputeDepth( alignments ) ;
//printf( "After computeDepth: %d\n", regions.exonBlocks.size() ) ;

Expand Down Expand Up @@ -548,15 +553,14 @@ int main( int argc, char *argv[] )

double p1, p2, p ;

/*p1 = MixtureGammaAssignment( covRatio[i], piRatio, kRatio, thetaRatio ) ;
p2 = MixtureGammaAssignment( cov[i], piCov, kCov, thetaCov ) ;
p = p1>p2 ? p1 : p2 ;*/
p1 = MixtureGammaAssignment( covRatio[i], piRatio, kRatio, thetaRatio ) ;
p2 = MixtureGammaAssignment( cov[i], piCov, kCov, thetaCov ) ;

p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
/*p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
if ( piRatio != 0 )
p2 = GetPValue( cov[i], kCov, thetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ;
else
p2 = p1 ;
p2 = p1 ;*/
//printf( "%lf %lf: %lf %lf\n", covRatio[i], cov[i], p1, p2 ) ;
p = p1 > p2 ? p1 : p2 ;

Expand Down Expand Up @@ -589,15 +593,14 @@ int main( int argc, char *argv[] )
//double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ;

double p1, p2, p ;
/*p1 = MixtureGammaAssignment( covRatio[i], overhangPiRatio, overhangKRatio, overhangThetaRatio ) ;
p2 = MixtureGammaAssignment( cov[i], overhangPiCov, overhangKCov, overhangThetaCov ) ;
p = p1>p2 ? p1 : p2 ;*/
p1 = MixtureGammaAssignment( covRatio[i], overhangPiRatio, overhangKRatio, overhangThetaRatio ) ;
p2 = MixtureGammaAssignment( cov[i], overhangPiCov, overhangKCov, overhangThetaCov ) ;

p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
/*p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
if ( overhangPiRatio != 0)
p2 = GetPValue( cov[i], kCov, thetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ;
else
p2 = p1 ;
p2 = p1 ;*/

p = p1 > p2 ? p1 : p2 ;

Expand Down Expand Up @@ -632,10 +635,11 @@ int main( int argc, char *argv[] )

// Process the result for subexons seems like single-exon transcript (...)
int islandBlockCnt = islandBlocks.size() ;
std::sort( islandBlocks.begin(), islandBlocks.end(), CompBlocksByAvgDepth ) ;
for ( i = 0, j = 0 ; i < islandBlockCnt ; ++i )
{
if ( regions.GetAvgDepth( islandBlocks[i] ) != regions.GetAvgDepth( islandBlocks[j] ) )
++j ;
j = i ;
leftClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ;
rightClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ;
}
Expand Down
87 changes: 77 additions & 10 deletions TranscriptDecider.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ void TranscriptDecider::OutputTranscript( FILE *fp, int baseGeneId, struct _sube

for ( j = 0 ; j < nextCnt ; ++j )
if ( transcript.seVector.Test( subexons[ subexonInd[i] ].next[j] ) )
break ;
{
int a = subexons[ subexonInd[i] ].next[j] ;
if ( subexons[ subexonInd[i] ].end + 1 < subexons[a].start ) // avoid the case like ..(...[...
break ;
}
if ( j < nextCnt )
{
if ( subexons[ subexonInd[i] ].rightStrand == 1 )
Expand All @@ -39,7 +43,6 @@ void TranscriptDecider::OutputTranscript( FILE *fp, int baseGeneId, struct _sube
// TODO: transcript_id
char *chrom = alignments.GetChromName( subexons[0].chrId ) ;
char prefix[10] = "" ;
int a = subexonInd[0] ;
struct _subexon *catSubexons = new struct _subexon[ size + 1 ] ;
// Concatenate adjacent subexons
catSubexons[0] = subexons[ subexonInd[0] ] ;
Expand All @@ -57,29 +60,34 @@ void TranscriptDecider::OutputTranscript( FILE *fp, int baseGeneId, struct _sube
}
}
size = j ;


int gid = GetTranscriptGeneId( subexonInd, baseGeneId ) ;
fprintf( fp, "%s\tCLASSES\ttranscript\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; Abundance \"%.6lf\";\n",
chrom, catSubexons[0].start + 1, catSubexons[size - 1].end + 1, strand,
prefix, chrom, geneId[a],
prefix, chrom, geneId[a], transcriptId[ geneId[a] - baseGeneId ], transcript.abundance ) ;
prefix, chrom, gid,
prefix, chrom, gid, transcriptId[ gid - baseGeneId ], transcript.abundance ) ;
for ( i = 0 ; i < size ; ++i )
{
fprintf( fp, "%s\tCLASSES\texon\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; "
"transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; Abundance \"%.6lf\"\n",
chrom, catSubexons[i].start + 1, catSubexons[i].end + 1, strand,
prefix, chrom, geneId[a],
prefix, chrom, geneId[a], transcriptId[ geneId[a] - baseGeneId ],
prefix, chrom, gid,
prefix, chrom, gid, transcriptId[ gid - baseGeneId ],
i + 1, transcript.abundance ) ;
}
++transcriptId[ geneId[a] - baseGeneId ] ;
++transcriptId[ gid - baseGeneId ] ;

delete catSubexons ;
}

void TranscriptDecider::SetGeneId( int tag, struct _subexon *subexons, int id )
{
if ( geneId[tag] != -1 )
{
if ( geneId[tag] != id ) // a subexon may belong to more than one gene.
geneId[tag] = -2 ;
return ;
}
int i ;
geneId[ tag ] = id ;
int cnt = subexons[tag].nextCnt ;
Expand All @@ -91,6 +99,28 @@ void TranscriptDecider::SetGeneId( int tag, struct _subexon *subexons, int id )
SetGeneId( subexons[tag].prev[i], subexons, id ) ;
}

int TranscriptDecider::GetTranscriptGeneId( std::vector<int> &subexonInd, int baseGeneId )
{
int i ;
int size = subexonInd.size() ;

for ( i = 0 ; i < size ; ++i )
if ( geneId[ subexonInd[i] ] != -2 )
return geneId[ subexonInd[i] ] ;
return baseGeneId ; // should never reach here.
}

int TranscriptDecider::GetTranscriptGeneId( struct _transcript &t, int baseGeneId )
{
if ( geneId[ t.first ] != -2 )
return geneId[ t.first ] ;
if ( geneId[ t.last ] != -2 )
return geneId[ t.last ] ;
std::vector<int> subexonInd ;
t.seVector.GetOnesIndices( subexonInd ) ;
return GetTranscriptGeneId( subexonInd, baseGeneId ) ;
}

void TranscriptDecider::InitTranscriptId( int baseGeneId, int usedGeneId )
{
int i ;
Expand Down Expand Up @@ -924,6 +954,8 @@ void TranscriptDecider::PickTranscripts( std::vector<struct _transcript> &alltra
value = 1e-6 ;
if ( value > maxAbundance )
maxAbundance = value ;
//printf( "abundance %d: %lf ", i, value ) ;
//alltranscripts[i].seVector.Print() ;
}
//printf( "%s: %lf\n", __func__, maxAbundance ) ;

Expand Down Expand Up @@ -967,6 +999,7 @@ void TranscriptDecider::PickTranscripts( std::vector<struct _transcript> &alltra
max = score ;
maxtag = i ;
}
//printf( "score: %d %lf %lf\n", i, cnt, score ) ;
}

if ( maxcnt == 0 || maxtag == -1 )
Expand Down Expand Up @@ -1022,15 +1055,47 @@ void TranscriptDecider::PickTranscripts( std::vector<struct _transcript> &alltra
delete[] btable ;
}

int TranscriptDecider::RefineTranscripts( std::vector<struct _transcript> &transcripts, Constraints &constraints )
int TranscriptDecider::RefineTranscripts( int baseGeneId, std::vector<struct _transcript> &transcripts, Constraints &constraints )
{
int i, j ;
int tcnt = transcripts.size() ;
int tcCnt = constraints.matePairs.size() ;

std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
std::vector<struct _constraint> &scc = constraints.constraints ; //single-end constraints.constraints

// Remove transcripts whose FPKM are too small.
double *geneMaxFPKM = new double[usedGeneId - baseGeneId ] ;
memset( geneMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;
int *txptGid = new int[tcnt] ;
for ( i = 0 ; i < tcnt ; ++i )
{
int gid = GetTranscriptGeneId( transcripts[i], baseGeneId ) ;
if ( transcripts[i].abundance > geneMaxFPKM[gid - baseGeneId ] )
geneMaxFPKM[ gid - baseGeneId ] = transcripts[i].abundance ;
txptGid[i] = gid ;
}

for ( i = 0 ; i < tcnt ; ++i )
{
if ( transcripts[i].abundance < 0.05 * geneMaxFPKM[ txptGid[i] - baseGeneId ] )
transcripts[i].abundance = -1 ;
}

j = 0 ;
for ( i = 0 ; i < tcnt ; ++i )
{
if ( transcripts[i].abundance == -1 )
{
transcripts[i].seVector.Release() ; // Don't forget release the memory.
continue ;
}
transcripts[j] = transcripts[i] ;
++j ;
}
transcripts.resize( j ) ;
tcnt = j ;
delete []txptGid ;

/*==================================================================
Remove transcripts that seems duplicated
Expand Down Expand Up @@ -1215,7 +1280,9 @@ int TranscriptDecider::Solve( struct _subexon *subexons, int seCnt, std::vector<

int size = predTranscripts.size() ;
InitTranscriptId( baseGeneId, usedGeneId ) ;
size = RefineTranscripts( predTranscripts, constraints[i] ) ;
for ( j = 0 ; j < size ; ++j )
ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[j] ) ;
size = RefineTranscripts( baseGeneId, predTranscripts, constraints[i] ) ;
for ( j = 0 ; j < size ; ++j )
{
OutputTranscript( outputFPs[i], baseGeneId, subexons, predTranscripts[j] ) ;
Expand Down
17 changes: 17 additions & 0 deletions TranscriptDecider.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,30 @@ class TranscriptDecider
return cnt * ( 1 + pow( a / A, 0.25 ) ) + correlation ;
}

void ConvertTranscriptAbundanceToFPKM( struct _subexon *subexons, struct _transcript &t )
{
int txptLen = 0 ;
int i, size ;

std::vector<int> subexonInd ;
t.seVector.GetOnesIndices( subexonInd ) ;
size = subexonInd.size() ;
for ( i = 0 ; i < size ; ++i )
txptLen += ( subexons[ subexonInd[i] ].end - subexons[ subexonInd[i] ].start + 1 ) ;
t.abundance = t.abundance / ( ( alignments.totalReadCnt / 1000000.0 ) * ( txptLen / 1000.0 ) ) ;
}

void CoalesceSameTranscripts( std::vector<struct _transcript> &t ) ;

// The function to assign gene ids to subexons.
void SetGeneId( int tag, struct _subexon *subexons, int id ) ;

// Initialize the structure to store transcript id
void InitTranscriptId( int baseGeneId, int usedGeneId ) ;

int GetTranscriptGeneId( std::vector<int> &subexonInd, int baseGeneId ) ;
int GetTranscriptGeneId( struct _transcript &t, int baseGeneId ) ;
int RefineTranscripts( int baseGeneId, std::vector<struct _transcript> &transcripts, Constraints &constraints ) ;

void OutputTranscript( FILE *fp, int baseGeneId, struct _subexon *subexons, struct _transcript &transcript ) ;
public:
Expand Down
4 changes: 3 additions & 1 deletion blocks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ class Blocks
for ( i = 0 ; i < segCnt ; ++i )
{
//if ( i == 0 )
// printf( "hi %d %d %d\n", i, segments[i].a, segments[i].b ) ;
// printf( "hi %d %s %d %d\n", i, alignments.GetReadId(), segments[i].a, segments[i].b ) ;
for ( j = tag ; j < (int)exonBlocks.size() ; ++j )
{
if ( exonBlocks[j].end >= segments[i].a - 1 )
Expand Down Expand Up @@ -454,6 +454,8 @@ class Blocks
exonBlocks.pop_back() ;
}*/
if ( start > end )
//|| ( tmpB.start == rawExonBlocks[i].start && tmpB.end != rawExonBlocks[i].end && tmpB.end - tmpB.start + 1 <= 7 )
//|| ( tmpB.end == rawExonBlocks[i].end && tmpB.start != rawExonBlocks[i].start && tmpB.end - tmpB.start + 1 <= 7 )) // the case of too short overhang
{
exonBlocks.pop_back() ;
}
Expand Down
4 changes: 2 additions & 2 deletions stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,9 @@ double LogGammaDensity( double x, double k, double theta )
double MixtureGammaAssignment( double x, double pi, double* k, double *theta )
{
if ( pi == 1 )
return 0 ;
else if ( pi == 0 )
return 1 ;
else if ( pi == 0 )
return 0 ;

double lf0 = LogGammaDensity( x, k[0], theta[0] ) ;
double lf1 = LogGammaDensity( x, k[1], theta[1] ) ;
Expand Down

0 comments on commit 74985c8

Please sign in to comment.