Fix some issues in subexonInfo. Use FPKM to filter low abundance tran…

…scripts.
mbeavitt · Sep 3, 2017 · 74985c8 · 74985c8
1 parent dfd1266
commit 74985c8
Show file tree

Hide file tree

Showing 9 changed files with 144 additions and 28 deletions.
diff --git a/CombineSubexons.cpp b/CombineSubexons.cpp
@@ -213,6 +213,23 @@ int main( int argc, char *argv[] )
 			++i ;
 			continue ;
 		}
+		else if ( !strcmp( argv[i], "--ls" ) )
+		{
+			FILE *fpLs = fopen( argv[i + 1], "r" ) ;
+			char buffer[1024] ;
+			while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL )
+			{
+				int len = strlen( buffer ) ;
+				if ( buffer[len - 1] == '\n' )
+				{
+					buffer[len - 1] = '\0' ;
+					--len ;
+
+				}
+				char *fileName = strdup( buffer ) ;
+				files.push_back( fileName ) ;
+			}
+		}
 	}
 	int fileCnt = files.size() ;
 	// Obtain the chromosome ids through bam file.

diff --git a/Constraints.hpp b/Constraints.hpp
@@ -18,6 +18,8 @@ struct _constraint
 	double normAbund ;
 	double abundance ;
 	int support ;
+	int uniqSupport ;
+	int maxReadLen ; // the longest read length support this constraint.
 
 	int info ; // other usages.
 	int first, last ; // indicate the first and last index of the subexons. 
@@ -28,6 +30,8 @@ struct _matePairConstraint
 	int i, j ;
 
 	int support ;
+	int uniqSupport ;
+
 	double abundance ;
 	double normAbund ;
 	int effectiveCount ;

diff --git a/Makefile b/Makefile
@@ -5,7 +5,7 @@ LINKFLAGS = -lbam -lz -lm -lpthread
 DEBUG=
 OBJECTS = stats.o subexon-graph.o 
 
-all: subexon-info combine-subexons classes
+all: subexon-info combine-subexons classes grader
 
 subexon-info: subexon-info.o $(OBJECTS)
 	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) subexon-info.o $(LINKFLAGS)
@@ -15,6 +15,9 @@ combine-subexons: combine-subexons.o $(OBJECTS)
 
 classes: classes.o constraints.o transcript-decider.o $(OBJECTS)
 	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) constraints.o transcript-decider.o classes.o $(LINKFLAGS)
+
+grader: grader.o
+	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) grader.o $(LINKFLAGS)
 
 
 subexon-info.o: SubexonInfo.cpp alignments.hpp blocks.hpp support.hpp defs.h stats.hpp
@@ -31,6 +34,8 @@ transcript-decider.o: TranscriptDecider.cpp TranscriptDecider.hpp Constraints.hp
 	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS)
 classes.o: classes.cpp SubexonGraph.hpp SubexonCorrelation.hpp BitTable.hpp Constraints.hpp alignments.hpp TranscriptDecider.hpp
 	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS)
+grader.o: grader.cpp
+	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS)
 
 clean:
 	rm -f *.o *.gch subexon-info combine-subexons 
diff --git a/SubexonCorrelation.hpp b/SubexonCorrelation.hpp
@@ -238,7 +238,7 @@ class SubexonCorrelation
 		if ( fileList.size() > 1 )
 			return correlation[i][j] ;
 		else
-			return 1.0 ;
+			return 0 ;
 	}
 } ;
 

diff --git a/SubexonInfo.cpp b/SubexonInfo.cpp
@@ -192,7 +192,6 @@ double MixtureGammaEM( double *x, int n, double &pi, double *k, double *theta, i
 	int i ;
 	double *z = new double[n] ; // the expectation that it assigned to model 0.
 	double *oneMinusZ = new double[n] ;
-
 	int t = 0 ;
 	while ( 1 )
 	{
@@ -221,7 +220,9 @@ double MixtureGammaEM( double *x, int n, double &pi, double *k, double *theta, i
 		// Use gradient descent to compute new k and theta.
 		if ( 1 ) //pi > 0 )
 		{
-			GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], -k[1], theta[1] * k[1], x, z, n ) ;
+			double bound ;
+			bound = ( theta[1] * k[1] > 1 ) ? 1 : theta[1] * k[1] ;
+			GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], -k[1], bound, x, z, n ) ;
 			GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], k[0], -theta[0] * k[0], x, oneMinusZ,  n ) ;
 		}
 		else
@@ -347,6 +348,9 @@ int main( int argc, char *argv[] )
 	{
 		if ( support <= 0 )
 			continue ;
+		if ( !( uniqSupport > 1 || 
+			( secondarySupport > 10 ) ) )
+			continue ;
 		int chrId = alignments.GetChromIdFromName( chrom ) ; 
 		struct _splitSite ss ;
 		--start ;
@@ -389,6 +393,7 @@ int main( int argc, char *argv[] )
 	// Recompute the coverage for each block. 
 	alignments.Rewind() ;
 	//printf( "Before computeDepth: %d\n", regions.exonBlocks.size() ) ;
+
 	regions.ComputeDepth( alignments ) ;
 	//printf( "After computeDepth: %d\n", regions.exonBlocks.size() ) ;
 
@@ -548,15 +553,14 @@ int main( int argc, char *argv[] )
 
 		double p1, p2, p ;
 
-		/*p1 = MixtureGammaAssignment( covRatio[i], piRatio, kRatio, thetaRatio ) ;
-		  p2 = MixtureGammaAssignment( cov[i], piCov, kCov, thetaCov  ) ;
-		  p = p1>p2 ? p1 : p2 ;*/
+		p1 = MixtureGammaAssignment( covRatio[i], piRatio, kRatio, thetaRatio ) ;
+		p2 = MixtureGammaAssignment( cov[i], piCov, kCov, thetaCov  ) ;
 
-		p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
+		/*p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
 		if ( piRatio != 0 )
 			p2 = GetPValue( cov[i], kCov, thetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ;
 		else
-			p2 = p1 ;
+			p2 = p1 ;*/
 		//printf( "%lf %lf: %lf %lf\n", covRatio[i], cov[i], p1, p2 ) ;
 		p = p1 > p2 ? p1 : p2 ;
 
@@ -589,15 +593,14 @@ int main( int argc, char *argv[] )
 		//double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ;
 
 		double p1, p2, p ;
-			/*p1 = MixtureGammaAssignment( covRatio[i], overhangPiRatio, overhangKRatio, overhangThetaRatio ) ;
-			p2 = MixtureGammaAssignment( cov[i], overhangPiCov, overhangKCov, overhangThetaCov  ) ;
-			p = p1>p2 ? p1 : p2 ;*/
+		p1 = MixtureGammaAssignment( covRatio[i], overhangPiRatio, overhangKRatio, overhangThetaRatio ) ;
+		p2 = MixtureGammaAssignment( cov[i], overhangPiCov, overhangKCov, overhangThetaCov  ) ;
 
-		p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
+		/*p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ;
 		if ( overhangPiRatio != 0)
 			p2 = GetPValue( cov[i], kCov, thetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ;
 		else
-			p2 = p1 ;
+			p2 = p1 ;*/
 
 		p = p1 > p2 ? p1 : p2 ;
 
@@ -632,10 +635,11 @@ int main( int argc, char *argv[] )
 
 	// Process the result for subexons seems like single-exon transcript (...)
 	int islandBlockCnt = islandBlocks.size() ;
+	std::sort( islandBlocks.begin(), islandBlocks.end(), CompBlocksByAvgDepth ) ;
 	for ( i = 0, j = 0 ; i < islandBlockCnt ; ++i )
 	{
 		if ( regions.GetAvgDepth( islandBlocks[i] ) != regions.GetAvgDepth( islandBlocks[j] ) )
-			++j ;
+			j = i ;
 		leftClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ;
 		rightClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ;
 	}

diff --git a/TranscriptDecider.cpp b/TranscriptDecider.cpp
@@ -24,7 +24,11 @@ void TranscriptDecider::OutputTranscript( FILE *fp, int baseGeneId, struct _sube
 
 			for ( j = 0 ; j < nextCnt ; ++j )
 				if ( transcript.seVector.Test( subexons[ subexonInd[i] ].next[j] ) )
-					break ;
+				{
+					int a = subexons[ subexonInd[i] ].next[j] ;
+					if ( subexons[ subexonInd[i] ].end + 1 < subexons[a].start ) // avoid the case like ..(...[...
+						break ;
+				}
 			if ( j < nextCnt )
 			{
 				if ( subexons[ subexonInd[i] ].rightStrand == 1 )
@@ -39,7 +43,6 @@ void TranscriptDecider::OutputTranscript( FILE *fp, int baseGeneId, struct _sube
 	// TODO: transcript_id
 	char *chrom = alignments.GetChromName( subexons[0].chrId ) ;
 	char prefix[10] = "" ;
-	int a = subexonInd[0] ;
 	struct _subexon *catSubexons = new struct _subexon[ size + 1 ] ;
 	// Concatenate adjacent subexons 
 	catSubexons[0] = subexons[ subexonInd[0] ] ;
@@ -57,29 +60,34 @@ void TranscriptDecider::OutputTranscript( FILE *fp, int baseGeneId, struct _sube
 		}
 	}
 	size = j ;
-
+
+	int gid = GetTranscriptGeneId( subexonInd, baseGeneId ) ;
 	fprintf( fp, "%s\tCLASSES\ttranscript\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; Abundance \"%.6lf\";\n",
 			chrom, catSubexons[0].start + 1, catSubexons[size - 1].end + 1, strand,
-			prefix, chrom, geneId[a],
-			prefix, chrom, geneId[a], transcriptId[ geneId[a] - baseGeneId ], transcript.abundance ) ;
+			prefix, chrom, gid,
+			prefix, chrom, gid, transcriptId[ gid - baseGeneId ], transcript.abundance ) ;
 	for ( i = 0 ; i < size ; ++i )
 	{
 		fprintf( fp, "%s\tCLASSES\texon\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; "
 				"transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; Abundance \"%.6lf\"\n",
 				chrom, catSubexons[i].start + 1, catSubexons[i].end + 1, strand,
-				prefix, chrom, geneId[a],
-				prefix, chrom, geneId[a], transcriptId[ geneId[a] - baseGeneId ],
+				prefix, chrom, gid,
+				prefix, chrom, gid, transcriptId[ gid - baseGeneId ],
 				i + 1, transcript.abundance ) ;
 	}
-	++transcriptId[ geneId[a] - baseGeneId ] ;
+	++transcriptId[ gid - baseGeneId ] ;
 
 	delete catSubexons ;
 }
 
 void TranscriptDecider::SetGeneId( int tag, struct _subexon *subexons, int id )
 {
 	if ( geneId[tag] != -1 )
+	{
+		if ( geneId[tag] != id ) // a subexon may belong to more than one gene.
+			geneId[tag] = -2 ; 
 		return ;
+	}
 	int i ;
 	geneId[ tag ] = id ;
 	int cnt = subexons[tag].nextCnt ;
@@ -91,6 +99,28 @@ void TranscriptDecider::SetGeneId( int tag, struct _subexon *subexons, int id )
 		SetGeneId( subexons[tag].prev[i], subexons, id ) ;
 }
 
+int TranscriptDecider::GetTranscriptGeneId( std::vector<int> &subexonInd, int baseGeneId )
+{
+	int i ;
+	int size = subexonInd.size() ;
+
+	for ( i = 0 ; i < size ; ++i )
+		if ( geneId[ subexonInd[i] ] != -2  )
+			return  geneId[ subexonInd[i] ] ;
+	return baseGeneId ; // should never reach here.
+}
+
+int TranscriptDecider::GetTranscriptGeneId( struct _transcript &t, int baseGeneId )
+{
+	if ( geneId[ t.first ] != -2 )
+		return geneId[ t.first ] ;
+	if ( geneId[ t.last ] != -2 )
+		return geneId[ t.last ] ;
+	std::vector<int> subexonInd ;
+	t.seVector.GetOnesIndices( subexonInd ) ;
+	return GetTranscriptGeneId( subexonInd, baseGeneId ) ;
+}
+
 void TranscriptDecider::InitTranscriptId( int baseGeneId, int usedGeneId )
 {
 	int i ;
@@ -924,6 +954,8 @@ void TranscriptDecider::PickTranscripts( std::vector<struct _transcript> &alltra
 			value = 1e-6 ;
 		if ( value > maxAbundance )
 			maxAbundance = value ;
+		//printf( "abundance %d: %lf ", i, value ) ;
+		//alltranscripts[i].seVector.Print() ;
 	}
 	//printf( "%s: %lf\n", __func__, maxAbundance ) ;
 
@@ -967,6 +999,7 @@ void TranscriptDecider::PickTranscripts( std::vector<struct _transcript> &alltra
 				max = score ;
 				maxtag = i ;
 			}
+			//printf( "score: %d %lf %lf\n", i, cnt, score ) ;
 		}
 
 		if ( maxcnt == 0 || maxtag == -1 )
@@ -1022,15 +1055,47 @@ void TranscriptDecider::PickTranscripts( std::vector<struct _transcript> &alltra
 	delete[] btable ;
 }
 
-int TranscriptDecider::RefineTranscripts( std::vector<struct _transcript> &transcripts, Constraints &constraints ) 
+int TranscriptDecider::RefineTranscripts( int baseGeneId, std::vector<struct _transcript> &transcripts, Constraints &constraints ) 
 {
 	int i, j ;
 	int tcnt = transcripts.size() ;
 	int tcCnt = constraints.matePairs.size() ;
 
 	std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
 	std::vector<struct _constraint> &scc = constraints.constraints ; //single-end constraints.constraints
+
+	// Remove transcripts whose FPKM are too small.
+	double *geneMaxFPKM = new double[usedGeneId - baseGeneId ] ;
+	memset( geneMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;
+	int *txptGid = new int[tcnt] ;
+	for ( i = 0 ; i < tcnt ; ++i )
+	{
+		int gid = GetTranscriptGeneId( transcripts[i], baseGeneId ) ;
+		if ( transcripts[i].abundance > geneMaxFPKM[gid - baseGeneId ] )
+			geneMaxFPKM[ gid - baseGeneId ] = transcripts[i].abundance ;
+		txptGid[i] = gid ;
+	}
 
+	for ( i = 0 ; i < tcnt ; ++i )
+	{
+		if ( transcripts[i].abundance < 0.05 * geneMaxFPKM[ txptGid[i] - baseGeneId ] )
+			transcripts[i].abundance = -1 ;
+	}
+
+	j = 0 ;
+	for ( i = 0 ; i < tcnt ; ++i )
+	{
+		if ( transcripts[i].abundance == -1 )
+		{
+			transcripts[i].seVector.Release() ; // Don't forget release the memory.
+			continue ;
+		}
+		transcripts[j] = transcripts[i] ;
+		++j ;
+	}
+	transcripts.resize( j ) ;
+	tcnt = j ;
+	delete []txptGid ;
 
 	/*==================================================================
 	Remove transcripts that seems duplicated
@@ -1215,7 +1280,9 @@ int TranscriptDecider::Solve( struct _subexon *subexons, int seCnt, std::vector<
 
 		int size = predTranscripts.size() ;
 		InitTranscriptId( baseGeneId, usedGeneId ) ;
-		size = RefineTranscripts( predTranscripts, constraints[i] ) ;
+		for ( j = 0 ; j < size ; ++j )
+			ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[j] ) ;
+		size = RefineTranscripts( baseGeneId, predTranscripts, constraints[i] ) ;
 		for ( j = 0 ; j < size ; ++j )
 		{
 			OutputTranscript( outputFPs[i], baseGeneId, subexons, predTranscripts[j] ) ;

diff --git a/TranscriptDecider.hpp b/TranscriptDecider.hpp
@@ -124,13 +124,30 @@ class TranscriptDecider
 		return cnt * ( 1 + pow( a / A, 0.25 ) ) + correlation ;
 	}
 
+	void ConvertTranscriptAbundanceToFPKM( struct _subexon *subexons, struct _transcript &t )
+	{
+		int txptLen = 0 ;
+		int i, size ;
+
+		std::vector<int> subexonInd ;
+		t.seVector.GetOnesIndices( subexonInd ) ;
+		size = subexonInd.size() ;
+		for ( i = 0 ; i < size ; ++i )
+			txptLen += ( subexons[ subexonInd[i] ].end - subexons[ subexonInd[i] ].start + 1 ) ;
+		t.abundance = t.abundance / ( ( alignments.totalReadCnt / 1000000.0 ) * ( txptLen / 1000.0 ) ) ;
+	}
+
 	void CoalesceSameTranscripts( std::vector<struct _transcript> &t ) ;
 
 	// The function to assign gene ids to subexons.
 	void SetGeneId( int tag, struct _subexon *subexons, int id ) ;
 
 	// Initialize the structure to store transcript id 
 	void InitTranscriptId( int baseGeneId, int usedGeneId ) ; 
+
+	int GetTranscriptGeneId( std::vector<int> &subexonInd, int baseGeneId ) ;
+	int GetTranscriptGeneId( struct _transcript &t, int baseGeneId ) ;
+	int RefineTranscripts( int baseGeneId, std::vector<struct _transcript> &transcripts, Constraints &constraints ) ;
 
 	void OutputTranscript( FILE *fp, int baseGeneId, struct _subexon *subexons, struct _transcript &transcript ) ;
 public:

diff --git a/blocks.hpp b/blocks.hpp
@@ -217,7 +217,7 @@ class Blocks
 				for ( i = 0 ; i < segCnt ; ++i )
 				{
 					//if ( i == 0 )
-					//	printf( "hi %d %d %d\n", i, segments[i].a, segments[i].b ) ;
+					//	printf( "hi %d %s %d %d\n", i, alignments.GetReadId(), segments[i].a, segments[i].b ) ;
 					for ( j = tag ; j < (int)exonBlocks.size() ; ++j )
 					{
 						if ( exonBlocks[j].end >= segments[i].a - 1 )
@@ -454,6 +454,8 @@ class Blocks
 						exonBlocks.pop_back() ;
 					}*/
 					if ( start > end )
+						//|| ( tmpB.start == rawExonBlocks[i].start && tmpB.end != rawExonBlocks[i].end && tmpB.end - tmpB.start + 1 <= 7 ) 
+						//|| ( tmpB.end == rawExonBlocks[i].end && tmpB.start != rawExonBlocks[i].start && tmpB.end - tmpB.start + 1 <= 7  )) // the case of too short overhang
 					{
 						exonBlocks.pop_back() ;
 					}

diff --git a/stats.cpp b/stats.cpp
@@ -190,9 +190,9 @@ double LogGammaDensity( double x, double k, double theta )
 double MixtureGammaAssignment( double x, double pi, double* k, double *theta )
 {
 	if ( pi == 1 )
-		return 0 ;
-	else if ( pi == 0 )
 		return 1 ;
+	else if ( pi == 0 )
+		return 0 ;
 
 	double lf0 = LogGammaDensity( x, k[0], theta[0] ) ;
 	double lf1 = LogGammaDensity( x, k[1], theta[1] ) ;