From ec3980700ccc0a347040358be07d4039b2bbd37f Mon Sep 17 00:00:00 2001 From: Li Song Date: Sun, 21 Jan 2018 20:46:03 -0500 Subject: [PATCH] Improve the memory usage for combining subexons --- CombineSubexons.cpp | 125 ++++++++++++++++++++++++++++++++++++-------- GetTrustedSplice.pl | 6 ++- 2 files changed, 108 insertions(+), 23 deletions(-) diff --git a/CombineSubexons.cpp b/CombineSubexons.cpp index 770b2d9..02fe759 100644 --- a/CombineSubexons.cpp +++ b/CombineSubexons.cpp @@ -51,6 +51,8 @@ struct _subexonSplit int type ; //1-start of a subexon. 2-end of a subexon int splitType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. int strand ; + + int weight ; } ; struct _interval // exon or intron @@ -300,6 +302,29 @@ int *MergePositions( int *old, int ocnt, int *add, int acnt, int &newCnt, int ** return ret ; } +void CoalesceSubexonSplits( std::vector &splits ) +{ + int i, k ; + std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ; + int cnt = splits.size() ; + k = 0 ; + for ( i = 1 ; i < cnt ; ++i ) + { + if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType + && splits[i].strand == splits[k].strand ) + { + splits[k].weight += splits[i].weight ; + } + else + { + ++k ; + splits[k] = splits[i] ; + } + } + splits.resize( k + 1 ) ; +} + + void CoalesceIntervals( std::vector &intervals ) { int i, k ; @@ -487,7 +512,7 @@ int main( int argc, char *argv[] ) if ( buffer[0] == '#' ) continue ; - SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + SubexonGraph::InputSubexon( buffer, alignments, se, false) ; // Record all the intron rentention, overhang from the samples if ( ( se.leftType == 2 && se.rightType == 1 ) || ( se.leftType == 2 && se.rightType == 0 ) @@ -522,7 +547,7 @@ int main( int argc, char *argv[] ) } - for ( i = 0 ; i < se.nextCnt ; ++i ) + /*for ( i = 0 ; i < se.nextCnt ; ++i ) { struct _interval ni ; ni.chrId = se.chrId ; @@ -532,13 +557,14 @@ int main( int argc, char *argv[] ) ni.sampleSupport = 1 ; if ( ni.start + 1 < ni.end ) introns.push_back( ni ) ; - } + }*/ sp.chrId = se.chrId ; sp.pos = se.start ; sp.type = 1 ; sp.splitType = se.leftType ; sp.strand = se.leftStrand ; + sp.weight = 1 ; subexonSplits.push_back( sp ) ; sp.chrId = se.chrId ; @@ -546,27 +572,31 @@ int main( int argc, char *argv[] ) sp.type = 2 ; sp.splitType = se.rightType ; sp.strand = se.rightStrand ; + sp.weight = 1 ; subexonSplits.push_back( sp ) ; - if ( se.prevCnt > 0 ) + /*if ( se.prevCnt > 0 ) delete[] se.prev ; if ( se.nextCnt > 0 ) - delete[] se.next ; + delete[] se.next ;*/ } CoalesceIntervals( exons ) ; CoalesceIntervals( introns ) ; + CoalesceSubexonSplits( subexonSplits ) ; CleanIntervalIrOverhang( intervalIrOverhang ) ; + printf( "%d %d\n", subexonSplits.size(), intervalIrOverhang.size() ) ; fclose( fp ) ; } + // Obtain the split sites from the introns. int intronCnt = introns.size() ; std::vector intronSplits ; for ( i = 0 ; i < intronCnt ; ++i ) { - if ( introns[i].sampleSupport < 0.05 * fileCnt ) + /*if ( introns[i].sampleSupport < 0.05 * fileCnt ) { continue ; - } + }*/ struct _interval &it = introns[i] ; struct _subexonSplit sp ; sp.chrId = it.chrId ; @@ -586,12 +616,15 @@ int main( int argc, char *argv[] ) // Pair up the split sites to get subexons std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ; - std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ; + //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ; + // Convert the hard boundary to soft boundary if the split sites is filtered from the introns + // Seems no need to do this now. int splitCnt = subexonSplits.size() ; int intronSplitCnt = intronSplits.size() ; k = 0 ; - for ( i = 0 ; i < splitCnt ; ++i ) + //for ( i = 0 ; i < splitCnt ; ++i ) + while ( 0 ) { if ( subexonSplits[i].type != subexonSplits[i].splitType ) continue ; @@ -634,6 +667,8 @@ int main( int argc, char *argv[] ) } } } + + std::vector().swap( intronSplits ) ; // Force the soft boundary that collides with hard boundaries to be hard boundary. for ( i = 0 ; i < splitCnt ; ++i ) @@ -654,6 +689,10 @@ int main( int argc, char *argv[] ) break ; } } + /*if ( subexonSplits[i].pos == 144177260 ) + { + printf( "force conversion: %d %d %d. %d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ; + }*/ subexonSplits[i].splitType = newSplitType ; subexonSplits[i].strand = newStrand ; } @@ -665,16 +704,16 @@ int main( int argc, char *argv[] ) for ( i = 0 ; i < splitCnt - 1 ; ++i ) { struct _subexon se ; - /*if ( subexonSplits[i].pos == 36946557 || subexonSplits[i + 1].pos == 36946557 ) + /*if ( subexonSplits[i + 1].pos == 144177260 ) { - printf( "%d %d %d: %d %d %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, - subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType ) ; + printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; }*/ if ( subexonSplits[i].type == 1 ) - ++diffCnt ; + diffCnt += subexonSplits[i].weight ; else - --diffCnt ; + diffCnt -= subexonSplits[i].weight ; if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId ) { @@ -704,6 +743,13 @@ int main( int argc, char *argv[] ) --se.end ; } + /*if ( se.end == 24613649 ) + { + //printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; + printf( "%d %d %d. %d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ; + }*/ + if ( se.start > se.end ) //Note: this handles the case of repeated subexon split. { // handle the case in sample 0: [...[..] @@ -723,14 +769,29 @@ int main( int argc, char *argv[] ) printf( "%d: %d-%d: %d\n", se.chrId, se.start, se.end, se.rightType ) ; }*/ + se.next = se.prev = NULL ; se.nextCnt = se.prevCnt = 0 ; subexons.push_back( se ) ; ++seCnt ; } + std::vector().swap( subexonSplits ) ; + + // Adjust the split type. + seCnt = subexons.size() ; + for ( i = 1 ; i < seCnt ; ++i ) + { + if ( subexons[i - 1].end + 1 == subexons[i].start ) + { + if ( subexons[i - 1].rightType == 0 ) + subexons[i - 1].rightType = subexons[i].leftType ; + if ( subexons[i].leftType == 0 ) + subexons[i].leftType = subexons[i - 1].rightType ; + } + } + // Merge the adjacent soft boundaries std::vector rawSubexons = subexons ; - seCnt = subexons.size() ; int exonCnt = exons.size() ; subexons.clear() ; @@ -761,6 +822,11 @@ int main( int argc, char *argv[] ) break ; } // rawsubexons[i...j-1] will be merged. + + /*if ( rawSubexons[i].start == 144177116 ) + { + printf( "merge j-1: %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType ) ; + }*/ bool merge = true ; if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1 && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) @@ -1044,16 +1110,26 @@ int main( int argc, char *argv[] ) } continue ; } + else + break ; - SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; - sampleSubexons.push_back( se ) ; + //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + //sampleSubexons.push_back( se ) ; } - int sampleSubexonCnt = sampleSubexons.size() ; + //int sampleSubexonCnt = sampleSubexons.size() ; int intervalCnt = seIntervals.size() ; - for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + //for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + int iterCnt = 0 ; + while ( 1 ) { - struct _subexon &se = sampleSubexons[i] ; + if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL) + break ; + ++iterCnt ; + + struct _subexon se ; + SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + while ( tag < intervalCnt ) { if ( seIntervals[tag].chrId < se.chrId || @@ -1274,16 +1350,21 @@ int main( int argc, char *argv[] ) } } } + + if ( se.nextCnt > 0 ) + delete[] se.next ; + if ( se.prevCnt > 0 ) + delete[] se.prev ; } fclose( fp ) ; - for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + /*for ( i = 0 ; i < sampleSubexonCnt ; ++i ) { if ( sampleSubexons[i].nextCnt > 0 ) delete[] sampleSubexons[i].next ; if ( sampleSubexons[i].prevCnt > 0 ) delete[] sampleSubexons[i].prev ; - } + }*/ } CleanUpSubexonConnections( subexons ) ; diff --git a/GetTrustedSplice.pl b/GetTrustedSplice.pl index ba99895..91c9c1d 100644 --- a/GetTrustedSplice.pl +++ b/GetTrustedSplice.pl @@ -94,11 +94,15 @@ #{ # $flag = 1 if ( $spliceSupport{ $key } / $sampleCnt < 1 ) ; #} - my $siteSupport = $spliceSiteSupport{ $cols[0]." ".$cols[1] } + $spliceSiteSupport{ $cols[0]." ".$cols[1] } ; + my $siteSupport = max( $spliceSiteSupport{ $cols[0]." ".$cols[1] }, $spliceSiteSupport{ $cols[0]." ".$cols[2] } ) ; if ( $spliceSupport{ $key } < $siteSupport / 10.0 ) { #print $spliceSupport{ $key } / $siteSupport, " ", -log( $spliceSupport{ $key } / $siteSupport ) / log( 10.0 ), "\n" ; + #if ( $cols[1] == 73518141 && $cols[2] == 73518206 ) + #{ + # print "test: ", $spliceSupport{$key}, " $siteSupport ", -log( $spliceSupport{ $key } / $siteSupport ), "\n"; + #} my $needSample = min( -log( $spliceSupport{ $key } / $siteSupport ) / log( 10.0 ) + 1, $sampleCnt ) ; next if ( $spliceSampleSupport{ $key } < $needSample ) ; }