From 8f26bdea3352209a7c96a5e74747b1c9529f8bb4 Mon Sep 17 00:00:00 2001 From: Li Date: Tue, 3 May 2022 14:52:57 -0400 Subject: [PATCH 1/2] Add the option --stranded to specify strand library --- FindJunction.cpp | 36 ++++++++++++++++++++++++++++++++++++ README.md | 1 + alignments.hpp | 23 ++++++++++++++++++++++- classes.cpp | 17 +++++++++++++++++ psiclass | 9 ++++++++- 5 files changed, 84 insertions(+), 2 deletions(-) diff --git a/FindJunction.cpp b/FindJunction.cpp index d104ccd..8705a07 100644 --- a/FindJunction.cpp +++ b/FindJunction.cpp @@ -64,6 +64,7 @@ bool flagStrict ; int junctionCnt ; bool anchorBoth ; bool validRead ; +int strandedLib ; // 0-unstranded, 1-rf, 2-fr int flank ; struct _cigarSeg @@ -88,6 +89,7 @@ void PrintHelp() "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n" "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n" "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n" + "\t--stranded un/rf/fr: stranded library fr-firststrand/secondstrand (default: not set).\n" ) ; } @@ -741,7 +743,20 @@ void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar ) } } +char GetStrandFromStrandedLib(int flag) +{ + if (strandedLib == 0) + return '?' ; + if (flag & 0x80 == 0) // first read or single-end case + { + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? '+' : '-' ; + } + else + { + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? '-' : '+' ; + } +} int main( int argc, char *argv[] ) { @@ -768,6 +783,7 @@ int main( int argc, char *argv[] ) flagPrintJunction = true ; flank = 8 ; filterYS = 4 ; + strandedLib = 0 ; contradictedReads = NULL ; @@ -814,6 +830,16 @@ int main( int argc, char *argv[] ) hasMateReadIdSuffix = true ; ++i ; } + else if ( !strcmp( argv[i], "--stranded" ) ) + { + if ( !strcmp(argv[i + 1], "un") ) + strandedLib = 0 ; + else if ( !strcmp(argv[i + 1], "rf") ) + strandedLib = 1 ; + else if ( !strcmp(argv[i + 1], "fr" ) ) + strandedLib = 2 ; + ++i ; + } else if ( i > 1 ) { printf( "Unknown option %s\n", argv[i] ) ; @@ -1021,6 +1047,11 @@ int main( int argc, char *argv[] ) noncanonStrandInfo = -1 ; } } + else if ( strandedLib != 0 ) + { + strand = GetStrandFromStrandedLib(flag) ; + noncanonStrandInfo = -1 ; + } else { strand = '?' ; @@ -1034,6 +1065,11 @@ int main( int argc, char *argv[] ) strand = '-' ; else if ( strstr( line, "XS:A:+" ) ) strand = '+' ; + else if ( strandedLib != 0) + { + strand = GetStrandFromStrandedLib(flag) ; + noncanonStrandInfo = -1 ; + } else { strand = '?' ; diff --git a/README.md b/README.md index 26a8299..7a92e8a 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ PsiCLASS depends on [pthreads](http://en.wikipedia.org/wiki/POSIX_Threads) and s -c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05) --sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5) --vd FLOAT : the minimum average coverage depth of a transcript to be reported in voting (defaults: 1.0) + --stranded STRING: un/rf/fr for library unstranded/fr-firstand/fr-secondstrand (default: not used) --maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf) --primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments) --tssTesQuantile FLOAT: the quantile for transcription start/end sites in subexon graph (default: 0.5; 1.0 for longest exon margin) diff --git a/alignments.hpp b/alignments.hpp index 619b3bb..1ff9b40 100644 --- a/alignments.hpp +++ b/alignments.hpp @@ -26,6 +26,7 @@ class Alignments bool allowClip ; bool hasClipHead, hasClipTail ; int segmentsSum ; // the sum of segments. + int strandedLib ; // 0-unstranded, 1-rf, 2-fr bool atBegin ; bool atEnd ; @@ -77,7 +78,10 @@ class Alignments fragStdev = 0 ; readLen = 0 ; matePaired = false ; + + strandedLib = 0 ; } + ~Alignments() { if ( b ) @@ -134,6 +138,11 @@ class Alignments return hasClipTail ; } + void SetStrandedLib(int libtype) + { + strandedLib = libtype ; + } + int Next() { int i ; @@ -393,7 +402,7 @@ class Alignments // -1:minus, 0: unknown, 1:plus int GetStrand() { - if ( segCnt == 1 ) + if ( segCnt == 1 && strandedLib == 0) return 0 ; if ( bam_aux_get( b, "XS" ) ) { @@ -402,6 +411,18 @@ class Alignments else return 1 ; } + else if (strandedLib != 0) + { + int flag = b->core.flag ; + if (flag & 0x80 == 0) // first read or single-end case + { + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? 1 : -1 ; + } + else + { + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? -1 : 1 ; + } + } else return 0 ; } diff --git a/classes.cpp b/classes.cpp index ed10043..08d1cb2 100644 --- a/classes.cpp +++ b/classes.cpp @@ -22,6 +22,7 @@ char usage[] = "./classes [OPTIONS]:\n" "\t-f FLOAT: filter the transcript from the gene if its abundance is lower than the given number percent of the most abundant one. (default: 0.05)\n" "\t-d FLOAT: filter the transcript whose average read depth is less than the given number. (default: 2.5)\n" "\t--ls STRING: path to the file of the list of single-sample subexon files. (default: not used)\n" + "\t--stranded STRING: un/rf/fr for library unstranded/fr-firstand/fr-secondstrand (default: not used)\n" "\t--hasMateIdSuffix: the read id has suffix such as .1, .2 for a mate pair. (default: false)\n" "\t--maxDpConstraintSize: the maximum number of subexons a constraint can cover in dynamic programming. (default: 7; -1 for inf)\n" "\t--primaryParalog: use primary alignment to retain paralog genes instead of unique alignments. (default: not used)\n" @@ -35,6 +36,7 @@ static struct option long_options[] = { "hasMateIdSuffix", no_argument, 0, 10002 }, { "primaryParalog", no_argument, 0, 10003 }, { "maxDpConstraintSize", required_argument, 0, 10004 }, + { "stranded", required_argument, 0, 10005 }, { (char *)0, 0, 0, 0} } ; @@ -114,6 +116,7 @@ int main( int argc, char *argv[] ) bool hasMateReadIdSuffix = false ; bool usePrimaryAsUnique = false ; int maxDpConstraintSize = 7 ; + int strandedLib = 0 ; std::vector alignmentFiles ; SubexonCorrelation subexonCorrelation ; @@ -191,6 +194,13 @@ int main( int argc, char *argv[] ) { maxDpConstraintSize = atoi(optarg) ; } + else if ( c == 10005 ) // stranded + { + if (!strcmp(optarg, "rf")) + strandedLib = 1 ; + else if (!strcmp(optarg, "fr")) + strandedLib = 2 ; + } else { printf( "%s", usage ) ; @@ -208,6 +218,13 @@ int main( int argc, char *argv[] ) exit( 1 ) ; } + if (strandedLib != 0) + { + size = alignmentFiles.size() ; + for ( i = 0 ; i < size ; ++i ) + alignmentFiles[i].SetStrandedLib(strandedLib) ; + } + if ( alignmentFiles.size() < 50 ) { diff --git a/psiclass b/psiclass index 881a8dc..40977ad 100755 --- a/psiclass +++ b/psiclass @@ -21,6 +21,7 @@ die "Usage: ./psiclass [OPTIONS]\n". "\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n". "\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n". "\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n". + "\t--stranded STRING: un/rf/fr for library unstranded/fr-firstand/fr-secondstrand (default: not used)\n". "\t--maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf)\n". "\t--bamGroup STRING: path to the file listing the group id of BAMs in the --lb file (default: not used)\n". "\t--primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments)\n". @@ -170,9 +171,15 @@ for ( $i = 0 ; $i < @ARGV ; ++$i ) $combineSubexonsOpt .= "-q ".$ARGV[$i + 1] ; ++$i ; } + elsif ( $ARGV[$i] eq "--stranded" ) + { + $juncOpt .= " --stranded ".$ARGV[$i + 1] ; + $classesOpt .= " --stranded ".$ARGV[$i + 1] ; + ++$i ; + } elsif ( $ARGV[$i] eq "--version" ) { - die "PsiCLASS v1.0.2\n" ; + die "PsiCLASS v1.0.3\n" ; } else { From d32242799a68877f62927c367dbc383e47be4d69 Mon Sep 17 00:00:00 2001 From: Li Date: Thu, 5 May 2022 22:16:58 -0400 Subject: [PATCH 2/2] Fix an issue of wrong op precedence --- FindJunction.cpp | 7 +++---- alignments.hpp | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/FindJunction.cpp b/FindJunction.cpp index 8705a07..f1c589b 100644 --- a/FindJunction.cpp +++ b/FindJunction.cpp @@ -747,14 +747,13 @@ char GetStrandFromStrandedLib(int flag) { if (strandedLib == 0) return '?' ; - - if (flag & 0x80 == 0) // first read or single-end case + if ((flag & 0x80) == 0) // first read or single-end case { - return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? '+' : '-' ; + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? '-' : '+' ; } else { - return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? '-' : '+' ; + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? '+' : '-' ; } } diff --git a/alignments.hpp b/alignments.hpp index 1ff9b40..855d026 100644 --- a/alignments.hpp +++ b/alignments.hpp @@ -414,13 +414,13 @@ class Alignments else if (strandedLib != 0) { int flag = b->core.flag ; - if (flag & 0x80 == 0) // first read or single-end case + if ((flag & 0x80) == 0) // first read or single-end case { - return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? 1 : -1 ; + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? -1 : 1 ; } else { - return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? -1 : 1 ; + return (((flag >> 4) & 1) ^ (strandedLib & 1)) ? 1 : -1 ; } } else