From e248d2cec3a0dc8c0c291da6f19b2abdd211d80e Mon Sep 17 00:00:00 2001 From: Alison Tang Date: Mon, 7 Feb 2022 21:21:53 -0800 Subject: [PATCH] samjuncs update --- README.md | 4 ++-- bin/samJuncs.py | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6888ffe..a287d81 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Alternatively, the user can align the reads themselves with their aligner of cho **Usage:** ```sh -python flair.py align -g genome.fa -r | [options] +python flair.py align -g genome.fa -r || [options] ``` run with `--help` for a description of optional arguments. Outputs (1) `sam` of raw aligned reads and (2) smoothed `bed12` file of aligned reads to be supplied to flair-correct. @@ -74,7 +74,7 @@ Alternatively, the `-j` argument for flair-correct can also be generated using S ### flair collapse Defines high-confidence isoforms from corrected reads. As FLAIR does not use annotations to collapse isoforms, FLAIR will pick the name of a read that shares the same splice junction chain as the isoform to be the isoform name. It is recommended to still provide an annotation with `-f`, which is used to rename FLAIR isoforms that match isoforms in existing annotation according to the transcript_id field in the gtf. Intermediate files generated by this step are removed by default, but can be retained for debugging purposes by supplying the argument `--keep_intermediate` and optionally supplying a directory to keep those files with `--temp_dir`. -If there are multiple samples to be compared, the flair-corrected read `psl` files should be concatenated prior to running flair-collapse. In addition, all raw read fastq/fasta files should either be specified after `-r` with space/comma separators or concatenated into a single file. +If there are multiple samples to be compared, the flair-corrected read `bed` or `psl` files should be concatenated prior to running flair-collapse. In addition, all raw read fastq/fasta files should either be specified after `-r` with space/comma separators or concatenated into a single file. **Usage:** ```sh diff --git a/bin/samJuncs.py b/bin/samJuncs.py index 986541f..9d1382c 100644 --- a/bin/samJuncs.py +++ b/bin/samJuncs.py @@ -73,7 +73,7 @@ class SAM(object): Handles sequence alignment format file input and output. ''' - def __init__(self, inFile=None, isHISAT=False, fetch=''): + def __init__(self, inFile=None, isHISAT=False, fetch='', keep_supplementary=False): # Attributes self.inFile = inFile self.fetch = fetch @@ -87,6 +87,7 @@ def __init__(self, inFile=None, isHISAT=False, fetch=''): sys.exit(1) self.strandInfo = {0:'+', 16:'-'} + self.supplementary_strandInfo = {2048:'+', 2064:'-'} #print(self.reader.find_introns((read for read in self.reader.fetch() if read.is_reverse))) #sys.exit(1) @@ -96,6 +97,8 @@ def __init__(self, inFile=None, isHISAT=False, fetch=''): else: self.inferJuncStrand = self.inferMM2JuncStrand + self.keep_supplementary = keep_supplementary + def inferMM2JuncStrand(self, read): # minimap gives junction strand denoted as 'ts' # the sign corresponds to the alignment orientation, where + agrees and - disagrees @@ -162,14 +165,17 @@ def readJuncs(self): ''' Returns start, end and junctions from a single read. ''' - + allskipped={} for read in self.reader.fetch(): try: # Skip unmapped or multimapped reads. strand = self.strandInfo[read.flag] except: - continue + if self.keep_supplementary and read.flag in self.supplementary_strandInfo: + strand = self.supplementary_strandInfo[read.flag] + else: + continue qName = read.query_name chromosome = read.reference_name @@ -189,7 +195,6 @@ def readJuncs(self): orientation = read.flag juncDir = self.inferJuncStrand(read) - for num, flagTuple in enumerate(cigar,1): flag, length = flagTuple if flag not in [0,2,3,7,8]: @@ -201,6 +206,7 @@ def readJuncs(self): refPos = refEnd+length refEnd = refPos + # Last is the end rend = refEnd @@ -210,7 +216,6 @@ def readJuncs(self): def runCMD(x): fetch, alignType, bam = x bObj = SAM(bam, alignType, fetch) - print(bObj) return bObj.countJuncs() @@ -238,7 +243,7 @@ def main(): #results = p.imap_unordered(runCMD, tqdm(referenceIDs, desc="Parsing BAM for junctions", total=len(referenceIDs))) results = p.map(runCMD, referenceIDs) - print(results) + # print(results[0])