Skip to content

Commit

Permalink
Fix bug in BAM reads count, refactore
Browse files Browse the repository at this point in the history
Excluded chromosomes should not influence
on mapped reads number. So when BAM statistics
is calculated the results are the following:
1. Total = all reads, includeing excluded chrms
2. Excluded = reads from excluded chrms
3. Aligned = Total - Excluded - NotAligned
4. NotAligned are all the reads which satisfy one
   of the criterias:
   - not mapped (0x4)
   - not primary alignment (0x100)
   - PCR or optical duplicate (0x400)
   - is paired (0x1), but is not proper pair (0x2)
   - is paired (0x1), but mate is unmapped (0x8)
We don't need to check if paired-end reads have
correct alignment or if they are on opposite strands
etc, because proper pair (0x2) flag set by aligner
is enough to make the conclusion that everything
ok with that read (tested with STAR).

Refactore:
- excluded chromosomes are completely skipped at the
  beginning of the program. We don't load them at all
  • Loading branch information
michael-kotliar committed Jan 3, 2018
1 parent 6241114 commit 26c6855
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 104 deletions.
3 changes: 2 additions & 1 deletion include/annotation_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,9 @@ std::map <string, std::map <string, Isoform> > group_by_gene (const std::map <st

// global_annotation_map_ptr : key - chromosome name, value - multimap of annotations, sorted by not-unique key - start pose of annotation
// NOTE : forward list of annotations should be sorted by start pose with rule a<b
// NOTE : load_annotation function skip all the chromosomes, which are not present in chromosome_info_map (not present in BAM file)
bool load_annotation (const string & full_path_name,
const string & exclude_chr_coma_sep_list,
const std::map <string, pair <int, int> > & chromosome_info_map,
std::map <string, multimap <long, GffRecordPtr> > & global_annotation_map_ptr,
std::map <string, std::map <string, Isoform> > & iso_var_map);

Expand Down
10 changes: 6 additions & 4 deletions include/bam_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,19 @@ struct BamGeneralInfo {
long total;
long not_aligned;
long aligned;
long excluded;
BamGeneralInfo ():
total (0),
not_aligned (0),
aligned (0)
aligned (0),
excluded (0)
{
}
void operator = (const BamGeneralInfo &other ) {
total = other.total;
not_aligned = other.not_aligned;
aligned = other.aligned;
excluded = other.excluded;
}
};

Expand Down Expand Up @@ -93,12 +96,11 @@ void reset_saved_reads ();

void print_ref_info (const std::map <string, pair <int, int> > & info_map);

std::map <string, pair <int, int> > get_chromosome_map_info (const BamReader & reader);
std::map <string, pair <int, int> > get_chromosome_map_info (const BamReader & reader, const vector<string> & exclude_chr);

bool make_index (BamReader & bam_reader);
bool flag_check (BamAlignment & al, BamGeneralInfo & bam_general_info, bool dUTP);
bool flag_check (BamAlignment & al, bool dUTP);
void get_bam_info(BamReader & bam_reader, BamGeneralInfo & bam_general_info);
void get_bam_info(BamReader & bam_reader, BamGeneralInfo & bam_general_info, const std::map <string, pair <int, int> > & chromosome_info_map);



Expand Down
14 changes: 4 additions & 10 deletions src/annotation_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,8 +443,9 @@ bool is_duplicate (const Isoform & original_isoform, const Isoform & new_isoform

// global_annotation_map_ptr : key - chromosome name, value - multimap of annotations, sorted by not-unique key - start pose of annotation
// NOTE : forward list of annotations should be sorted by start pose with rule a<b
// NOTE : load_annotation function skip all the chromosomes, which are not present in chromosome_info_map (not present in BAM file)
bool load_annotation (const string & full_path_name,
const string & exclude_chr_coma_sep_list,
const std::map <string, pair <int, int> > & chromosome_info_map,
std::map <string, multimap <long, GffRecordPtr> > & global_annotation_map_ptr,
std::map <string, std::map <string, Isoform> > & iso_var_map){
ifstream input_stream (full_path_name);
Expand All @@ -453,13 +454,6 @@ bool load_annotation (const string & full_path_name,
return false;
}

vector<string> exclude_chr;
try {
exclude_chr = split_line( boost::to_lower_copy(exclude_chr_coma_sep_list), "," );
} catch (...){
cerr << "Cannot interpret exclude chromosome list [" << exclude_chr_coma_sep_list << "]. Set to default empty value" << endl;
}

// check if annotation input is in gtf format
bool gtf = false;
string annotation_file_ext (full_path_name, full_path_name.find_last_of('.')+1);
Expand All @@ -484,8 +478,8 @@ bool load_annotation (const string & full_path_name,
continue;
}

if (std::find(exclude_chr.begin(), exclude_chr.end(), boost::to_lower_copy(current_isoform.chrom)) != exclude_chr.end()){
cout << "Skip chromosome [" << current_isoform.chrom << "]" << endl;
if ( chromosome_info_map.count(current_isoform.chrom) != 1 ){
// cout << "Skip excluded chromosome [" << current_isoform.chrom << "]" << endl;
continue;
}

Expand Down
153 changes: 69 additions & 84 deletions src/bam_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,51 +101,25 @@ list <BamRecordPtr> split_to_single_reads (const BamAlignment & current_alignmen
return single_read_array;
}

bool flag_check (BamAlignment & al, BamGeneralInfo & bam_general_info, bool dUTP){
bam_general_info.total++;
if( al.IsMapped() && al.IsPrimaryAlignment() && (!al.IsDuplicate()) ) {
if (al.IsPaired()){
if(al.IsProperPair() && al.IsMateMapped() ) {
// The only possible situation when it may happen is when read was mapped on the wrong strand
if( (!(al.IsReverseStrand() ^ al.IsMateReverseStrand()) ) || // both mates came from the same strand. it shouldn't be like this
( (al.Position < al.MatePosition) && al.IsReverseStrand() ) ||
( (al.MatePosition < al.Position) && al.IsMateReverseStrand() )
) {
bam_general_info.not_aligned++;
return false;
}

// // It looks like when we use DUTP we count strand only for the first mate in pair
// // we artificially change the strand of second mate pair to be equal the strand of main mate in a pair
// // TODO Think if we really should do like this?
// // TODO Do we need to check any other conditions?
if (dUTP){
if(al.IsMateMapped() && al.IsSecondMate()) {
if(al.IsReverseStrand()) {
al.SetIsReverseStrand(false);
} else {
al.SetIsReverseStrand(true);
}
}
}
} else {
bam_general_info.not_aligned++;
return false;
}
}
} else {
bam_general_info.not_aligned++;
bool flag_check (BamAlignment & al, bool dUTP){
if (!al.IsMapped() || !al.IsPrimaryAlignment() || al.IsDuplicate()){
return false;
}
if (al.IsPaired() && (!al.IsProperPair() || !al.IsMateMapped()) ){
return false;
}
// TODO Think if we really should do like this?
// TODO check if it switch it back when is called twice
if (dUTP && al.IsPaired() && al.IsSecondMate()){
if(al.IsReverseStrand()) {
al.SetIsReverseStrand(false);
} else {
al.SetIsReverseStrand(true);
}
}
return true;
}

bool flag_check (BamAlignment & al, bool dUTP){
BamGeneralInfo bam_general_info;
return flag_check (al, bam_general_info, dUTP);
}



void put_bam_record_back (BamRecordPtr bam_record){
if( ! saved_reads_tls.get() ) {
Expand All @@ -160,7 +134,7 @@ void reset_saved_reads (){
saved_reads_tls.reset( new list <BamRecordPtr> );
}

// Gets the new read fro the BAM file through BamReader object
// Gets the new read from the BAM file through BamReader object
bool get_bam_record (BamReader & bam_reader, BamRecordPtr & bam_record, int min_read_segment_length, bool dUTP, bool freeze){

if( ! saved_reads_tls.get() ) {
Expand Down Expand Up @@ -205,12 +179,17 @@ void print_ref_info (const std::map <string, pair <int, int> > & info_map){
}


std::map <string, pair <int, int> > get_chromosome_map_info (const BamReader & reader){
std::map <string, pair <int, int> > get_chromosome_map_info (const BamReader & reader, const vector<string> & exclude_chr){
std::map <string, pair <int, int> > output_map;
RefVector ref_data_vector = reader.GetReferenceData();
for (int i = 0; i < ref_data_vector.size(); i++ ){
RefData current_ref_data = ref_data_vector[i];
string chrom_name = current_ref_data.RefName;
if (std::find(exclude_chr.begin(), exclude_chr.end(), boost::to_lower_copy(chrom_name)) != exclude_chr.end()){
cerr << "Skip excluded chromosome [" << chrom_name << "]" << endl;
continue;
}

int ref_id = reader.GetReferenceID(chrom_name);
if (ref_id == -1){
throw ("BUG: it shouldn't be like this");
Expand Down Expand Up @@ -250,55 +229,61 @@ bool make_index (BamReader & bam_reader){



//
//bool load_from_file (const string & full_filename, BamGeneralInfo & bam_general_info){
// BamGeneralInfo new_info;
// ifstream input_stream (full_filename);
// if (!input_stream) {
// cerr << "Cannot open file " << full_filename << endl;
// return false;
// }
// string line;
// while (getline(input_stream, line)) {
// if (include_key(line, "Number of input reads")){
// if (not str_to_long(new_info.total, split_line(line)[1]) ){
// return false;
// }
// }
// if (include_key(line, "Uniquely mapped reads number")){
// if (not str_to_long(new_info.aligned, split_line(line)[1]) ){
// return false;
// }
// }
// }
// if (new_info.total == 0 || new_info.aligned == 0){
// return false;
// }
// new_info.not_aligned = new_info.total - new_info.aligned;
// bam_general_info = new_info;
// return true;
//}


bool load_from_file (const string & full_filename, BamGeneralInfo & bam_general_info){
BamGeneralInfo new_info;
ifstream input_stream (full_filename);
if (!input_stream) {
cerr << "Cannot open file " << full_filename << endl;
return false;
void get_bam_info(BamReader & bam_reader, BamGeneralInfo & bam_general_info, const std::map <string, pair <int, int> > & chromosome_info_map){
// Refactore chromosome_info_map to allow easy search by RefIf
std::map <int, string> allowed_chr_ids; // < RefID, chromosome name >
for (auto it = chromosome_info_map.begin(); it != chromosome_info_map.end(); ++it){
allowed_chr_ids[it->second.first] = it->first;
}
string line;
while (getline(input_stream, line)) {
if (include_key(line, "Number of input reads")){
if (not str_to_long(new_info.total, split_line(line)[1]) ){
return false;
}
}
if (include_key(line, "Uniquely mapped reads number")){
if (not str_to_long(new_info.aligned, split_line(line)[1]) ){
return false;
}
}
}
if (new_info.total == 0 || new_info.aligned == 0){
return false;
}
new_info.not_aligned = new_info.total - new_info.aligned;
bam_general_info = new_info;
return true;
}


void get_bam_info(BamReader & bam_reader, BamGeneralInfo & bam_general_info){
// check if we can get it from the STAR output Log.final.out in the same folder.
// If not - iterate over the file
string log_filename_sufix = "Log.final.out"; // TODO put it as parameter
string bam_filename (bam_reader.GetFilename(), bam_reader.GetFilename().find_last_of('/')+1);
string bam_wo_ext (bam_filename, 0, bam_filename.find_last_of('.')+1);
string path (bam_reader.GetFilename(), 0, bam_reader.GetFilename().find_last_of('/')+1);
string full_log_filename = path + bam_wo_ext + log_filename_sufix;
if (not load_from_file (full_log_filename, bam_general_info)){
cerr << "Couldn't find any file with mapping statistics. Calculating ..." << endl;
BamAlignment al;
while (bam_reader.GetNextAlignment(al)){
flag_check (al, bam_general_info, false);
cerr << "Calculating mapping statistics" << endl;
BamAlignment al;
while (bam_reader.GetNextAlignment(al)){
bam_general_info.total++;
if ( allowed_chr_ids.count(al.RefID) != 1){
bam_general_info.excluded++;
continue;
}
bam_general_info.aligned = bam_general_info.total - bam_general_info.not_aligned;
bam_reader.Rewind();
if (not flag_check (al, false)){
bam_general_info.not_aligned++;
};
}
bam_general_info.aligned = bam_general_info.total - bam_general_info.not_aligned - bam_general_info.excluded;
bam_reader.Rewind();

cerr << "BAM alignment statistics:" << endl;
cerr << " Total: " << bam_general_info.total << endl;
cerr << " Excluded: " << bam_general_info.excluded << endl;
cerr << " Aligned: " << bam_general_info.aligned << endl;
cerr << " Not aligned: " << bam_general_info.not_aligned << endl;
}
19 changes: 14 additions & 5 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,21 @@ int main(int argc, char **argv) {
return 0;
} else cerr << "Open " << bam_reader.GetFilename() << endl;


// Get a vector of chromosomes to exclude from calculation
vector<string> exclude_chr;
try {
exclude_chr = split_line( boost::to_lower_copy(params["exclude"].as<std::string>()), "," );
} catch (...){
cerr << "Cannot interpret exclude chromosome list [" << params["exclude"].as<std::string>() << "]. Set to default empty value" << endl;
}

// key - chromosome name, value - <RefId, Length> for corresponding chromosome from the BAM file
// chromosome_info_map - saves correspondence between chromosome name and RefId from the BamReader object
std::map <string, pair <int, int> > chromosome_info_map = get_chromosome_map_info (bam_reader);
// chromosome_info_map doesn't include excluded chromosomes
std::map <string, pair <int, int> > chromosome_info_map = get_chromosome_map_info (bam_reader, exclude_chr);

// print_ref_info (chromosome_info_map); // Only for DEBUG
// print_ref_info (chromosome_info_map); // Only for DEBUG

// Check if current bam file is indexed (and that index data is loaded into program)
if (not make_index(bam_reader)){
Expand All @@ -148,10 +158,9 @@ int main(int argc, char **argv) {
cerr << "Gathering info about bam file" << endl;
BamGeneralInfo bam_general_info;
// Need to run through the whole bam file, because when we align reads according to exons, we can skip some of its parts
get_bam_info (bam_reader, bam_general_info);
get_bam_info (bam_reader, bam_general_info, chromosome_info_map);
bam_reader.Close();


// read from tab delimited file
// map < chromosome_key, multimap < exon_start_pose, exon_pointer> >
// exon_start_pose - is needed for sorting mutlimap by the start pose of exon; we cannot do if we add only pointers
Expand All @@ -160,7 +169,7 @@ int main(int argc, char **argv) {

// map to save <chromosome name, <isoform name, correspondent Isoform object> >
std::map <string, std::map <string, Isoform> > iso_var_map;
if (not load_annotation (params["annotation"].as<std::string>(), params["exclude"].as<std::string>(), global_annotation_map_ptr, iso_var_map)){
if (not load_annotation (params["annotation"].as<std::string>(), chromosome_info_map, global_annotation_map_ptr, iso_var_map)){
return 0;
}

Expand Down

0 comments on commit 26c6855

Please sign in to comment.