diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index 2d6a5b241..a5b38820a 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -56,6 +56,7 @@ extern int lca(int argc, const char **argv, const Command& command); extern int profile2pssm(int argc, const char **argv, const Command& command); extern int profile2cs(int argc, const char **argv, const Command& command); extern int offsetalignment(int argc, const char **argv, const Command& command); +extern int proteinaln2nucl(int argc, const char **argv, const Command& command); extern int apply(int argc, const char **argv, const Command& command); extern int shellcompletion(int argc, const char **argv, const Command& command); diff --git a/src/mmseqs.cpp b/src/mmseqs.cpp index 7a1d6fe6f..63376cc82 100644 --- a/src/mmseqs.cpp +++ b/src/mmseqs.cpp @@ -253,6 +253,12 @@ std::vector commands = { "Martin Steinegger ", " ", CITATION_MMSEQS2}, + {"proteinaln2nucl", proteinaln2nucl, &par.onlythreads, COMMAND_DB, + "Map protein alignment to nucleotide alignment", + NULL, + "Martin Steinegger ", + " ", + CITATION_MMSEQS2}, {"tsv2db", tsv2db, &par.tsv2db, COMMAND_DB, "Turns a TSV file into a MMseqs database", NULL, diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index de1a118a5..953701d08 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -42,5 +42,6 @@ set(util_source_files util/swapresults.cpp util/translatenucs.cpp util/tsv2db.cpp + util/proteinaln2nucl.cpp PARENT_SCOPE ) diff --git a/src/util/proteinaln2nucl.cpp b/src/util/proteinaln2nucl.cpp new file mode 100644 index 000000000..df4fe291b --- /dev/null +++ b/src/util/proteinaln2nucl.cpp @@ -0,0 +1,99 @@ +// +// Created by mad on 11/28/17. +// +#include +#include +#include +#include +#include + +#include "Alignment.h" +#include "Util.h" +#include "Parameters.h" +#include "Matcher.h" +#include "PrefilteringIndexReader.h" + +#include "Debug.h" +#include "DBReader.h" +#include "DBWriter.h" + +#ifdef OPENMP +#include +#endif + + +int proteinaln2nucl(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, 2); + + Debug(Debug::INFO) << "Alignment database: " << par.db1 << "\n"; + DBReader alnDbr(par.db1.c_str(), par.db1Index.c_str()); + alnDbr.open(DBReader::LINEAR_ACCCESS); + + DBWriter resultWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads); + resultWriter.open(); + Debug(Debug::INFO) << "Start writing file to " << par.db2 << "\n"; + +#pragma omp parallel for schedule(dynamic, 10) + for (size_t i = 0; i < alnDbr.getSize(); i++) { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + + unsigned int alnKey = alnDbr.getDbKey(i); + char *data = alnDbr.getData(i); + + std::string querySeq; + + char buffer[1024]; + std::string ss; + ss.reserve(1024); + + + std::vector results = Matcher::readAlignmentResults(data, true); + for (size_t j = 0; j < results.size(); j++) { + Matcher::result_t &res = results[j]; + bool hasBacktrace = (res.backtrace.size() > 0); + std::string newBacktrace; + newBacktrace.reserve(1024); + res.dbStartPos = res.dbStartPos*3; + res.dbEndPos = res.dbEndPos*3; + res.dbLen = res.dbLen*3; + res.qStartPos = res.qStartPos*3; + res.qEndPos = res.qEndPos*3; + res.qLen = res.qLen*3; + for (size_t pos = 0; pos < res.backtrace.size(); pos++) { + int cnt=0; + if(isdigit(res.backtrace[pos])){ + cnt += Util::fast_atoi(res.backtrace.c_str()+pos); + while(isdigit(res.backtrace[pos])){ + pos++; + } + } + switch(res.backtrace[pos]){ + case 'M': + case 'D': + case 'I': + newBacktrace.append(std::string(cnt*3, res.backtrace[pos])); + break; + + } + + } + res.backtrace = newBacktrace; + + size_t len = Matcher::resultToBuffer(buffer, res, hasBacktrace, true); + ss.append(buffer, len); + } + + resultWriter.writeData(ss.c_str(), ss.length(), alnKey, thread_idx); + ss.clear(); + } + alnDbr.close(); + resultWriter.close(); + Debug(Debug::INFO) << "Done." << "\n"; + + return EXIT_SUCCESS; +} +