Skip to content

Commit

Permalink
Merge commit 'a52a508cfcdeca043278823a5528cc2d2afd8a26'
Browse files Browse the repository at this point in the history
  • Loading branch information
elileka committed Oct 18, 2020
2 parents 3bb93d6 + a52a508 commit 586b284
Show file tree
Hide file tree
Showing 25 changed files with 366 additions and 323 deletions.
29 changes: 17 additions & 12 deletions lib/mmseqs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,7 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES "Intel")
endif ()

# see https://wiki.debian.org/ArchitectureSpecificsMemo for char signedness

set(MMSEQS_CXX_FLAGS "-std=c++1y -fsigned-char")
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)")
set(ARM 1)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "PPC64*|ppc64*|powerpc64*")
set(PPC64 1)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|AMD64")
set(X64 1)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "x86|X86")
set(X86 1)
else ()
message(WARNING "CPU without native SIMD instructions. Performance will be bad.")
endif ()

# SIMD instruction sets support
set(MMSEQS_ARCH "")
Expand All @@ -65,20 +53,37 @@ if (HAVE_AVX2)
else ()
set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx2 -mcx16 -Wa,-q")
endif ()
set(X64 1)
elseif (HAVE_SSE4_1)
set(MMSEQS_ARCH "${MMSEQS_ARCH} -msse4.1 -mcx16")
set(X64 1)
elseif (HAVE_SSE2)
set(MMSEQS_ARCH "${MMSEQS_ARCH} -msse2")
set(DISABLE_IPS4O 1)
set(X64 1)
elseif (HAVE_POWER9)
set(MMSEQS_ARCH "${MMSEQS_ARCH} -mcpu=power9 -mvsx")
set(PPC64 1)
elseif (HAVE_POWER8)
set(MMSEQS_ARCH "${MMSEQS_ARCH} -mcpu=power8 -mvsx")
set(PPC64 1)
elseif (HAVE_ARM8)
set(MMSEQS_ARCH "${MMSEQS_ARCH} -march=armv8-a+simd")
set(ARM 1)
endif ()

if (NATIVE_ARCH AND (MMSEQS_ARCH STREQUAL ""))
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*|aarch64.*|AARCH64.*)")
set(ARM 1)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "PPC64*|ppc64*|powerpc64*")
set(PPC64 1)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|AMD64")
set(X64 1)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "x86|X86")
set(X86 1)
else ()
message(WARNING "CPU without native SIMD instructions. Performance will be bad.")
endif ()
if (EMSCRIPTEN)
set(MMSEQS_ARCH "-msimd128 -s WASM=1 -s ASSERTIONS=1")
elseif (X86 OR X64)
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ jobs:
deb [arch=$ARCH] http://ports.ubuntu.com/ubuntu-ports/ focal-security main universe multiverse
HEREDOC
sudo apt-get update
sudo apt-get -y install crossbuild-essential-$ARCH zlib1g-dev:$ARCH libbz2-dev:$ARCH
sudo apt-get -y install -o APT::Immediate-Configure=false crossbuild-essential-$ARCH zlib1g-dev:$ARCH libbz2-dev:$ARCH
displayName: Install Toolchain
- script: |
mkdir build && cd build
Expand Down
10 changes: 5 additions & 5 deletions lib/mmseqs/data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ downloadFile() {
aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$FILENAME" -d "$DIR" "$URL" && return 0
;;
CURL)
curl -o "$OUTPUT" "$URL" && return 0
curl -L -o "$OUTPUT" "$URL" && return 0
;;
WGET)
wget -O "$OUTPUT" "$URL" && return 0
Expand Down Expand Up @@ -175,10 +175,10 @@ case "${SELECTION}" in
"eggNOG")
if notExists "${TMP_PATH}/download.done"; then
date "+%s" > "${TMP_PATH}/version"
downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/2/2_raw_algs.tar" "${TMP_PATH}/bacteria"
downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/2157/2157_raw_algs.tar" "${TMP_PATH}/archea"
downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/2759/2759_raw_algs.tar" "${TMP_PATH}/eukaryota"
downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/10239/10239_raw_algs.tar" "${TMP_PATH}/viruses"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2/2_raw_algs.tar" "${TMP_PATH}/bacteria"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2157/2157_raw_algs.tar" "${TMP_PATH}/archea"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/2759/2759_raw_algs.tar" "${TMP_PATH}/eukaryota"
downloadFile "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/10239/10239_raw_algs.tar" "${TMP_PATH}/viruses"
touch "${TMP_PATH}/download.done"
fi
INPUT_TYPE="eggNOG"
Expand Down
4 changes: 2 additions & 2 deletions lib/mmseqs/data/workflow/searchslicedtargetprofile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
fi

# predict NUM_SEQS_THAT_SATURATE as the average number of prefilter results per profile in previous steps
# this allows to increase NUM_PROFS_IN_STEP
# this allows one to increase NUM_PROFS_IN_STEP
if [ "${NUM_PREF_RESULTS_IN_ALL_PREV_STEPS}" -gt 0 ]; then
# BE MORE CAUTIOUS?
NUM_PROFS_PROCESSED="$((FIRST_INDEX_LINE-1))"
Expand Down Expand Up @@ -126,7 +126,7 @@ while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
# align current step chunk
if notExists "${TMP_PATH}/aln.done"; then
# shellcheck disable=SC2086
${RUNNER} "$MMSEQS" align "${PROFILEDB}" "${INPUT}" "${TMP_PATH}/pref" "${TMP_PATH}/aln" ${ALIGNMENT_PAR} \
${RUNNER} "$MMSEQS" "${ALIGN_MODULE}" "${PROFILEDB}" "${INPUT}" "${TMP_PATH}/pref" "${TMP_PATH}/aln" ${ALIGNMENT_PAR} \
|| fail "align died"
# shellcheck disable=SC2086
"$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY}
Expand Down
1 change: 1 addition & 0 deletions lib/mmseqs/lib/microtar/microtar.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ struct mtar_t {
int (*close)(mtar_t *tar);
void *stream;
size_t curr_size;
int isFinished;
};


Expand Down
4 changes: 2 additions & 2 deletions lib/mmseqs/src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,7 @@ std::vector<Command> baseCommands = {
{"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb },
{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb }}},
{"subtractdbs", subtractdbs, &par.subtractdbs, COMMAND_SET,
"Remove all entries from first DB occuring in second DB by key",
"Remove all entries from first DB occurring in second DB by key",
NULL,
"Martin Steinegger <[email protected]>",
"<i:resultDBLeft> <i:resultDBRight> <o:resultDB>",
Expand Down Expand Up @@ -930,7 +930,7 @@ std::vector<Command> baseCommands = {
{"targetOrfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
{"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
{"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
{"proteinaln2nucl", proteinaln2nucl, &par.threadsandcompression,COMMAND_RESULT,
{"proteinaln2nucl", proteinaln2nucl, &par.proteinaln2nucl, COMMAND_RESULT,
"Transform protein alignments to nucleotide alignments",
NULL,
"Martin Steinegger <[email protected]> ",
Expand Down
12 changes: 0 additions & 12 deletions lib/mmseqs/src/alignment/Alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,6 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
swResults.reserve(300);
std::vector<Matcher::result_t> swRealignResults;
swRealignResults.reserve(300);
std::vector<hit_t> shortResults;
shortResults.reserve(300);

#pragma omp for schedule(dynamic, 5) reduction(+: alignmentsNum, totalPassedNum)
for (size_t id = start; id < (start + bucketSize); id++) {
Expand Down Expand Up @@ -400,9 +398,6 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
computeAlternativeAlignment(queryDbKey, dbSeq, swResults, matcher, evalThr, swMode, thread_idx);
}

if(wrappedScoring && shortResults.size() > 1)
SORT_SERIAL(shortResults.begin(), shortResults.end(), hit_t::compareHitsByScoreAndId);

// write the results
if(swResults.size() > 1)
SORT_SERIAL(swResults.begin(), swResults.end(), Matcher::compareHits);
Expand Down Expand Up @@ -446,17 +441,10 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
alnResultsOutString.append(buffer, len);
}

for (size_t result = 0; result < shortResults.size(); result++) {
size_t len = snprintf(buffer, 100, "%u\t%d\t%d\n", shortResults[result].seqId, shortResults[result].prefScore,
shortResults[result].diagonal);
alnResultsOutString.append(buffer, len);
}

dbw.writeData(alnResultsOutString.c_str(), alnResultsOutString.length(), queryDbKey, thread_idx);
alnResultsOutString.clear();
swResults.clear();
swRealignResults.clear();
shortResults.clear();
}
if (realign == true) {
delete realigner;
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/alignment/MsaFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ size_t MsaFilter::filter(MultipleAlignment::MSAResult &msa, std::vector<Matcher:
if (!alnResults.empty()) {
// alignmentResults does not include the query
for (size_t i = 0, j = 0; j < msa.setSize - 1; j++) {
if (keep[j] != 0) {
if (keep[j + 1] != 0) {
if (i < j) {
std::swap(alnResults[i], alnResults[j]);
}
Expand Down
52 changes: 26 additions & 26 deletions lib/mmseqs/src/alignment/PSSMCalculator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,40 +21,39 @@ PSSMCalculator::PSSMCalculator(BaseMatrix *subMat, size_t maxSeqLength, size_t m
this->matchWeight = (float *) malloc_simd_float(Sequence::PROFILE_AA_SIZE * (maxSeqLength + 1) * sizeof(float));
this->pseudocountsWeight = (float *) malloc_simd_float(Sequence::PROFILE_AA_SIZE * (maxSeqLength + 1) * sizeof(float));
this->nseqs = new int[maxSeqLength + 1];
const unsigned int NAA_VECSIZE = ((MultipleAlignment::NAA+ 3 + VECSIZE_INT - 1) / VECSIZE_INT) * VECSIZE_INT;
this->w_contrib = new float*[maxSeqLength + 1];
unsigned int NAA_ALIGNSIZE = ((((MultipleAlignment::NAA + 3) + VECSIZE_FLOAT - 1) / VECSIZE_FLOAT) * VECSIZE_FLOAT) * sizeof(float);
NAA_ALIGNSIZE = ((NAA_ALIGNSIZE + ALIGN_FLOAT - 1) / ALIGN_FLOAT) * ALIGN_FLOAT;
w_contrib = new float*[maxSeqLength + 1];
w_contrib_backing = (unsigned char*)mem_align(ALIGN_FLOAT, NAA_ALIGNSIZE * (maxSeqLength + 1));
for (size_t j = 0; j < (maxSeqLength + 1); j++) {
this->w_contrib[j] = (float *) malloc_simd_int(NAA_VECSIZE * sizeof(float));
w_contrib[j] = (float*)(w_contrib_backing + (NAA_ALIGNSIZE * j));
}
wi = (float*)malloc(maxSetSize * sizeof(float));
naa = new int[maxSeqLength + 1];
f = malloc_matrix<float>(maxSeqLength + 1, MultipleAlignment::NAA + 3);
n = new int*[maxSeqLength + 2];
for (size_t j = 0; j < maxSeqLength; j++) {
n[j] = (int *) malloc_simd_int(NAA_VECSIZE * sizeof(int));
n_backing = (unsigned char*)mem_align(ALIGN_INT, NAA_ALIGNSIZE * (maxSeqLength + 2));
for (size_t j = 0; j < (maxSeqLength + 2); j++) {
n[j] = (int*)(n_backing + (NAA_ALIGNSIZE * j));
}
this->pca = pca;
this->pcb = pcb;
}

PSSMCalculator::~PSSMCalculator() {
delete [] profile;
delete [] Neff_M;
delete[] profile;
delete[] Neff_M;
free(seqWeight);
delete [] pssm;
delete [] nseqs;
delete[] pssm;
delete[] nseqs;
free(matchWeight);
free(pseudocountsWeight);
for (size_t j = 0; j < (maxSeqLength + 1); j++) {
free(w_contrib[j]);
}
delete [] w_contrib;
free(w_contrib_backing);
delete[] w_contrib;
free(wi);
delete [] naa;
for (size_t j = 0; j < maxSeqLength; ++j){
free(n[j]);
}
delete [] n;
delete[] naa;
free(n_backing);
delete[] n;
free(f);
}

Expand Down Expand Up @@ -294,14 +293,15 @@ void PSSMCalculator::computeContextSpecificWeights(float * matchWeight, float *w
const int ENDGAP=22; //Important to distinguish because end gaps do not contribute to transition counts

int nseqi = 0;
unsigned int NAA_VECSIZE = ((MultipleAlignment::NAA+ 3 + VECSIZE_INT - 1) / VECSIZE_INT) * VECSIZE_INT; // round NAA+3 up to next multiple of VECSIZE_INT
for (size_t j = 0; j < queryLength; j++) {
memset(n[j], 0, sizeof(int) * NAA_VECSIZE);
}

for (size_t j = 0; j < queryLength; j++){
memset(w_contrib[j], 0, NAA_VECSIZE * sizeof(int));
}
// unsigned int NAA_VECSIZE = ((MultipleAlignment::NAA+ 3 + VECSIZE_INT - 1) / VECSIZE_INT) * VECSIZE_INT; // round NAA+3 up to next multiple of VECSIZE_INT
// for (size_t j = 0; j < queryLength; j++) {
// memset(n[j], 0, NAA_VECSIZE * sizeof(int));
// memset(w_contrib[j], 0, NAA_VECSIZE * sizeof(int));
// }
unsigned int NAA_ALIGNSIZE = ((((MultipleAlignment::NAA + 3) + VECSIZE_FLOAT - 1) / VECSIZE_FLOAT) * VECSIZE_FLOAT) * sizeof(float);
NAA_ALIGNSIZE = ((NAA_ALIGNSIZE + ALIGN_FLOAT - 1) / ALIGN_FLOAT) * ALIGN_FLOAT;
memset(n_backing, 0, NAA_ALIGNSIZE * queryLength);
memset(w_contrib_backing, 0, NAA_ALIGNSIZE * queryLength);
// insert endgaps
for (size_t k = 0; k < setSize; ++k) {
for (size_t i = 0; i < queryLength && X[k][i] == MultipleAlignment::GAP; ++i)
Expand Down
4 changes: 4 additions & 0 deletions lib/mmseqs/src/alignment/PSSMCalculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ class PSSMCalculator {

// weight contribution value for each sequence
float **w_contrib;
// backing aligned memory
unsigned char *w_contrib_backing;

// weight of sequence k in column i, calculated from subalignment i
float *wi;
Expand All @@ -75,6 +77,8 @@ class PSSMCalculator {
float **f;

int **n;
// backing aligned memory
unsigned char *n_backing;

size_t maxSeqLength;
size_t maxSetSize;
Expand Down
3 changes: 2 additions & 1 deletion lib/mmseqs/src/commons/Command.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ std::vector<int> DbValidator::allDb = {Parameters::DBTYPE_SEQTAXDB, Parameters::
Parameters::DBTYPE_OFFSETDB, Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_TAXONOMICAL_RESULT};
std::vector<int> DbValidator::allDbAndFlat = {Parameters::DBTYPE_SEQTAXDB, Parameters::DBTYPE_INDEX_DB, Parameters::DBTYPE_NUCLEOTIDES, Parameters::DBTYPE_MSA_DB,
Parameters::DBTYPE_HMM_PROFILE, Parameters::DBTYPE_AMINO_ACIDS, Parameters::DBTYPE_ALIGNMENT_RES,
Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_CLUSTER_RES, Parameters::DBTYPE_GENERIC_DB,
Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_PREFILTER_REV_RES, Parameters::DBTYPE_CLUSTER_RES,
Parameters::DBTYPE_OFFSETDB, Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_TAXONOMICAL_RESULT,
Parameters::DBTYPE_FLATFILE};
std::vector<int> DbValidator::csDb = {Parameters::DBTYPE_PROFILE_STATE_SEQ};
std::vector<int> DbValidator::ca3mDb = {Parameters::DBTYPE_CA3M_DB};
Expand Down
2 changes: 2 additions & 0 deletions lib/mmseqs/src/commons/CommandCaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ int CommandCaller::callProgram(const char* program, size_t argc, const char **ar
}

void CommandCaller::execProgram(const char* program, const std::vector<std::string> &argv) {
std::cerr.flush();
std::cout.flush();
// hack: our argv string does not contain a program name anymore, re-add it
const char **pArgv = new const char*[argv.size() + 2];
pArgv[0] = program;
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/commons/DBReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1106,7 +1106,7 @@ void copyLinkDb(const std::string &databaseName, const std::string &outDb, DBFil
if (idx != std::string::npos) {
ext = names[i].substr(idx);
} else {
Debug(Debug::ERROR) << "File extention was not found but it is expected to be there!\n"
Debug(Debug::ERROR) << "File extension was not found but it is expected to be there!\n"
<< "Filename: " << names[i] << ".\n";
EXIT(EXIT_FAILURE);
}
Expand Down
42 changes: 0 additions & 42 deletions lib/mmseqs/src/commons/DBWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,48 +110,6 @@ void DBWriter::sortDatafileByIdOrder(DBReader<unsigned int> &dbr) {

}

// Merges several databases into this writer, one entry per key of qdbr.
// For every entry of qdbr (in qdbr's id order) the data stored under the same
// key in each database of `files` is concatenated in the order of `files` and
// written as a single entry under that key.
//   qdbr     - supplies the keys to merge and the order in which they are written
//   files    - (first, second) name pairs handed to the DBReader constructor;
//              presumably (data file, index file) - TODO confirm
//   prefixes - prefixes[i], when it exists, is inserted before the data taken
//              from files[i]; may hold fewer elements than files
// An entry is written for every key of qdbr, even when none of the databases
// returned data for it (the written entry is then empty).
void DBWriter::mergeFiles(DBReader<unsigned int> &qdbr,
const std::vector<std::pair<std::string, std::string>>& files,
const std::vector<std::string>& prefixes) {
Debug(Debug::INFO) << "Merging the results to " << dataFileName << "\n";

// open one DBReader per input database (data + index, opened with NOSORT);
// the 1 passed to the constructor is presumably the thread count - TODO confirm
const size_t fileCount = files.size();
DBReader<unsigned int> **filesToMerge = new DBReader<unsigned int>*[fileCount];
for (size_t i = 0; i < fileCount; i++) {
filesToMerge[i] = new DBReader<unsigned int>(files[i].first.c_str(),
files[i].second.c_str(), 1, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
filesToMerge[i]->open(DBReader<unsigned int>::NOSORT);
}
// buffer reused for every key; cleared after each write
std::string result;

for (size_t id = 0; id < qdbr.getSize(); id++) {
unsigned int key = qdbr.getDbKey(id);
// collect the data stored under this key in every input database, in file order
for (size_t i = 0; i < fileCount; i++) {
// presumably NULL when the database has no entry for this key - TODO confirm
const char *data = filesToMerge[i]->getDataByDBKey(key, 0);
if (data != NULL) {
// a prefix is optional: only files with a matching prefixes[i] get one
if(i < prefixes.size()) {
result.append( prefixes[i]);
}
// appended as a C string, i.e. up to the first NUL byte
result.append(data);
}
}
// write the merged entry; the trailing 0 is presumably the thread index - TODO confirm
writeData(result.c_str(), result.length(), key, 0);
result.clear();
}

// close and free all readers
for (size_t i = 0; i < fileCount; i++) {
filesToMerge[i]->close();
delete filesToMerge[i];
}
delete [] filesToMerge;


}

// allocates heap memory, careful
char* makeResultFilename(const char* name, size_t split) {
std::ostringstream ss;
Expand Down
4 changes: 0 additions & 4 deletions lib/mmseqs/src/commons/DBWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,6 @@ class DBWriter : public MemoryTracker {

void alignToPageSize(int thrIdx = 0);

void mergeFiles(DBReader<unsigned int>& qdbr,
const std::vector<std::pair<std::string, std::string> >& files,
const std::vector<std::string>& prefixes);

void sortDatafileByIdOrder(DBReader<unsigned int>& qdbr);

static void mergeResults(const std::string &outFileName, const std::string &outFileNameIndex,
Expand Down
Loading

0 comments on commit 586b284

Please sign in to comment.