Skip to content

Commit

Permalink
feat(backend): compress amino acid sequences in processed data
Browse files Browse the repository at this point in the history
  • Loading branch information
TobiasKampmann committed Dec 12, 2023
1 parent eb2e688 commit 99750fc
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 18 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.pathoplexus.backend.config

import org.pathoplexus.backend.api.AminoAcidSequence
import org.pathoplexus.backend.api.NucleotideSequence

data class ReferenceGenome(
Expand All @@ -15,9 +16,13 @@ data class ReferenceGenome(
/**
 * Looks up the reference sequence of the nucleotide segment called [segmentName].
 *
 * @return the `sequence` of the first entry in `nucleotideSequences` whose `name` equals
 * [segmentName], or `null` when no entry matches.
 */
fun getNucleotideSegmentReference(segmentName: String): NucleotideSequence? =
    nucleotideSequences.firstOrNull { segment -> segment.name == segmentName }?.sequence

/**
 * Looks up the reference amino acid sequence of the gene called [gene].
 *
 * @return the `sequence` of the first entry in `genes` whose `name` equals [gene],
 * or `null` when no entry matches.
 */
fun getAminoAcidGeneReference(gene: String): AminoAcidSequence? =
    genes.firstOrNull { candidate -> candidate.name == gene }?.sequence
}

/**
 * A named reference sequence.
 *
 * @property name identifier that lookups compare against (e.g. a segment name).
 * @property sequence the raw sequence text. Declared as a plain [String] (rather than
 * `NucleotideSequence`) and declared only once: a second `sequence` property in the same
 * constructor is a conflicting declaration.
 * NOTE(review): presumably this type backs both the nucleotide and the gene lookups in
 * `ReferenceGenome` - confirm against the full file.
 */
data class ReferenceSequence(
    val name: String,
    val sequence: String,
)
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package org.pathoplexus.backend.service
import com.github.luben.zstd.Zstd
import org.pathoplexus.backend.api.Organism
import org.pathoplexus.backend.api.OriginalData
import org.pathoplexus.backend.api.ProcessedData
import org.pathoplexus.backend.config.BackendConfig
import org.springframework.stereotype.Service
import java.nio.charset.StandardCharsets
Expand All @@ -21,29 +22,38 @@ enum class CompressionAlgorithm(val extension: String) {
@Service
class CompressionService(private val backendConfig: BackendConfig) {

/**
 * Legacy name for [compressNucleotideSequence], kept so that existing callers continue to resolve.
 *
 * Delegates instead of repeating the dictionary lookup and the call to `compress`, so both
 * names are guaranteed to produce the same result.
 *
 * @param uncompressedSequence the sequence text to compress.
 * @param segmentName name of the segment whose reference is looked up as the dictionary.
 * @param organism organism whose instance config supplies the reference.
 * @return the compressed sequence as returned by [compressNucleotideSequence].
 */
fun compressUnalignedNucleotideSequence(
    uncompressedSequence: String,
    segmentName: String,
    organism: Organism,
): String = compressNucleotideSequence(uncompressedSequence, segmentName, organism)
/**
 * Compresses a nucleotide sequence.
 *
 * The reference looked up for [segmentName] of [organism] is handed to `compress` as the
 * dictionary; the lookup may yield `null`, which is passed through unchanged.
 *
 * @return whatever `compress` returns for [uncompressedSequence] and that dictionary.
 */
fun compressNucleotideSequence(uncompressedSequence: String, segmentName: String, organism: Organism): String {
    val dictionary = getDictionaryForNucleotideSequenceSegment(segmentName, organism)
    return compress(uncompressedSequence, dictionary)
}

/**
 * Decompresses a nucleotide sequence.
 *
 * The reference looked up for [segmentName] of [organism] is handed to `decompress` as the
 * dictionary; the lookup may yield `null`, which is passed through unchanged.
 *
 * The block previously carried two fused function headers and two dictionary arguments;
 * only the current name and the current dictionary helper are kept.
 *
 * @return whatever `decompress` returns for [compressedSequence] and that dictionary.
 */
private fun decompressNucleotideSequence(
    compressedSequence: String,
    segmentName: String,
    organism: Organism,
): String = decompress(
    compressedSequence,
    getDictionaryForNucleotideSequenceSegment(segmentName, organism),
)

/**
 * Compresses an amino acid sequence.
 *
 * The reference looked up for [gene] of [organism] is handed to `compress` as the dictionary;
 * the lookup may yield `null`, which is passed through unchanged.
 */
private fun compressAminoAcidSequence(uncompressedSequence: String, gene: String, organism: Organism): String {
    val dictionary = getDictionaryForAminoAcidSequence(gene, organism)
    return compress(uncompressedSequence, dictionary)
}

/**
 * Decompresses an amino acid sequence.
 *
 * The reference looked up for [gene] of [organism] is handed to `decompress` as the dictionary;
 * the lookup may yield `null`, which is passed through unchanged.
 */
private fun decompressAminoAcidSequence(compressedSequence: String, gene: String, organism: Organism): String {
    val dictionary = getDictionaryForAminoAcidSequence(gene, organism)
    return decompress(compressedSequence, dictionary)
}

fun decompressSequencesInOriginalData(originalData: OriginalData, organism: Organism) = OriginalData(
originalData.metadata,
originalData
.unalignedNucleotideSequences.mapValues {
decompressUnalignedNucleotideSequence(
decompressNucleotideSequence(
it.value,
it.key,
organism,
Expand All @@ -55,14 +65,71 @@ class CompressionService(private val backendConfig: BackendConfig) {
originalData.metadata,
originalData
.unalignedNucleotideSequences.mapValues { (segmentName, sequenceData) ->
compressUnalignedNucleotideSequence(
compressNucleotideSequence(
sequenceData,
segmentName,
organism,
)
},
)

/**
 * Builds a new [ProcessedData] in which every stored sequence has been decompressed.
 *
 * Unaligned and aligned nucleotide sequences are decompressed per segment (map key = segment
 * name), aligned amino acid sequences per gene (map key = gene name). Metadata and both
 * insertion fields are passed through untouched.
 */
fun decompressSequencesInProcessedData(processedData: ProcessedData, organism: Organism) = ProcessedData(
    processedData.metadata,
    processedData.unalignedNucleotideSequences.mapValues { (segmentName, compressed) ->
        decompressNucleotideSequence(compressed, segmentName, organism)
    },
    processedData.alignedNucleotideSequences.mapValues { (segmentName, compressed) ->
        decompressNucleotideSequence(compressed, segmentName, organism)
    },
    processedData.nucleotideInsertions,
    processedData.alignedAminoAcidSequences.mapValues { (gene, compressed) ->
        decompressAminoAcidSequence(compressed, gene, organism)
    },
    processedData.aminoAcidInsertions,
)

/**
 * Builds a new [ProcessedData] in which every sequence has been compressed.
 *
 * Unaligned and aligned nucleotide sequences are compressed per segment (map key = segment
 * name), aligned amino acid sequences per gene (map key = gene name). Metadata and both
 * insertion fields are passed through untouched.
 */
fun compressSequencesInProcessedData(processedData: ProcessedData, organism: Organism) = ProcessedData(
    processedData.metadata,
    processedData.unalignedNucleotideSequences.mapValues { (segmentName, uncompressed) ->
        compressNucleotideSequence(uncompressed, segmentName, organism)
    },
    processedData.alignedNucleotideSequences.mapValues { (segmentName, uncompressed) ->
        compressNucleotideSequence(uncompressed, segmentName, organism)
    },
    processedData.nucleotideInsertions,
    processedData.alignedAminoAcidSequences.mapValues { (gene, uncompressed) ->
        compressAminoAcidSequence(uncompressed, gene, organism)
    },
    processedData.aminoAcidInsertions,
)

private fun compress(seq: String, dictionary: ByteArray?): String {
val input = seq.toByteArray(StandardCharsets.UTF_8)
val compressBound = Zstd.compressBound(input.size.toLong()).toInt()
Expand Down Expand Up @@ -96,11 +163,18 @@ class CompressionService(private val backendConfig: BackendConfig) {
return String(decompressedBuffer, 0, decompressionReturnCode.toInt(), StandardCharsets.UTF_8)
}

/**
 * Resolves the compression dictionary for one nucleotide segment.
 *
 * Reads the reference genomes from the instance config of [organism] and converts the
 * reference sequence of [segmentName] to bytes.
 *
 * The block previously carried two fused declarations (the first ending in a dangling `=`);
 * only the singular name used by the current call sites is kept.
 *
 * @return the reference sequence as bytes, or `null` when the lookup finds no reference
 * for [segmentName].
 */
private fun getDictionaryForNucleotideSequenceSegment(segmentName: String, organism: Organism): ByteArray? =
    backendConfig
        .getInstanceConfig(organism)
        .referenceGenomes
        .getNucleotideSegmentReference(segmentName)
        ?.toByteArray()

/**
 * Resolves the compression dictionary for one gene.
 *
 * Reads the reference genomes from the instance config of [organism] and converts the
 * reference sequence of [geneName] to bytes.
 *
 * @return the reference sequence as bytes, or `null` when the lookup finds no reference
 * for [geneName].
 */
private fun getDictionaryForAminoAcidSequence(geneName: String, organism: Organism): ByteArray? {
    val referenceGenomes = backendConfig.getInstanceConfig(organism).referenceGenomes
    return referenceGenomes.getAminoAcidGeneReference(geneName)?.toByteArray()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import org.springframework.stereotype.Service
@Service
class SequenceEntriesTableProvider(private val compressionService: CompressionService) {

// One table definition per organism, created on first request by `get`.
// Declared once, with a non-null key: `get` only ever receives a non-null Organism.
private val cachedTables: MutableMap<Organism, SequenceEntriesDataTable> = mutableMapOf()

fun get(organism: Organism): SequenceEntriesDataTable {
return cachedTables.getOrPut(organism) {
Expand All @@ -43,6 +43,7 @@ class SequenceEntriesDataTable(
SEQUENCE_ENTRIES_TABLE_NAME,
) {
val originalDataColumn = serializeOriginalData(compressionService, organism).nullable()
val processedDataColumn = serializeProcessedData(compressionService, organism).nullable()

val accessionColumn = varchar("accession", 255)
val versionColumn = long("version")
Expand All @@ -54,7 +55,6 @@ class SequenceEntriesDataTable(
val finishedProcessingAtColumn = datetime("finished_processing_at").nullable()
val statusColumn = varchar("status", 255)
val isRevocationColumn = bool("is_revocation").default(false)
val processedDataColumn = jacksonSerializableJsonb<ProcessedData>("processed_data").nullable()
val errorsColumn = jacksonSerializableJsonb<List<PreprocessingAnnotation>>("errors").nullable()
val warningsColumn = jacksonSerializableJsonb<List<PreprocessingAnnotation>>("warnings").nullable()

Expand Down Expand Up @@ -122,4 +122,25 @@ class SequenceEntriesDataTable(
)
},
)

/**
 * Declares the `processed_data` jsonb column with transparent sequence (de)compression.
 *
 * Writing: the sequences are compressed via [compressionService] and the result is
 * serialized to a JSON string with `jacksonObjectMapper`.
 * Reading: the JSON string is parsed into [ProcessedData] and its sequences are then
 * decompressed via [compressionService].
 */
private fun serializeProcessedData(
    compressionService: CompressionService,
    organism: Organism,
): Column<ProcessedData> {
    val toJson: (ProcessedData) -> String = { uncompressed ->
        val compressed = compressionService.compressSequencesInProcessedData(uncompressed, organism)
        jacksonObjectMapper.writeValueAsString(compressed)
    }
    val fromJson: (String) -> ProcessedData = { json ->
        val compressed = jacksonObjectMapper.readValue(json) as ProcessedData
        compressionService.decompressSequencesInProcessedData(compressed, organism)
    }
    return jsonb("processed_data", toJson, fromJson)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ class UploadDatabaseService(
this[sequenceSubmissionIdColumn] = submissionId
this[segmentNameColumn] = segmentName
this[sequenceUploadIdColumn] = uploadId
// This could be handled better: if named sequences were created in the first place, it would be possible to de-compress when serializing. The issue for now is that the segment name is not known.
this[compressedSequenceDataColumn] = compressor.compressUnalignedNucleotideSequence(
this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence(
it.sequence,
segmentName,
submittedOrganism,
Expand Down

0 comments on commit 99750fc

Please sign in to comment.