diff --git a/backend/src/main/kotlin/org/pathoplexus/backend/config/ReferenceGenome.kt b/backend/src/main/kotlin/org/pathoplexus/backend/config/ReferenceGenome.kt index a71c4ed64..a1c47cb9e 100644 --- a/backend/src/main/kotlin/org/pathoplexus/backend/config/ReferenceGenome.kt +++ b/backend/src/main/kotlin/org/pathoplexus/backend/config/ReferenceGenome.kt @@ -1,5 +1,6 @@ package org.pathoplexus.backend.config +import org.pathoplexus.backend.api.AminoAcidSequence import org.pathoplexus.backend.api.NucleotideSequence data class ReferenceGenome( @@ -15,9 +16,13 @@ data class ReferenceGenome( fun getNucleotideSegmentReference(segmentName: String): NucleotideSequence? = nucleotideSequences.find { it.name == segmentName }?.sequence + + fun getAminoAcidGeneReference(gene: String): AminoAcidSequence? = genes.find { + it.name == gene + }?.sequence } data class ReferenceSequence( val name: String, - val sequence: NucleotideSequence, + val sequence: String, ) diff --git a/backend/src/main/kotlin/org/pathoplexus/backend/service/CompressionService.kt b/backend/src/main/kotlin/org/pathoplexus/backend/service/CompressionService.kt index a448dc673..4e3c16c11 100644 --- a/backend/src/main/kotlin/org/pathoplexus/backend/service/CompressionService.kt +++ b/backend/src/main/kotlin/org/pathoplexus/backend/service/CompressionService.kt @@ -3,6 +3,7 @@ package org.pathoplexus.backend.service import com.github.luben.zstd.Zstd import org.pathoplexus.backend.api.Organism import org.pathoplexus.backend.api.OriginalData +import org.pathoplexus.backend.api.ProcessedData import org.pathoplexus.backend.config.BackendConfig import org.springframework.stereotype.Service import java.nio.charset.StandardCharsets @@ -21,29 +22,38 @@ enum class CompressionAlgorithm(val extension: String) { @Service class CompressionService(private val backendConfig: BackendConfig) { - fun compressUnalignedNucleotideSequence( - uncompressedSequence: String, - segmentName: String, - organism: Organism, - ): String = compress( - uncompressedSequence, - getDictionaryForNucleotideSequenceSegments(segmentName, organism), - ) + fun compressNucleotideSequence(uncompressedSequence: String, segmentName: String, organism: Organism): String = + compress( + uncompressedSequence, + getDictionaryForNucleotideSequenceSegment(segmentName, organism), + ) - private fun decompressUnalignedNucleotideSequence( + private fun decompressNucleotideSequence( compressedSequence: String, segmentName: String, organism: Organism, ): String = decompress( compressedSequence, - getDictionaryForNucleotideSequenceSegments(segmentName, organism), + getDictionaryForNucleotideSequenceSegment(segmentName, organism), ) + private fun compressAminoAcidSequence(uncompressedSequence: String, gene: String, organism: Organism): String = + compress( + uncompressedSequence, + getDictionaryForAminoAcidSequence(gene, organism), + ) + + private fun decompressAminoAcidSequence(compressedSequence: String, gene: String, organism: Organism): String = + decompress( + compressedSequence, + getDictionaryForAminoAcidSequence(gene, organism), + ) + fun decompressSequencesInOriginalData(originalData: OriginalData, organism: Organism) = OriginalData( originalData.metadata, originalData .unalignedNucleotideSequences.mapValues { - decompressUnalignedNucleotideSequence( + decompressNucleotideSequence( it.value, it.key, organism, @@ -55,7 +65,7 @@ class CompressionService(private val backendConfig: BackendConfig) { originalData.metadata, originalData .unalignedNucleotideSequences.mapValues { (segmentName, sequenceData) -> - compressUnalignedNucleotideSequence( + compressNucleotideSequence( sequenceData, segmentName, organism, @@ -63,6 +73,63 @@ class CompressionService(private val backendConfig: BackendConfig) { }, ) + fun decompressSequencesInProcessedData(processedData: ProcessedData, organism: Organism) = ProcessedData( + processedData.metadata, + processedData + .unalignedNucleotideSequences.mapValues { + decompressNucleotideSequence( + it.value, + it.key, + organism, + ) + }, + processedData.alignedNucleotideSequences.mapValues { + decompressNucleotideSequence( + it.value, + it.key, + organism, + ) + }, + processedData.nucleotideInsertions, + processedData.alignedAminoAcidSequences.mapValues { + decompressAminoAcidSequence( + it.value, + it.key, + organism, + ) + }, + processedData.aminoAcidInsertions, + ) + + fun compressSequencesInProcessedData(processedData: ProcessedData, organism: Organism) = ProcessedData( + processedData.metadata, + processedData + .unalignedNucleotideSequences.mapValues { (segmentName, sequenceData) -> + compressNucleotideSequence( + sequenceData, + segmentName, + organism, + ) + }, + processedData.alignedNucleotideSequences.mapValues { + compressNucleotideSequence( + it.value, + it.key, + organism, + ) + }, + processedData.nucleotideInsertions, + processedData.alignedAminoAcidSequences.mapValues { + compressAminoAcidSequence( + it.value, + it.key, + organism, + ) + }, + processedData.aminoAcidInsertions, + + ) + private fun compress(seq: String, dictionary: ByteArray?): String { val input = seq.toByteArray(StandardCharsets.UTF_8) val compressBound = Zstd.compressBound(input.size.toLong()).toInt() @@ -96,11 +163,18 @@ class CompressionService(private val backendConfig: BackendConfig) { return String(decompressedBuffer, 0, decompressionReturnCode.toInt(), StandardCharsets.UTF_8) } - private fun getDictionaryForNucleotideSequenceSegments(segmentName: String, organism: Organism): ByteArray? = + private fun getDictionaryForNucleotideSequenceSegment(segmentName: String, organism: Organism): ByteArray? = backendConfig .getInstanceConfig(organism) .referenceGenomes .getNucleotideSegmentReference( segmentName, )?.toByteArray() + + private fun getDictionaryForAminoAcidSequence(geneName: String, organism: Organism): ByteArray? = backendConfig + .getInstanceConfig(organism) + .referenceGenomes + .getAminoAcidGeneReference( + geneName, + )?.toByteArray() } diff --git a/backend/src/main/kotlin/org/pathoplexus/backend/service/SequenceEntriesTableProvider.kt b/backend/src/main/kotlin/org/pathoplexus/backend/service/SequenceEntriesTableProvider.kt index c547f2b6d..7b532f76b 100644 --- a/backend/src/main/kotlin/org/pathoplexus/backend/service/SequenceEntriesTableProvider.kt +++ b/backend/src/main/kotlin/org/pathoplexus/backend/service/SequenceEntriesTableProvider.kt @@ -25,7 +25,7 @@ import org.springframework.stereotype.Service @Service class SequenceEntriesTableProvider(private val compressionService: CompressionService) { - private val cachedTables: MutableMap = mutableMapOf() + private val cachedTables: MutableMap = mutableMapOf() fun get(organism: Organism): SequenceEntriesDataTable { return cachedTables.getOrPut(organism) { @@ -43,6 +43,7 @@ class SequenceEntriesDataTable( SEQUENCE_ENTRIES_TABLE_NAME, ) { val originalDataColumn = serializeOriginalData(compressionService, organism).nullable() + val processedDataColumn = serializeProcessedData(compressionService, organism).nullable() val accessionColumn = varchar("accession", 255) val versionColumn = long("version") @@ -54,7 +55,6 @@ class SequenceEntriesDataTable( val finishedProcessingAtColumn = datetime("finished_processing_at").nullable() val statusColumn = varchar("status", 255) val isRevocationColumn = bool("is_revocation").default(false) - val processedDataColumn = jacksonSerializableJsonb("processed_data").nullable() val errorsColumn = jacksonSerializableJsonb>("errors").nullable() val warningsColumn = jacksonSerializableJsonb>("warnings").nullable() @@ -122,4 +122,25 @@ class SequenceEntriesDataTable( ) }, ) + + private fun serializeProcessedData( + compressionService: CompressionService, + organism: Organism, + ): Column = jsonb( + "processed_data", + { processedData -> + jacksonObjectMapper.writeValueAsString( + compressionService.compressSequencesInProcessedData( + processedData, + organism, + ), + ) + }, + { string -> + compressionService.decompressSequencesInProcessedData( + jacksonObjectMapper.readValue(string) as ProcessedData, + organism, + ) + }, + ) } diff --git a/backend/src/main/kotlin/org/pathoplexus/backend/service/UploadDatabaseService.kt b/backend/src/main/kotlin/org/pathoplexus/backend/service/UploadDatabaseService.kt index 8df476ec0..e6fb13846 100644 --- a/backend/src/main/kotlin/org/pathoplexus/backend/service/UploadDatabaseService.kt +++ b/backend/src/main/kotlin/org/pathoplexus/backend/service/UploadDatabaseService.kt @@ -93,8 +93,7 @@ class UploadDatabaseService( this[sequenceSubmissionIdColumn] = submissionId this[segmentNameColumn] = segmentName this[sequenceUploadIdColumn] = uploadId - // this can be handled better; creating named sequences in the first place; it is possible to de-compress when serializing. Issue now, segment name is not known. - this[compressedSequenceDataColumn] = compressor.compressUnalignedNucleotideSequence( + this[compressedSequenceDataColumn] = compressor.compressNucleotideSequence( it.sequence, segmentName, submittedOrganism,