Skip to content

Commit

Permalink
Fix for fastq2 path failing unlink (deletion), and removing some debu…
Browse files Browse the repository at this point in the history
…gging code.
  • Loading branch information
joshfactorial committed May 29, 2024
1 parent e96fd20 commit 866fbd7
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 201 deletions.
2 changes: 1 addition & 1 deletion config_template/simple_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ discard_bed: .
mutation_rate: .
mutation_bed: .
rng_seed: .
min_mutations: 0
min_mutations: .
overwrite_output: .

37 changes: 8 additions & 29 deletions neat/read_simulator/runner.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Runner for generate_reads task
"""
import copy
import time
import logging
import pickle
import gzip
Expand All @@ -10,7 +10,7 @@
from pathlib import Path

from .utils import Options, parse_input_vcf, parse_beds, OutputFileWriter, \
generate_variants, write_local_file, generate_reads
generate_variants, generate_reads
from ..common import validate_input_path, validate_output_path
from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel
from ..models.default_cancer_mutation_model import *
Expand Down Expand Up @@ -249,33 +249,24 @@ def read_simulator_runner(config: str, output: str):
# these will be the features common to each contig, for multiprocessing
common_features = {}

all_variants = {} # dict of all ContigVariants objects, indexed by contig, which we will collect at the end.
vcf_files = []
local_variant_files = {}
fastq_files = []

sam_reads_files = []

for contig in breaks:
local_variant_files[contig] = None

_LOG.info(f"Generating variants for {contig}")

# Todo genericize breaks

input_variants = input_variants_dict[contig]
# TODO: add the ability to pick up input variants here from previous loop

local_reference = reference_index[contig]

# _LOG.info(f'Creating trinucleotide map for {contig}...')
# local_trinuc_map = map_chromosome(local_reference, mut_model)

# Since we're only running single threaded for now:
threadidx = 1

local_variant_file = options.temp_dir_path / f'{options.output.stem}_tmp_{contig}_{threadidx}.vcf.gz'

_LOG.debug(f'local vcf filename = {local_variant_file}')

local_bam_pickle_file = None
if options.produce_bam:
local_bam_pickle_file = options.temp_dir_path / f'{options.output.stem}_tmp_{contig}_{threadidx}.p.gz'
Expand All @@ -296,20 +287,8 @@ def read_simulator_runner(config: str, output: str):
max_qual_score=max_qual_score,
options=options)

_LOG.info(f'Outputting temp vcf for {contig} for later use')
# This function produces the local vcf file.
# TODO pickle dump the ContigVariants object instead. Combine them into one vcf
# at the end.
write_local_file(
local_variant_file,
local_variants,
local_reference,
target_regions_dict[contig],
discard_regions_dict[contig]
)

# The above function writes data to local_variant_file, so we need only store its location.
vcf_files.append(local_variant_file)
# This function saves the local variant data to a dictionary. We may need to write this to file.
local_variant_files[contig] = local_variants

if options.produce_fastq or options.produce_bam:
read1_fastq_paired, read1_fastq_single, read2_fastq_paired, read2_fastq_single = \
Expand All @@ -333,15 +312,15 @@ def read_simulator_runner(config: str, output: str):

if options.produce_vcf:
_LOG.info(f"Outputting golden vcf: {str(output_file_writer.vcf_fn)}")
output_file_writer.merge_temp_vcfs(vcf_files)
output_file_writer.write_final_vcf(local_variant_files, reference_index)

if options.produce_fastq:
if options.paired_ended:
_LOG.info(f"Outputting fastq files: "
f"{', '.join([str(x) for x in output_file_writer.fastq_fns]).strip(', ')}")
else:
_LOG.info(f"Outputting fastq file: {output_file_writer.fastq_fns[0]}")
output_file_writer.merge_temp_fastqs(fastq_files, options.paired_ended, options.rng)
output_file_writer.merge_temp_fastqs(fastq_files, options.rng)

if options.produce_bam:
_LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
Expand Down
1 change: 0 additions & 1 deletion neat/read_simulator/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,3 @@
from .vcf_func import *
from .generate_reads import *
from .generate_variants import *
from .local_file_writer import *
11 changes: 4 additions & 7 deletions neat/read_simulator/utils/generate_reads.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import time
import pickle
import numpy as np

from math import ceil
from pathlib import Path
Expand All @@ -14,8 +13,6 @@
from ...variants import ContigVariants
from .read import Read

# TODO check that we're not truncating reads with deletions, but getting a full 151 bases

__all__ = [
'generate_reads',
'cover_dataset',
Expand Down Expand Up @@ -199,20 +196,20 @@ def generate_reads(reference: SeqRecord,
base_name = f'NEAT-generated_{chrom}'

_LOG.debug("Covering dataset.")
t = time.process_time()
t = time.time()
reads = cover_dataset(
len(reference),
options,
fraglen_model,
)
_LOG.debug(f"Dataset coverage took: {(time.process_time() - t)/60:.2f} m")
_LOG.debug(f"Dataset coverage took: {(time.time() - t)/60:.2f} m")

# These will hold the values as inserted.
properly_paired_reads = []
singletons = []

_LOG.debug("Writing fastq(s) and optional tsam, if indicated")
t = time.process_time()
t = time.time()
with (
open_output(chrom_fastq_r1_paired) as fq1_paired,
open_output(chrom_fastq_r1_single) as fq1_single,
Expand Down Expand Up @@ -361,7 +358,7 @@ def generate_reads(reference: SeqRecord,
else:
singletons.append((None, read_2))

_LOG.info(f"Contig fastq(s) written in: {(time.process_time() - t)/60:.2f} m")
_LOG.info(f"Contig fastq(s) written in: {(time.time() - t)/60:.2f} m")

if options.produce_bam:
# this will give us the proper read order of the elements, for the sam. They are easier to sort now
Expand Down
2 changes: 1 addition & 1 deletion neat/read_simulator/utils/generate_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def generate_variants(reference: SeqRecord,
existing_variants: ContigVariants,
mutation_model: MutationModel,
options: Options,
max_qual_score: int):
max_qual_score: int) -> ContigVariants:
"""
This function will generate variants to add to the dataset, by writing them to the input temp vcf file.
Expand Down
114 changes: 0 additions & 114 deletions neat/read_simulator/utils/local_file_writer.py

This file was deleted.

10 changes: 4 additions & 6 deletions neat/read_simulator/utils/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,13 +207,11 @@ def read(self):

# Now we check that the type is correct and it is in range, depending on the type defined for it
# If it passes that it gets put into the args dictionary.
try:
temp = type_of_var(value)
except ValueError:
raise ValueError(f"Incorrect type for value entered for {key}: {type_of_var}")
if value != type_of_var(value):
raise ValueError(f"Incorrect type for value entered for {key}: {type_of_var} (found: {value})")

self.check_and_log_error(key, temp, criteria1, criteria2)
self.args[key] = temp
self.check_and_log_error(key, value, criteria1, criteria2)
self.args[key] = value

def set_random_seed(self):
"""
Expand Down
Loading

0 comments on commit 866fbd7

Please sign in to comment.