From 534434abeee27f88137df4b004b12d36973dd4ba Mon Sep 17 00:00:00 2001
From: Paul-Davis
Date: Tue, 19 May 2020 11:17:02 +0100
Subject: [PATCH] Added code to dump and incorporate a Transposon_CDS peptide file into the release.

---
Review notes (free text between the "---" line and the first "diff --git"
line; it is not part of the commit):

 * Line breaks and indentation were restored from a whitespace-collapsed copy
   of this patch.  Context-line whitespace may not match the tree exactly, so
   apply with a whitespace-tolerant option (git apply --ignore-whitespace, or
   patch -l).
 * make_FTP_sites.pl: the prefix is captured with /(\S+)\.(?:dna|pep)/.  The
   bare /(\S+)\./ is greedy up to the LAST dot, so "foo.dna.gz" produced the
   prefix "foo.dna" and a target named "...foo.dna.fa.gz".
 * make_FTP_sites.pl: the peptide test is anchored as /\.pep$/ so that "pep"
   appearing elsewhere in the path cannot select the .pep.gz target.
 * autoace_builder.pl: the method is spelled Transposon_CDS, matching the
   other -classmethod options in the same hunk.
 * fasta_dumper.pl: the new GetOptions entry carries a trailing comma like
   its siblings.
 * Four added lines differ from the original commit, so the post-image hashes
   on the "index" lines no longer describe the result.  Hunk line counts and
   the diffstat are unchanged.

 scripts/autoace_builder.pl |  1 +
 scripts/fasta_dumper.pl    | 11 +++++++++--
 scripts/make_FTP_sites.pl  | 13 ++++++++++---
 scripts/release_letter.pl  |  1 +
 4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/scripts/autoace_builder.pl b/scripts/autoace_builder.pl
index 6b0c3adee..83b4cce1f 100755
--- a/scripts/autoace_builder.pl
+++ b/scripts/autoace_builder.pl
@@ -181,6 +181,7 @@
         my @options = "-classmethod CDS:Transposon_CDS:Transposon-mRNA -classmethod Pseudogene:Transposon_Pseudogene:Transposon-pseudogenic_transcript -classmethod Transcript:Transposon_ncRNA:Transposon-non-coding_transcript";
         $wormbase->run_script( "fasta_dumper.pl @options -output $seqdir/transposon_transcripts.dna", $log);
         $wormbase->run_script( "fasta_dumper.pl -classmethod Transposon:Transposon -output $seqdir/transposons.dna", $log);
+        $wormbase->run_script( "fasta_dumper.pl -classmethod CDS:Transposon_CDS -pep -output $seqdir/transposon_cds.pep", $log);
     }
 }
 
diff --git a/scripts/fasta_dumper.pl b/scripts/fasta_dumper.pl
index dc6801203..52344d8f9 100644
--- a/scripts/fasta_dumper.pl
+++ b/scripts/fasta_dumper.pl
@@ -19,7 +19,7 @@
 use Bio::SeqIO;
 
 my ($debug, $store, $verbose, $database, $test, $wormbase, $species,
-    @classmethodlabel, $out, @seqs);
+    @classmethodlabel, $out, @seqs, $pep);
 GetOptions (
             "debug:s"   => \$debug,
             "verbose"   => \$verbose, #verbose quces a little more info to screen
@@ -29,6 +29,7 @@
             "test"      => \$test, #invoke test env
             "store:s"   => \$store, #supply a storable
             "species:s" => \$species, #needed to work out what species is being processed
+            "pep"       => \$pep, #peptide dump
            ) ;
 
 
@@ -68,7 +69,13 @@
 
   my $object_it = $connection->fetch_many(-query => $query);
   while(my $object = $object_it->next){
-    my $dna = $object->asDNA();
+    my $dna; # sequence text: peptide when -pep was given, DNA otherwise
+    if ($pep) {
+      $dna = $object->asPeptide();
+    }
+    else {
+      $dna = $object->asDNA();
+    }
     my @dna = split(/\n/, $dna);
     shift @dna;
     $dna = join("", @dna);
diff --git a/scripts/make_FTP_sites.pl b/scripts/make_FTP_sites.pl
index e615cf184..4e09464b6 100755
--- a/scripts/make_FTP_sites.pl
+++ b/scripts/make_FTP_sites.pl
@@ -345,10 +345,16 @@ sub copy_dna_files{
     map { $copied_files{$_} = 1 } ($dna_file, $masked_file, $soft_file);
 
     # copy over outstanding dna files
-    foreach my $dna_file (glob("$seqdir/*.dna.gz"), glob("$seqdir/*.dna")) {
+    foreach my $dna_file (glob("$seqdir/*.dna.gz"), glob("$seqdir/*.dna"), glob("$seqdir/*.pep")) {
       if (not exists $copied_files{$dna_file}) {
-        my ($prefix) = $dna_file =~ /$seqdir\/(\S+)\.dna/;
-        my $target = "$dna_dir/${gspecies}.${bioproj}.${WS_version_name}.$prefix.fa.gz";
+        my ($prefix) = $dna_file =~ /$seqdir\/(\S+)\.(?:dna|pep)/;
+        my $target;
+        if ($dna_file =~ /\.pep$/) {
+          $target = "$dna_dir/${gspecies}.${bioproj}.${WS_version_name}.$prefix.pep.gz";
+        }
+        else {
+          $target = "$dna_dir/${gspecies}.${bioproj}.${WS_version_name}.$prefix.fa.gz";
+        }
         if ($dna_file =~ /\.gz$/) {
           $wormbase->run_command("cp -f $dna_file $target", $log);
         } else {
@@ -1379,6 +1385,7 @@ sub checkfile {
 GSPECIES.BIOPROJ.WSREL.wormpep_package.tar.gz
 GSPECIES.BIOPROJ.WSREL.transposon_transcripts.fa.gz
 GSPECIES.BIOPROJ.WSREL.transposons.fa.gz
+GSPECIES.BIOPROJ.WSREL.transposon_cds.pep.gz
 
 [CORE]species/GSPECIES/BIOPROJ/annotation
 GSPECIES.BIOPROJ.WSREL.functional_descriptions.txt.gz
diff --git a/scripts/release_letter.pl b/scripts/release_letter.pl
index 4cdb99e09..7bcc7c19f 100755
--- a/scripts/release_letter.pl
+++ b/scripts/release_letter.pl
@@ -172,6 +172,7 @@
 printf $rlfh " - G_SPECIES.BIOPROJECT.WS$ver.pseudogenic_transcripts.fa.gz - Spliced cDNA sequence for pseudogenic transcripts\n";
 printf $rlfh " - G_SPECIES.BIOPROJECT.WS$ver.transposon_transcripts.fa.gz - Spliced cDNA sequence for mRNAs and pseudogenes located in Transposons\n";
 printf $rlfh " - G_SPECIES.BIOPROJECT.WS$ver.transposons.fa.gz - DNA sequence of curated and predicted Transposons\n";
+printf $rlfh " - G_SPECIES.BIOPROJECT.WS$ver.transposon_cds.pep.gz - Protein sequence of curated CDSs associated with Transposons\n";
 printf $rlfh " - G_SPECIES.BIOPROJECT.WS$ver.intergenic_sequences.fa.gz - DNA sequence between pairs of adjacent genes\n";
 printf $rlfh " - G_SPECIES.BIOPROJECT.WS$ver.annotations.gff[2|3].gz - Sequence features in either GFF2 or GFF3 format\n";
 printf $rlfh " - G_SPECIES.BIOPROJECT.WS$ver.protein_annotation.gff3.gz - Sequence features in proteins in GFF3 format\n";