diff --git a/scripts/GFF_post_process/GFF_post_process.pl b/scripts/GFF_post_process/GFF_post_process.pl index adcb77c7b..91b72ce1f 100755 --- a/scripts/GFF_post_process/GFF_post_process.pl +++ b/scripts/GFF_post_process/GFF_post_process.pl @@ -504,7 +504,7 @@ sub collate_and_sort { # if ($gff3) { my $outfile = "$working_dir/post_process_gff3.gff3"; - $wormbase->run_script("GFF_post_process/post_process_gff3.pl -infile $processed_gff_file -outfile $outfile", $log) + $wormbase->run_script("GFF_post_process/post_process_gff3.pl -infile $processed_gff_file -outfile $outfile -tecred", $log) and $log->log_and_die("Unsuccessful post-processing of GFF3 prior to collation\n"); if ($debug) { diff --git a/scripts/GFF_post_process/post_process_gff3.pl b/scripts/GFF_post_process/post_process_gff3.pl index 758c21e78..1d948f358 100644 --- a/scripts/GFF_post_process/post_process_gff3.pl +++ b/scripts/GFF_post_process/post_process_gff3.pl @@ -59,12 +59,13 @@ # variables and command-line options # ###################################### -my ( $debug, $test, $store, $wormbase, $species ); +my ( $debug, $test, $store, $wormbase, $species, $tecred ); my ( $infile, $outfile, $changed_lines ); GetOptions( 'debug=s' => \$debug, 'test' => \$test, + 'tecred' => \$tecred, 'store:s' => \$store, 'species:s' => \$species, 'infile:s' => \$infile, @@ -89,6 +90,31 @@ $log->log_and_die("You must define -infile and -outfile\n"); } +open(my $tecred_in_fh, '<', $infile) or $log->log_and_die("Could not open $infile for reading\n"); + + +#### TECRED modification START #### + +# First pass over the input: count how often each TEC_RED ID (first token of column 9) occurs +my %tecredid; +if ($tecred) { + print "Doing TECRED processing\n"; + while(<$tecred_in_fh>) { + chomp; + my $line = $_; + my @l = split(/\t+/, $line); + if (defined $l[8] and $l[1]=~/TEC_RED/) { + my @trid =split(/\s+/,$l[8]); + $tecredid{$trid[0]}+=1; + #print "$trid[0]\t$tecredid{$trid[0]}\n"; + } + } +} +close($tecred_in_fh) or $log->log_and_die("Could not close $infile after reading\n"); + +#### TECRED 
modification END #### + + open(my $gff_in_fh, $infile) or $log->log_and_die("Could not open $infile for reading\n"); open(my $gff_out_fh, ">$outfile") or $log->log_and_die("Could not open $outfile for writing\n"); @@ -132,6 +158,28 @@ $l[4] = $l[3]; } + # + # Fix up the tecreds if needed https://github.com/WormBase/website/issues/8039 + # Remove orientation, and mark up multimapped TEC-REDs + # + if ($tecred) { + if ($l[1]=~/TEC_RED/) { + $l[6]='.'; + #print "$line\n"; + + # Mark multimapping + my @trid =split(/\s+/,$l[8]); + if ($tecredid{$trid[0]} >1) { + #print "$trid[0]\t multimap\n"; + $l[8]=$l[8] . "; multimapping=TRUE"; + } + else { + $l[8]=$l[8] . "; multimapping=FALSE"; + #print "$trid[0]\t$tecredid{$trid[0]}\n"; + } + } + } + # # Clean up attributes # diff --git a/scripts/RNASeq_align.pl b/scripts/RNASeq_align.pl index 31821eb2c..b41e04f7a 100644 --- a/scripts/RNASeq_align.pl +++ b/scripts/RNASeq_align.pl @@ -205,20 +205,28 @@ sub results { # make the ace file of RNASeq spanned introns to load into acedb print "Running Intron analyses ...\n"; my $splice_file = $wormbase->acefiles."/RNASeq_splice_${species}.ace"; + print "Writing splicefile $splice_file\n"; # get old splice_file size my $old_splice_file = $splice_file . 
'.old'; # if splice_file is older than a month, then move it to be the 'old_splice_file' (because this script may be run more than once, overwriting the splice file) if (-M $splice_file >= 30) { system("mv $splice_file $old_splice_file"); + print "Moving $splice_file to $old_splice_file\n"; } my $old_splice_file_size = -s $old_splice_file; chdir $RNASeq->{RNASeqSRADir}; + my $rnadir = $RNASeq->{RNASeqSRADir}; + print "Going to folder $rnadir\n"; $status = $wormbase->run_command("rm -f $splice_file", $log); + print "Running command1: cat */Introns/virtual_objects.${species}.RNASeq.ace > $splice_file\n"; $status = $wormbase->run_command("cat */Introns/virtual_objects.${species}.RNASeq.ace > $splice_file", $log); + print "Running command2: acezip.pl -file $splice_file\n"; $status = $wormbase->run_script("acezip.pl -file $splice_file", $log); + print "Running command3: cat */Introns/Intron.ace > ${splice_file}.tmp\n"; $status = $wormbase->run_command("cat */Introns/Intron.ace > ${splice_file}.tmp", $log); + print "Running command4: acezip.pl -file ${splice_file}.tmp\n"; $status = $wormbase->run_script("acezip.pl -file ${splice_file}.tmp", $log); # flatten the results of all libraries at a position into one entry open (FEAT, "< ${splice_file}.tmp") || $log->log_and_die("Can't open file ${splice_file}.tmp\n"); diff --git a/scripts/Wormbase.pm b/scripts/Wormbase.pm index 3901bbeb8..9639c538d 100755 --- a/scripts/Wormbase.pm +++ b/scripts/Wormbase.pm @@ -1373,8 +1373,10 @@ sub establish_paths { $self->{'autoace'} = $self->species eq 'elegans' ? "$basedir/autoace" : "$basedir/".$self->species; $self->{'orgdb'} = $self->{'autoace'}; #."/".$self->{'organism'}; } - + + #$basedir = '/nfs/panda/ensemblgenomes/wormbase/BUILD'; $self->{'basedir'} = $basedir; +# print "BASE $basedir\n"; if ($self->test) { $self->{'ftp_upload'} = $self->wormpub . "/TEST/ftp_uploads/wormbase"; @@ -1452,6 +1454,8 @@ sub establish_paths { $self->{'smasked_genome_seq'} = $self->sequences . "/" . 
$self->{species} . ".genome_softmasked.fa"; # create dirs if missing + #my $wep = $self->wormpep; + #print "WORMPEP $wep\n"; mkpath( $self->logs ) unless ( -e $self->logs ); mkpath( $self->common_data ) unless ( -e $self->common_data ); mkpath( $self->wormpep ) unless ( -e $self->wormpep ); diff --git a/scripts/make_FTP_sites.pl b/scripts/make_FTP_sites.pl index 9ddbe2ede..0a83860f3 100755 --- a/scripts/make_FTP_sites.pl +++ b/scripts/make_FTP_sites.pl @@ -650,45 +650,95 @@ sub copy_rna_files{ # copy across ontology files ############################################ sub copy_ontology_files { - + + #print "Copy ontology innit\n"; + my $runtime = $wormbase->runtime; - $log->write_to("$runtime: copying ontology files\n"); - - my $ace_dir = $wormbase->autoace; - + $log->write_to("$runtime: Copying ontology files\n"); + + my $ace_dir = $wormbase->autoace; my $obo_dir = $wormbase->primaries . "/citace/temp_unpack_dir/home/citace/Data_for_${WS_version_name}/Data_for_Ontology/"; my $ace_ontology_dir = "$ace_dir/ONTOLOGY"; my $ftp_ontology_dir = "$targetdir/ONTOLOGY"; - + #print "$ace_dir $obo_dir $ace_ontology_dir $ftp_ontology_dir\n"; + mkpath($ace_ontology_dir,1,0775); mkpath($ftp_ontology_dir,1,0775); - + + # run through all possible organisms, including the one this script was started for my %accessors = ($wormbase->species_accessors); - $accessors{$wormbase->species} = $wormbase; - my %species_name_map; - while(my ($species, $wb) = each %accessors) { - $species_name_map{$wb->full_name('-g_species' => 1)} = $species; - } + $accessors{$wormbase->species} = $wormbase; - $wormbase->run_command("cp -f $obo_dir/*.obo $ace_ontology_dir/", $log); - foreach my $file (glob("$ace_ontology_dir/*.*")) { - my ($filestem, $suffix) = $file =~ /\/([^\/]+)\.([^\.\/]+)$/; - if (exists $species_name_map{$suffix}) { - my ($filetype, $release, $extension) = $filestem =~ /^([^\.]+)\.(WS\d+)\.([^\.]+)$/; - my $wb = $accessors{$species_name_map{$suffix}}; - my $bioproj = $wb->ncbi_bioproject; - my $new_filename = join('.', $suffix, $bioproj, 
$release, $filtype, $extension); - my $species_dir = "$targetdir/species/$species/$bioproj"; - mkpath($species_dir,1,0775); - $wormbase->run_command("cp -f $file ${species_dir}/${new_filename}", $log); - } - else { - $wormbase->run_command("cp -f $file $ftp_ontology_dir/", $log); - } - } + my %copied; + + foreach my $wb (values %accessors) { + next if exists $skip_species{$wb->species}; + next if @only_species and not exists $only_species{$wb->species}; + + my $gspecies = $wb->full_name('-g_species' => 1); + my $bioproj = $wb->ncbi_bioproject; + my $prefix = $wb->pepdir_prefix; + + #print "$gspecies $bioproj $prefix \n"; + + + # Copy to species folders + foreach my $file (glob("$ace_ontology_dir/*.*")) { + my $basename = (split(/\//, $file))[-1]; + my ($stem) = split(/\./, $basename); + # daf file whose name carries this species + if ($basename=~/\Q$gspecies\E/ and $file=~/daf\.txt/) { + mkpath("$targetdir/species/$gspecies/$bioproj/annotation",1,0775); + #print "cp -f $file $targetdir/species/$gspecies/$bioproj/annotation/$gspecies\.$bioproj\.$WS_version_name\.$stem\.daf\n "; + $wormbase->run_command("cat $file | gzip -n -9 > $targetdir/species/$gspecies/$bioproj/annotation/$gspecies\.$bioproj\.$WS_version_name\.$stem\.daf.gz", $log); + $copied{$file}=1; + } + elsif ($basename=~/\Q$gspecies\E/ and $file=~/rnai_phenotypes/) { + mkpath("$targetdir/species/$gspecies/$bioproj/annotation",1,0775); + #print "cp -f $file $targetdir/species/$gspecies/$bioproj/annotation/$gspecies\.$bioproj\.$WS_version_name\.$stem\.wb\n "; + $wormbase->run_command("cat $file | gzip -n -9 > $targetdir/species/$gspecies/$bioproj/annotation/$gspecies\.$bioproj\.$WS_version_name\.$stem\.wb.gz", $log); + $copied{$file}=1; + } + elsif ($basename=~/\Q$gspecies\E/) { + mkpath("$targetdir/species/$gspecies/$bioproj/annotation",1,0775); + #print "cp -f $file $targetdir/species/$gspecies/$bioproj/annotation/$gspecies\.$bioproj\.$WS_version_name\.$stem\.gaf\n "; + $wormbase->run_command("cat $file | gzip -n -9 > 
$targetdir/species/$gspecies/$bioproj/annotation/$gspecies\.$bioproj\.$WS_version_name\.$stem\.gaf.gz", $log); + $copied{$file}=1; + } + else { + } + } +} + +# Copy the rest of the files without species name to the ontology folder + foreach my $file (glob("$ace_ontology_dir/*.*")) { + my $basename = (split(/\//, $file))[-1]; + my ($stem) = split(/\./, $basename); + if (exists $copied{$file}) { + #print "Copied $file\n"; + } + else { + if ($file=~/daf\.txt/) { + #print "cp -f $file $targetdir/ONTOLOGY/$stem\.$WS_version_name\.daf\n "; + $wormbase->run_command("cp -f $file $targetdir/ONTOLOGY/$stem\.$WS_version_name\.daf", $log); + } + elsif ($file=~/rnai_phenotypes/) { + #print "cp -f $file $targetdir/ONTOLOGY/$stem\.$WS_version_name\.wb\n "; + $wormbase->run_command("cp -f $file $targetdir/ONTOLOGY/$stem\.$WS_version_name\.wb", $log); + } + else { + #print "cp -f $file $targetdir/ONTOLOGY/$stem\.$WS_version_name\.gaf\n "; + $wormbase->run_command("cp -f $file $targetdir/ONTOLOGY/$stem\.$WS_version_name\.gaf", $log); + } + + } + + } + + - $runtime = $wormbase->runtime; - $log->write_to("$runtime: Finished copying ontology files\n\n"); + $runtime = $wormbase->runtime; + $log->write_to("$runtime: Finished copying ontology files!\n\n"); + } diff --git a/scripts/make_life_stage_expression.pl b/scripts/make_life_stage_expression.pl index 99ed98a38..25f5410fa 100644 --- a/scripts/make_life_stage_expression.pl +++ b/scripts/make_life_stage_expression.pl @@ -393,6 +393,7 @@ } else { my $iterator = $db->fetch_many(-query => 'Find Gene where Status = "Live" AND Species = "Caenorhabditis elegans"'); while ($gene = $iterator->next) { + unless ($gene=~/WBGene/) {next}; # Make sure not to read VC2010 genes print "Doing $gene ...\n"; # if ($gene->name eq 'WBGene00001495') { # next; # lacks a Public_name in WS273 diff --git a/scripts/next_builder_checks.pl b/scripts/next_builder_checks.pl old mode 100755 new mode 100644 diff --git a/scripts/preFTPdumps/perSpecies/pre_ftp_dumps.pl 
b/scripts/preFTPdumps/perSpecies/pre_ftp_dumps.pl index 384a89371..198d30a09 100644 --- a/scripts/preFTPdumps/perSpecies/pre_ftp_dumps.pl +++ b/scripts/preFTPdumps/perSpecies/pre_ftp_dumps.pl @@ -56,7 +56,7 @@ { script => 'dump_geneid_list.pl', output => 'geneIDs.txt', all => 1, mem => 12000 }, { script => 'dump_molecules.pl', output => 'molecules.ace', mem => 12000 }, { script => 'dump_geneid_list.pl', output => 'geneOtherIDs.txt', options => '-other', all=>1, mem => 12000 }, - { script => 'uniprotxrefs.pl', output => 'uniprot_papers.txt', all=>1, mem => 12000 }, + { script => 'uniprotxrefs.pl', output => 'uniprot_papers.txt', all=>1, mem => 20000 }, ); diff --git a/scripts/update_Common_data.pl b/scripts/update_Common_data.pl index b55c910cf..de5df20b0 100755 --- a/scripts/update_Common_data.pl +++ b/scripts/update_Common_data.pl @@ -173,6 +173,8 @@ if ( $arg eq 'est2feature' or $arg eq 'gene_id' or + $arg eq 'worm_gene2cgc' or + $arg eq 'worm_gene2geneID' or $arg eq 'est' or $arg eq 'cds2wormpep') { push @bsub_options, ( @@ -181,8 +183,8 @@ -J => $job_name); } else { push @bsub_options, ( - -M => "3000", - -R => "\"select[mem>3000] rusage[mem=3000]\"", + -M => "6000", + -R => "\"select[mem>6000] rusage[mem=6000]\"", -J => $job_name); } my $cmd = "update_Common_data.pl -${arg}";