Merge branch 'master' of https://github.com/WormBase/wormbase-pipeline

WormBase · Aug 5, 2021 · c33ab12 · c33ab12
2 parents b41b409 + 705e648
commit c33ab12
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 62 deletions.
diff --git a/parasite/checks/Blast.dump.pl b/parasite/checks/Blast.dump.pl
@@ -31,12 +31,18 @@
 );
 # connect to database
 my $dbh = $dba->dbc->db_handle;
-my $sql_dna = "SELECT COUNT(distinct(name)) FROM seq_region;";# count number of DNA in database
+my $sql_dna = "SELECT COUNT(*) FROM seq_region LEFT JOIN seq_region_attrib
+               ON seq_region.seq_region_id = seq_region_attrib.seq_region_id 
+               LEFT JOIN attrib_type
+               ON attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id
+               WHERE attrib_type.code = 'toplevel';";
 my $sql_pep = "SELECT COUNT(translation_id) FROM translation;";# count number of protein in database
 my $sql_trans = "SELECT COUNT(stable_id) FROM transcript;"; # count number of transcript in database
 my $sql_1 = "SELECT meta_value FROM meta WHERE meta_key = 'species.url';"; # retreive the species.url from meta table
 my $sql_2 = "SELECT meta_value FROM meta WHERE meta_key = 'assembly.default';";
 
+print "working on $db\n";
+
 #my @row = $dbh->selectrow_array($sql)  ;
 #print "@row\n";
 my $sth_trans = $dbh->prepare($sql_trans);          # prepare the query
@@ -71,11 +77,8 @@
 
 my $value2; 
 $value2 = $sth_2->fetchrow_array; 
+$value2 =~ s/\s/_/g;
 
-#while (@row = $sth->fetchrow_array) 
-#{  
- #  print join(", ", @row), "\n";        # retrieve one row
-#}
 
 # preparing path for dump files
 my $cDNA ; 

diff --git a/parasite/checks/FTP.dump.pl b/parasite/checks/FTP.dump.pl
@@ -2,7 +2,7 @@
 # core _${PARASITE_VERSION}_ | while read -r i ; 
 # do perl FTP.dump.pl $($PARASITE_STAGING_MYSQL details script) 
 #  -dumpPath /hps/nobackup/production/ensemblgenomes/parasite/production/dumps/WBPS15/FTP
-# -dbname $i ; done
+# -dbname $i -wbps_version $PARASITE_VERSION ; done
 
 
 #!/usr/bin/env perl
@@ -14,7 +14,7 @@
 
 
 #my $input = shift or die ; 
-my ($host, $user, $db, $pass, $port,$dp, $dump_file);
+my ($host, $user, $db, $pass, $port,$dp, $dump_file, $wbps_version);
 GetOptions
 (
 'host=s'       => \$host,
@@ -24,6 +24,7 @@
 'user=s'       => \$user,
 'db=s'       => \$db,
 'dumpPath=s'   => \$dp,
+'wbps_version=i' => \$wbps_version,
 #'dump_file=s'  => \$dump_file,
 )|| die ("check command line params\n");
 
@@ -37,11 +38,14 @@
 );
 
 my $dbh = $dba->dbc->db_handle;
-my $sql_dna = "SELECT COUNT(distinct(name)) FROM seq_region;";# count number of dna in database
+my $sql_dna = "SELECT COUNT(*) FROM seq_region LEFT JOIN seq_region_attrib
+               ON seq_region.seq_region_id = seq_region_attrib.seq_region_id 
+	       LEFT JOIN attrib_type
+	       ON attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id
+               WHERE attrib_type.code = 'toplevel';"; # count number of dna in database
 my $sql_pep = "SELECT COUNT(translation_id) FROM translation;";# count number of protein in database
 my $sql_trans = "SELECT COUNT(stable_id) FROM transcript;"; # count number of transcript in database
 my $sql_1 = "SELECT meta_value FROM meta WHERE meta_key = 'species.url';";
-my $sql_2 = "SELECT meta_value FROM meta WHERE meta_key = 'species.bioproject_id';"; 
 
 # sub routine to count the sequence in dump
 sub mycount 
@@ -73,10 +77,7 @@ sub mycount
 my $sth_1 = $dbh->prepare($sql_1);          # prepare the query
 $sth_1->execute();                        # execute the query
 
-my $sth_2 = $dbh->prepare($sql_2);          # prepare the query
-$sth_2->execute();                        # execute the query
 
-#my @row;
 # parse the query to variable 
 my $value_trans; 
 $value_trans = $sth_trans->fetchrow_array; 
@@ -90,29 +91,29 @@ sub mycount
 my $value1; 
 $value1 = $sth_1->fetchrow_array; 
 
-my $value2; 
-$value2 = $sth_2->fetchrow_array; 
-
-
 
 my @val = split (/_/, $value1);
 my $spe_name = lc("$val[0]_$val[1]");
+my $bioproject = uc($val[2]);
+$bioproject =~ s/(.)PRJ/$1_PRJ/;
 
 # Getting path for different files in dump in variable for both ziped and unzipped file as I have mix of them
 my ( $pepgz,$dna_genomicgz, $dna_genomicSMgz, $dna_genomicMgz,$transgz, $pep,$dna_genomic, $dna_genomicSM, $dna_genomicM,$trans);
-$pepgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.protein.fa.gz" ;
-$dna_genomicgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic.fa.gz" ;
-$dna_genomicMgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_masked.fa.gz" ;
-$dna_genomicSMgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_softmasked.fa.gz" ;
-$transgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.mRNA_transcripts.fa.gz" ;
+my $path = "$dp/$spe_name/$bioproject/$spe_name.$bioproject.WBPS$wbps_version";
+
+$pepgz = "$path.protein.fa.gz" ;
+$dna_genomicgz = "$path.genomic.fa.gz" ;
+$dna_genomicMgz = "$path.genomic_masked.fa.gz" ;
+$dna_genomicSMgz = "$path.genomic_softmasked.fa.gz" ;
+$transgz = "$path.mRNA_transcripts.fa.gz" ;
 
 #path for files 
 
-my $pep_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.protein.fa";
-my $genomic_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic.fa";
-my $genomicM_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_masked.fa";
-my $genomicSM_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_softmasked.fa";
-my $trans_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.mRNA_transcripts.fa";
+my $pep_path = "$path.protein.fa";
+my $genomic_path = "$path.genomic.fa";
+my $genomicM_path = "$path.genomic_masked.fa";
+my $genomicSM_path = "$path.genomic_softmasked.fa";
+my $trans_path = "$path.mRNA_transcripts.fa";
 
 
 print "working on: $db\n";

diff --git a/scripts/AGR/make_agr_disease_json.pl b/scripts/AGR/make_agr_disease_json.pl
@@ -310,8 +310,10 @@
 
   my $secondary_base_annot = dclone($annot); #Create copy to use for secondary annotations before conditionRelations added
 
-  my $conditions = get_condition_relations($obj);
-  $annot->{conditionRelations} = $conditions if @$conditions;
+  if (!$build) {
+      my $conditions = get_condition_relations($obj);
+      $annot->{conditionRelations} = $conditions if @$conditions;
+  }
 
   push @annots, $annot;# unless ($obj_type eq 'transgene' && ! $build);
 

diff --git a/scripts/AGR/run_agr_vep_pipelines.pl b/scripts/AGR/run_agr_vep_pipelines.pl
@@ -11,6 +11,7 @@
 use Digest::MD5;
 use Log_files;
 use Wormbase;
+use DateTime;
 
 const my $FMS_LATEST_PREFIX => 'https://fms.alliancegenome.org/api/datafile/by/';
 const my $FMS_LATEST_SUFFIX => '?latest=true';
@@ -30,11 +31,13 @@
 			 'SGDr64'   => 'SGD',
 			 'WBcel235' => 'WB',
 			 'GRCz11'   => 'ZFIN',
+			 'HUMAN'    => 'HUMAN'
     );
 const my $BASE_DIR => $ENV{'AGR_VEP_BASE_DIR'} . '/' . $ENV{'AGR_RELEASE'} . '/' . $ENV{'DOWNLOAD_DATE'};
 const my $HGNC_FILE_URL => 'http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt';
 const my @IDS_TO_MAP => ('symbol', 'entrez_id', 'ensembl_gene_id', 'vega_id', 'ucsc_id', 'refseq_accession', 'mgd_id', 'rgd_id', 'omim_id', 'agr');
 const my $CHECKSUMS_FILE => $ENV{'AGR_VEP_BASE_DIR'} . '/mod_file_checksums.txt';
+const my $HUMAN_FILES_DIR => $ENV{'AGR_VEP_BASE_DIR'} . '/human_vep_input_files';
 
 const my %REFSEQ_CHROMOSOMES => (
     'FB'   => {
@@ -198,38 +201,41 @@
     },
     );
 
-my ($url, $test, $logfile, $debug, $password, $cleanup, $nocheck, $overwrite, $help);
+my ($url, $test, $logfile, $debug, $password, $cleanup, $nocheck, $overwrite, $external_human_gff, $help);
 my $stages = '1,2,3,4,5';
 my $mods_string = 'FB,MGI,RGD,SGD,WB,ZFIN,HUMAN';
 
 GetOptions(
-    "mods|m=s"     => \$mods_string,
-    "stages|s=s"   => \$stages,
-    "password|p=s" => \$password,
-    "logfile|l=s"  => \$logfile,
-    "debug|d=s"    => \$debug,
-    "url|u=s"      => \$url,
-    "test|t"       => \$test,
-    "cleanup|c"    => \$cleanup,
-    "overwrite|o"  => \$overwrite,
-    "nocheck|n"    => \$nocheck,
-    "help|h"       => \$help,
+    "mods|m=s"              => \$mods_string,
+    "stages|s=s"            => \$stages,
+    "password|p=s"          => \$password,
+    "logfile|l=s"           => \$logfile,
+    "debug|d=s"             => \$debug,
+    "url|u=s"               => \$url,
+    "test|t"                => \$test,
+    "cleanup|c"             => \$cleanup,
+    "overwrite|o"           => \$overwrite,
+    "nocheck|n"             => \$nocheck,
+    "external_human_gff|e"  => \$external_human_gff,
+    "help|h"                => \$help,
     ) or print_usage();
 
 print_usage() if $help;
 
+my $start_time = DateTime->now->strftime('%Y%m%d%H%M%S');
+
 my @mods = split(',', $mods_string);
-$logfile = $BASE_DIR . '/submission.log' if !$logfile;;
+$logfile = "${BASE_DIR}/submission.${start_time}.log" if !$logfile;;
 
 my $log = Log_files->make_log($logfile, $debug);
-download_from_agr(\@mods, $url, $overwrite, $log) if $stages =~ /1/;
+download_from_agr(\@mods, $start_time, $url, $overwrite, $external_human_gff, $log) if $stages =~ /1/;
 
 for my $mod (@mods) {
     my ($checksums, $run_stages) = check_for_new_data($mod, $nocheck, $log);
     chdir "$BASE_DIR/$mod";
     if ($stages =~ /2/) {
 	if ($run_stages->{2}) {
-	    process_input_files($mod, $log);
+	    process_input_files($mod, $external_human_gff, $log);
 	}
 	else {
 	    $log->write_to("Skipping processing of input files for $mod " .
@@ -371,20 +377,21 @@ sub update_checksums {
 
 
 sub download_from_agr {
-    my ($mods, $url, $overwrite, $log) = @_;
+    my ($mods, $start_time, $url, $overwrite, $external_human_gff, $log) = @_;
 
     my $download_urls = defined $url ? get_urls_from_snapshot($url) : get_latest_urls();
 
     make_path($BASE_DIR) unless -d $BASE_DIR;
-    my $input_files_file = "${BASE_DIR}/VEP_input_files.txt";
-    open (FILES, '>>', $input_files_file) or $log->lod_and_die("Couldn't open $input_files_file to append data\n");
+    my $input_files_file = "${BASE_DIR}/VEP_input_files.${start_time}.txt";
+    open (FILES, '>>', $input_files_file) or $log->log_and_die("Couldn't open $input_files_file to append data\n");
     for my $mod (@$mods) {
 	my $time = localtime();
 	print FILES "${mod}: $time\n";
 	my $mod_dir = "$BASE_DIR/$mod";
 	make_path($mod_dir) unless -d $mod_dir;
 	chdir $mod_dir;
 	for my $datatype (keys %{$download_urls->{$mod}}){
+	    next if $external_human_gff and $mod eq 'HUMAN' and $datatype eq 'GFF';
 	    my $extension = $DATATYPE_EXTENSIONS{$datatype};
 	    if (-e "${mod}_${datatype}.${extension}" and !$overwrite) {
 		$log->write_to("Using previously downloaded $mod $datatype\n");
@@ -403,8 +410,18 @@ sub download_from_agr {
 	unlink "${mod}_FASTA.fa.fai" if -e "${mod}_FASTA.fa.fai";
 	run_system_cmd('python3 ' . $ENV{'CVS_DIR'} . "/AGR/agr_variations_json2vcf.py -j ${mod}_VARIATION.json -m $mod -g ${mod}_GFF.gff " .
 		       "-f ${mod}_FASTA.fa -o ${mod}_VCF.vcf", "Converting $mod phenotypic variants JSON to VCF", $log) if -e "${mod}_VARIATION.json";
+	if ($mod eq 'HUMAN') {
+	    # Copy across files not on FMS from local directory
+	    my @files_to_copy = ('HUMAN_HTVCF.vcf');
+	    push @files_to_copy, 'HUMAN_GFF.gff' if $external_human_gff;
+	    for my $file (@files_to_copy) {
+	    run_system_cmd("cp ${HUMAN_FILES_DIR}/${file} $file", "Copying local $file file to working directory", $log);
+	}
+    }
+
     }
     close (FILES);
+
 
     return;
 }
@@ -498,18 +515,18 @@ sub check_if_actually_compressed {
 
 
 sub process_input_files {
-    my ($mod, $log) = @_;
+    my ($mod, $external_human_gff, $log) = @_;
 
     cleanup_intermediate_files($mod, $log);
 
-    sort_vcf_files($mod, $log);
+    sort_vcf_files($mod, $log) unless $mod eq 'HUMAN'; # No need for human as using local (sorted) file
 
     my $chr_map;
     check_chromosome_map($mod, $log);
     convert_fasta_headers($mod, $log);
     convert_vcf_chromosomes($mod, 'VCF', $log);
 
-    munge_gff($mod, $log);
+    munge_gff($mod, $external_human_gff, $log);
     run_system_cmd("bgzip -c ${mod}_FASTA.refseq.fa > ${mod}_FASTA.refseq.fa.gz", "Compressing $mod FASTA", $log);
     run_system_cmd("sort -k1,1 -k4,4n -k5,5n -t\$'\\t' ${mod}_GFF.refseq.gff | bgzip -c > ${mod}_GFF.refseq.gff.gz",
     	             "Sorting and compressing $mod GFF", $log);
@@ -524,7 +541,6 @@ sub calculate_pathogenicity_predictions {
 
     backup_pathogenicity_prediction_db($mod, $password, $log);
 
-    #my $lsf_queue = $test ? $ENV{'LSF_TEST_QUEUE'} : $ENV{'LSF_DEFAULT_QUEUE'}; # Waiting for clarification of production queue usage for AGR work
     my $lsf_queue = $ENV{'LSF_DEFAULT_QUEUE'};
 
     my $init_cmd = "init_pipeline.pl VepProteinFunction::VepProteinFunction_conf -mod $mod" .
@@ -707,7 +723,8 @@ sub sort_vcf_files {
 
 
 sub remove_mirna_primary_transcripts {
-    my (%mirna_parents, %parents, $log);
+    my $log = shift;
+    my (%mirna_parents, %parents);
 
     $log->write_to("Getting RefSeq HUMAN miRNA details from GFF\n");
 
@@ -765,15 +782,15 @@ sub remove_mirna_primary_transcripts {
 
 
 sub munge_gff {
-    my ($mod, $log) = @_;
+    my ($mod, $external_human_gff, $log) = @_;
 
     run_system_cmd("rm ${mod}_GFF.refseq.gff", "Deleting old munged GFF file", $log) if -e "${mod}_GFF.refseq.gff";
 
     my $reverse_map = get_reverse_chromosome_map($mod);
 
     my $hgnc_id_map;
-    if ($mod eq 'HUMAN') {
-	remove_mirna_primary_transcripts();
+    if ($mod eq 'HUMAN' and $external_human_gff) {
+	remove_mirna_primary_transcripts($log);
 	$hgnc_id_map = get_hgnc_id_map($log);
     }
 
@@ -835,7 +852,7 @@ sub munge_gff {
 	    }
 	}
 
-	if ($mod eq 'HUMAN') {
+	if ($mod eq 'HUMAN' and $external_human_gff) {
 	    $line = convert_to_hgnc_gene_ids(\@columns, $hgnc_id_map);
 	}
 
@@ -886,8 +903,18 @@ sub convert_to_hgnc_gene_ids {
     my %attr = split /[=;]/, $columns->[8];
     my @pairs;
     my %keys_to_map = map {$_ => 1} ('ID', 'Parent', 'gene_id', 'id');
+    # Dbxref=GeneID:100996442,Genbank:XR_001737578.2
     for my $key (keys %attr) {
 	my $value = $attr{$key};
+	if ($key eq 'Dbxref') {
+	    my @dbxrefs = split ',', $value;
+	    my @dbxrefs_to_keep;
+	    for my $dbxref (@dbxrefs) {
+		push @dbxrefs_to_keep, $dbxref unless $dbxref =~ /^GeneID:/
+	    }
+	    push @pairs, 'Dbxref=' . join(',', @dbxrefs_to_keep) if @dbxrefs_to_keep;
+	    next;
+	}
 	$value =~ s/^gene[:\-]//;
 	if (exists $keys_to_map{$key} and exists $hgnc_id_map->{$value}) {
 	    push @pairs, $key . '=' . $hgnc_id_map->{$value};
@@ -1084,7 +1111,9 @@ sub cleanup_intermediate_files {
 
 
 sub get_hgnc_id_map {
-    my ($file, $log) = $HGNC_FILE_URL =~ /\/([^\/]+)$/;
+    my $log = shift;
+
+    my ($file) = $HGNC_FILE_URL =~ /\/([^\/]+)$/;
     run_system_cmd("curl -O $HGNC_FILE_URL", 'Downloading HGNC gene ID map file', $log) unless -e $file;
 
     my $first_line = 1;

diff --git a/scripts/merge_split_camaces.pl b/scripts/merge_split_camaces.pl
@@ -719,12 +719,14 @@ sub load_curation_data {
 		  "$blat_dir/virtual_objects.$species.ci.OST.$species.ace",
 		  "$blat_dir/virtual_objects.$species.ci.RST.$species.ace",
 		  "$blat_dir/virtual_objects.$species.ci.ncRNA.$species.ace",
+		  "$blat_dir/virtual_objects.$species.ci.elegans_Nanopore.ace",
 		  "$blat_dir/$species.blat.${species}_OST.ace",
 		  "$blat_dir/$species.blat.${species}_RST.ace",
 		  "$blat_dir/$species.blat.${species}_ncRNA.ace",
 		  "$blat_dir/$species.good_introns.OST.ace",
 		  "$blat_dir/$species.good_introns.RST.ace",
 		  "$blat_dir/$species.good_introns.ncRNA.ace",
+		  "$blat_dir/$species.good_introns.Nanopore.ace",
 		 );
 	  }
     push (@BLATfiles,