Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Faye Rogers committed Aug 5, 2021
2 parents b41b409 + 705e648 commit c33ab12
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 62 deletions.
13 changes: 8 additions & 5 deletions parasite/checks/Blast.dump.pl
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,18 @@
);
# connect to database
my $dbh = $dba->dbc->db_handle;
my $sql_dna = "SELECT COUNT(distinct(name)) FROM seq_region;";# count number of DNA in database
my $sql_dna = "SELECT COUNT(*) FROM seq_region LEFT JOIN seq_region_attrib
ON seq_region.seq_region_id = seq_region_attrib.seq_region_id
LEFT JOIN attrib_type
ON attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id
WHERE attrib_type.code = 'toplevel';";
my $sql_pep = "SELECT COUNT(translation_id) FROM translation;";# count number of protein in database
my $sql_trans = "SELECT COUNT(stable_id) FROM transcript;"; # count number of transcript in database
my $sql_1 = "SELECT meta_value FROM meta WHERE meta_key = 'species.url';"; # retreive the species.url from meta table
my $sql_2 = "SELECT meta_value FROM meta WHERE meta_key = 'assembly.default';";

print "working on $db\n";

#my @row = $dbh->selectrow_array($sql) ;
#print "@row\n";
my $sth_trans = $dbh->prepare($sql_trans); # prepare the query
Expand Down Expand Up @@ -71,11 +77,8 @@

my $value2;
$value2 = $sth_2->fetchrow_array;
$value2 =~ s/\s/_/g;

#while (@row = $sth->fetchrow_array)
#{
# print join(", ", @row), "\n"; # retrieve one row
#}

# preparing path for dump files
my $cDNA ;
Expand Down
43 changes: 22 additions & 21 deletions parasite/checks/FTP.dump.pl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# core _${PARASITE_VERSION}_ | while read -r i ;
# do perl FTP.dump.pl $($PARASITE_STAGING_MYSQL details script)
# -dumpPath /hps/nobackup/production/ensemblgenomes/parasite/production/dumps/WBPS15/FTP
# -dbname $i ; done
# -dbname $i -wbps_version $PARASITE_VERSION ; done


#!/usr/bin/env perl
Expand All @@ -14,7 +14,7 @@


#my $input = shift or die ;
my ($host, $user, $db, $pass, $port,$dp, $dump_file);
my ($host, $user, $db, $pass, $port,$dp, $dump_file, $wbps_version);
GetOptions
(
'host=s' => \$host,
Expand All @@ -24,6 +24,7 @@
'user=s' => \$user,
'db=s' => \$db,
'dumpPath=s' => \$dp,
'wbps_version=i' => \$wbps_version,
#'dump_file=s' => \$dump_file,
)|| die ("check command line params\n");

Expand All @@ -37,11 +38,14 @@
);

my $dbh = $dba->dbc->db_handle;
my $sql_dna = "SELECT COUNT(distinct(name)) FROM seq_region;";# count number of dna in database
my $sql_dna = "SELECT COUNT(*) FROM seq_region LEFT JOIN seq_region_attrib
ON seq_region.seq_region_id = seq_region_attrib.seq_region_id
LEFT JOIN attrib_type
ON attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id
WHERE attrib_type.code = 'toplevel';"; # count number of dna in database
my $sql_pep = "SELECT COUNT(translation_id) FROM translation;";# count number of protein in database
my $sql_trans = "SELECT COUNT(stable_id) FROM transcript;"; # count number of transcript in database
my $sql_1 = "SELECT meta_value FROM meta WHERE meta_key = 'species.url';";
my $sql_2 = "SELECT meta_value FROM meta WHERE meta_key = 'species.bioproject_id';";

# sub routine to count the sequence in dump
sub mycount
Expand Down Expand Up @@ -73,10 +77,7 @@ sub mycount
my $sth_1 = $dbh->prepare($sql_1); # prepare the query
$sth_1->execute(); # execute the query

my $sth_2 = $dbh->prepare($sql_2); # prepare the query
$sth_2->execute(); # execute the query

#my @row;
# parse the query to variable
my $value_trans;
$value_trans = $sth_trans->fetchrow_array;
Expand All @@ -90,29 +91,29 @@ sub mycount
my $value1;
$value1 = $sth_1->fetchrow_array;

my $value2;
$value2 = $sth_2->fetchrow_array;



my @val = split (/_/, $value1);
my $spe_name = lc("$val[0]_$val[1]");
my $bioproject = uc($val[2]);
$bioproject =~ s/(.)PRJ/$1_PRJ/;

# Getting path for different files in dump in variable for both ziped and unzipped file as I have mix of them
my ( $pepgz,$dna_genomicgz, $dna_genomicSMgz, $dna_genomicMgz,$transgz, $pep,$dna_genomic, $dna_genomicSM, $dna_genomicM,$trans);
$pepgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.protein.fa.gz" ;
$dna_genomicgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic.fa.gz" ;
$dna_genomicMgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_masked.fa.gz" ;
$dna_genomicSMgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_softmasked.fa.gz" ;
$transgz = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.mRNA_transcripts.fa.gz" ;
my $path = "$dp/$spe_name/$bioproject/$spe_name.$bioproject.WBPS$wbps_version";

$pepgz = "$path.protein.fa.gz" ;
$dna_genomicgz = "$path.genomic.fa.gz" ;
$dna_genomicMgz = "$path.genomic_masked.fa.gz" ;
$dna_genomicSMgz = "$path.genomic_softmasked.fa.gz" ;
$transgz = "$path.mRNA_transcripts.fa.gz" ;

#path for files

my $pep_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.protein.fa";
my $genomic_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic.fa";
my $genomicM_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_masked.fa";
my $genomicSM_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.genomic_softmasked.fa";
my $trans_path = "$dp/$spe_name/$value2/$spe_name\.$value2\.WBPS15.mRNA_transcripts.fa";
my $pep_path = "$path.protein.fa";
my $genomic_path = "$path.genomic.fa";
my $genomicM_path = "$path.genomic_masked.fa";
my $genomicSM_path = "$path.genomic_softmasked.fa";
my $trans_path = "$path.mRNA_transcripts.fa";


print "working on: $db\n";
Expand Down
6 changes: 4 additions & 2 deletions scripts/AGR/make_agr_disease_json.pl
Original file line number Diff line number Diff line change
Expand Up @@ -310,8 +310,10 @@

my $secondary_base_annot = dclone($annot); #Create copy to use for secondary annotations before conditionRelations added

my $conditions = get_condition_relations($obj);
$annot->{conditionRelations} = $conditions if @$conditions;
if (!$build) {
my $conditions = get_condition_relations($obj);
$annot->{conditionRelations} = $conditions if @$conditions;
}

push @annots, $annot;# unless ($obj_type eq 'transgene' && ! $build);

Expand Down
85 changes: 57 additions & 28 deletions scripts/AGR/run_agr_vep_pipelines.pl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
use Digest::MD5;
use Log_files;
use Wormbase;
use DateTime;

const my $FMS_LATEST_PREFIX => 'https://fms.alliancegenome.org/api/datafile/by/';
const my $FMS_LATEST_SUFFIX => '?latest=true';
Expand All @@ -30,11 +31,13 @@
'SGDr64' => 'SGD',
'WBcel235' => 'WB',
'GRCz11' => 'ZFIN',
'HUMAN' => 'HUMAN'
);
const my $BASE_DIR => $ENV{'AGR_VEP_BASE_DIR'} . '/' . $ENV{'AGR_RELEASE'} . '/' . $ENV{'DOWNLOAD_DATE'};
const my $HGNC_FILE_URL => 'http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt';
const my @IDS_TO_MAP => ('symbol', 'entrez_id', 'ensembl_gene_id', 'vega_id', 'ucsc_id', 'refseq_accession', 'mgd_id', 'rgd_id', 'omim_id', 'agr');
const my $CHECKSUMS_FILE => $ENV{'AGR_VEP_BASE_DIR'} . '/mod_file_checksums.txt';
const my $HUMAN_FILES_DIR => $ENV{'AGR_VEP_BASE_DIR'} . '/human_vep_input_files';

const my %REFSEQ_CHROMOSOMES => (
'FB' => {
Expand Down Expand Up @@ -198,38 +201,41 @@
},
);

my ($url, $test, $logfile, $debug, $password, $cleanup, $nocheck, $overwrite, $help);
my ($url, $test, $logfile, $debug, $password, $cleanup, $nocheck, $overwrite, $external_human_gff, $help);
my $stages = '1,2,3,4,5';
my $mods_string = 'FB,MGI,RGD,SGD,WB,ZFIN,HUMAN';

GetOptions(
"mods|m=s" => \$mods_string,
"stages|s=s" => \$stages,
"password|p=s" => \$password,
"logfile|l=s" => \$logfile,
"debug|d=s" => \$debug,
"url|u=s" => \$url,
"test|t" => \$test,
"cleanup|c" => \$cleanup,
"overwrite|o" => \$overwrite,
"nocheck|n" => \$nocheck,
"help|h" => \$help,
"mods|m=s" => \$mods_string,
"stages|s=s" => \$stages,
"password|p=s" => \$password,
"logfile|l=s" => \$logfile,
"debug|d=s" => \$debug,
"url|u=s" => \$url,
"test|t" => \$test,
"cleanup|c" => \$cleanup,
"overwrite|o" => \$overwrite,
"nocheck|n" => \$nocheck,
"external_human_gff|e" => \$external_human_gff,
"help|h" => \$help,
) or print_usage();

print_usage() if $help;

my $start_time = DateTime->now->strftime('%Y%m%d%H%M%S');

my @mods = split(',', $mods_string);
$logfile = $BASE_DIR . '/submission.log' if !$logfile;;
$logfile = "${BASE_DIR}/submission.${start_time}.log" if !$logfile;;

my $log = Log_files->make_log($logfile, $debug);
download_from_agr(\@mods, $url, $overwrite, $log) if $stages =~ /1/;
download_from_agr(\@mods, $start_time, $url, $overwrite, $external_human_gff, $log) if $stages =~ /1/;

for my $mod (@mods) {
my ($checksums, $run_stages) = check_for_new_data($mod, $nocheck, $log);
chdir "$BASE_DIR/$mod";
if ($stages =~ /2/) {
if ($run_stages->{2}) {
process_input_files($mod, $log);
process_input_files($mod, $external_human_gff, $log);
}
else {
$log->write_to("Skipping processing of input files for $mod " .
Expand Down Expand Up @@ -371,20 +377,21 @@ sub update_checksums {


sub download_from_agr {
my ($mods, $url, $overwrite, $log) = @_;
my ($mods, $start_time, $url, $overwrite, $external_human_gff, $log) = @_;

my $download_urls = defined $url ? get_urls_from_snapshot($url) : get_latest_urls();

make_path($BASE_DIR) unless -d $BASE_DIR;
my $input_files_file = "${BASE_DIR}/VEP_input_files.txt";
open (FILES, '>>', $input_files_file) or $log->lod_and_die("Couldn't open $input_files_file to append data\n");
my $input_files_file = "${BASE_DIR}/VEP_input_files.${start_time}.txt";
open (FILES, '>>', $input_files_file) or $log->log_and_die("Couldn't open $input_files_file to append data\n");
for my $mod (@$mods) {
my $time = localtime();
print FILES "${mod}: $time\n";
my $mod_dir = "$BASE_DIR/$mod";
make_path($mod_dir) unless -d $mod_dir;
chdir $mod_dir;
for my $datatype (keys %{$download_urls->{$mod}}){
next if $external_human_gff and $mod eq 'HUMAN' and $datatype eq 'GFF';
my $extension = $DATATYPE_EXTENSIONS{$datatype};
if (-e "${mod}_${datatype}.${extension}" and !$overwrite) {
$log->write_to("Using previously downloaded $mod $datatype\n");
Expand All @@ -403,8 +410,18 @@ sub download_from_agr {
unlink "${mod}_FASTA.fa.fai" if -e "${mod}_FASTA.fa.fai";
run_system_cmd('python3 ' . $ENV{'CVS_DIR'} . "/AGR/agr_variations_json2vcf.py -j ${mod}_VARIATION.json -m $mod -g ${mod}_GFF.gff " .
"-f ${mod}_FASTA.fa -o ${mod}_VCF.vcf", "Converting $mod phenotypic variants JSON to VCF", $log) if -e "${mod}_VARIATION.json";
if ($mod eq 'HUMAN') {
# Copy across files not on FMS from local directory
my @files_to_copy = ('HUMAN_HTVCF.vcf');
push @files_to_copy, 'HUMAN_GFF.gff' if $external_human_gff;
for my $file (@files_to_copy) {
run_system_cmd("cp ${HUMAN_FILES_DIR}/${file} $file", "Copying local $file file to working directory", $log);
}
}

}
close (FILES);


return;
}
Expand Down Expand Up @@ -498,18 +515,18 @@ sub check_if_actually_compressed {


sub process_input_files {
my ($mod, $log) = @_;
my ($mod, $external_human_gff, $log) = @_;

cleanup_intermediate_files($mod, $log);

sort_vcf_files($mod, $log);
sort_vcf_files($mod, $log) unless $mod eq 'HUMAN'; # No need for human as using local (sorted) file

my $chr_map;
check_chromosome_map($mod, $log);
convert_fasta_headers($mod, $log);
convert_vcf_chromosomes($mod, 'VCF', $log);

munge_gff($mod, $log);
munge_gff($mod, $external_human_gff, $log);
run_system_cmd("bgzip -c ${mod}_FASTA.refseq.fa > ${mod}_FASTA.refseq.fa.gz", "Compressing $mod FASTA", $log);
run_system_cmd("sort -k1,1 -k4,4n -k5,5n -t\$'\\t' ${mod}_GFF.refseq.gff | bgzip -c > ${mod}_GFF.refseq.gff.gz",
"Sorting and compressing $mod GFF", $log);
Expand All @@ -524,7 +541,6 @@ sub calculate_pathogenicity_predictions {

backup_pathogenicity_prediction_db($mod, $password, $log);

#my $lsf_queue = $test ? $ENV{'LSF_TEST_QUEUE'} : $ENV{'LSF_DEFAULT_QUEUE'}; # Waiting for clarification of production queue usage for AGR work
my $lsf_queue = $ENV{'LSF_DEFAULT_QUEUE'};

my $init_cmd = "init_pipeline.pl VepProteinFunction::VepProteinFunction_conf -mod $mod" .
Expand Down Expand Up @@ -707,7 +723,8 @@ sub sort_vcf_files {


sub remove_mirna_primary_transcripts {
my (%mirna_parents, %parents, $log);
my $log = shift;
my (%mirna_parents, %parents);

$log->write_to("Getting RefSeq HUMAN miRNA details from GFF\n");

Expand Down Expand Up @@ -765,15 +782,15 @@ sub remove_mirna_primary_transcripts {


sub munge_gff {
my ($mod, $log) = @_;
my ($mod, $external_human_gff, $log) = @_;

run_system_cmd("rm ${mod}_GFF.refseq.gff", "Deleting old munged GFF file", $log) if -e "${mod}_GFF.refseq.gff";

my $reverse_map = get_reverse_chromosome_map($mod);

my $hgnc_id_map;
if ($mod eq 'HUMAN') {
remove_mirna_primary_transcripts();
if ($mod eq 'HUMAN' and $external_human_gff) {
remove_mirna_primary_transcripts($log);
$hgnc_id_map = get_hgnc_id_map($log);
}

Expand Down Expand Up @@ -835,7 +852,7 @@ sub munge_gff {
}
}

if ($mod eq 'HUMAN') {
if ($mod eq 'HUMAN' and $external_human_gff) {
$line = convert_to_hgnc_gene_ids(\@columns, $hgnc_id_map);
}

Expand Down Expand Up @@ -886,8 +903,18 @@ sub convert_to_hgnc_gene_ids {
my %attr = split /[=;]/, $columns->[8];
my @pairs;
my %keys_to_map = map {$_ => 1} ('ID', 'Parent', 'gene_id', 'id');
# Dbxref=GeneID:100996442,Genbank:XR_001737578.2
for my $key (keys %attr) {
my $value = $attr{$key};
if ($key eq 'Dbxref') {
my @dbxrefs = split ',', $value;
my @dbxrefs_to_keep;
for my $dbxref (@dbxrefs) {
push @dbxrefs_to_keep, $dbxref unless $dbxref =~ /^GeneID:/
}
push @pairs, 'Dbxref=' . join(',', @dbxrefs_to_keep) if @dbxrefs_to_keep;
next;
}
$value =~ s/^gene[:\-]//;
if (exists $keys_to_map{$key} and exists $hgnc_id_map->{$value}) {
push @pairs, $key . '=' . $hgnc_id_map->{$value};
Expand Down Expand Up @@ -1084,7 +1111,9 @@ sub cleanup_intermediate_files {


sub get_hgnc_id_map {
my ($file, $log) = $HGNC_FILE_URL =~ /\/([^\/]+)$/;
my $log = shift;

my ($file) = $HGNC_FILE_URL =~ /\/([^\/]+)$/;
run_system_cmd("curl -O $HGNC_FILE_URL", 'Downloading HGNC gene ID map file', $log) unless -e $file;

my $first_line = 1;
Expand Down
2 changes: 2 additions & 0 deletions scripts/merge_split_camaces.pl
Original file line number Diff line number Diff line change
Expand Up @@ -719,12 +719,14 @@ sub load_curation_data {
"$blat_dir/virtual_objects.$species.ci.OST.$species.ace",
"$blat_dir/virtual_objects.$species.ci.RST.$species.ace",
"$blat_dir/virtual_objects.$species.ci.ncRNA.$species.ace",
"$blat_dir/virtual_objects.$species.ci.elegans_Nanopore.ace",
"$blat_dir/$species.blat.${species}_OST.ace",
"$blat_dir/$species.blat.${species}_RST.ace",
"$blat_dir/$species.blat.${species}_ncRNA.ace",
"$blat_dir/$species.good_introns.OST.ace",
"$blat_dir/$species.good_introns.RST.ace",
"$blat_dir/$species.good_introns.ncRNA.ace",
"$blat_dir/$species.good_introns.Nanopore.ace",
);
}
push (@BLATfiles,
Expand Down
Loading

0 comments on commit c33ab12

Please sign in to comment.