From addf3b991dcba9af6a5e9cfc09ac277e383fb9c7 Mon Sep 17 00:00:00 2001 From: Mark Quinton-Tulloch Date: Fri, 19 Apr 2024 08:12:01 +0100 Subject: [PATCH] Add ENSEMBL xrefs --- .../make_agr_gene_json.pl | 82 ++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/scripts/AGR/curation_db_submissions/make_agr_gene_json.pl b/scripts/AGR/curation_db_submissions/make_agr_gene_json.pl index 374753ba3..cd5be828e 100755 --- a/scripts/AGR/curation_db_submissions/make_agr_gene_json.pl +++ b/scripts/AGR/curation_db_submissions/make_agr_gene_json.pl @@ -9,7 +9,7 @@ use Wormbase; my ($help, $debug, $test, $verbose, $store, $wormbase, $schema); -my ($outfile, $acedbpath, $ws_version, $out_fh); +my ($outfile, $acedbpath, $ws_version, $out_fh, $gtf_file); const my $LINKML_SCHEMA => 'v2.2.3'; @@ -40,7 +40,8 @@ "store:s" => \$store, "database:s" => \$acedbpath, "outfile:s" => \$outfile, - "wsversion=s" => \$ws_version + "wsversion=s" => \$ws_version, + "gtf=s" => \$gtf_file ); if ( $store ) { @@ -61,6 +62,8 @@ my @genes; +my $locs = get_location_data($db, $gtf_file); + my $it = $db->fetch_many(-query => $query); while (my $obj = $it->next) { @@ -133,7 +136,7 @@ my @baseXrefs = qw(gene gene/expression gene/references); push @baseXrefs, 'gene/phenotypes' if ($obj->Allele and $obj->Allele->Phenotype) || ($obj->RNAi_result and $obj->RNAi_result->Phenotype); push @baseXrefs, 'gene/expression_images' if ($obj->Expr_pattern && grep {$_->Picture} $obj->Expr_pattern); - + for my $baseXref (@baseXrefs) { push @xrefs, { referenced_curie => 'WB:' . $obj->name, @@ -153,6 +156,16 @@ }; } + # Ensembl xrefs, only for genomically placed genes + if (defined $locs and exists $locs->{$obj->name}) { + push @xrefs, { + referenced_curie => 'ENSEMBL:' . $obj->name, + page_area => 'default', + display_name => 'ENSEMBL:' . $obj->name, + prefix => 'ENSEMBL' + }; + } + my $is_obsolete = $obj->Status && $obj->Status->name eq 'Dead' ? JSON::true : JSON::false; @@ -341,4 +354,67 @@ sub get_paper { return $publication_id; } + + +sub get_location_data { + my ($acedb, $gtf) = @_; + + # get the assembly name for the canonical bioproject + my ($assembly_name, $fh, %locs); + + my $species_obj = $acedb->fetch(-class => 'Species', -name => $wormbase->full_name); + my @seq_col = $species_obj->at('Assembly'); + + foreach my $seq_col_name (@seq_col) { + my ($bioproj); + + my $seq_col = $seq_col_name->fetch; + my $this_assembly_name = $seq_col->Name; + + my @db = $seq_col->at('Origin.DB_info.Database'); + foreach my $db (@db) { + if ($db->name eq 'NCBI_BioProject') { + $bioproj = $db->right->right->name; + } + } + + if (defined $bioproj and $wormbase->ncbi_bioproject eq $bioproj) { + $assembly_name = $this_assembly_name->name; + last; + } + } + + if (not defined $assembly_name) { + die "Could not find name of current assembly for " . $wormbase->species() . "\n"; + } + + # parse the GTF + if ($gtf =~ /\.gz$/) { + open( $fh, "gunzip -c $gtf |") + or die "Could not open gunzip stream GTF file $gtf\n"; + } else { + open( $fh, $gtf) + or die "Could not open GTF file $gtf\n"; + } + + while(<$fh>) { + /^\#/ and next; + + my @l = split(/\t/, $_); + next if $l[2] ne "gene"; + + my ($wbgid) = $l[8] =~ /gene_id\s+\"(\S+)\"/; + + $locs{$wbgid} = { + assembly => $assembly_name, + chromosome => $l[0], + startPosition => $l[3] + 0, + endPosition => $l[4] + 0, + strand => $l[6], + }; + } + + return \%locs; +} + 1;