Skip to content

Commit

Permalink
Add ENSEMBL xrefs
Browse files Browse the repository at this point in the history
  • Loading branch information
markquintontulloch committed Apr 19, 2024
1 parent 3ae0c3e commit addf3b9
Showing 1 changed file with 79 additions and 3 deletions.
82 changes: 79 additions & 3 deletions scripts/AGR/curation_db_submissions/make_agr_gene_json.pl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
use Wormbase;

my ($help, $debug, $test, $verbose, $store, $wormbase, $schema);
my ($outfile, $acedbpath, $ws_version, $out_fh);
my ($outfile, $acedbpath, $ws_version, $out_fh, $gtf_file);

const my $LINKML_SCHEMA => 'v2.2.3';

Expand Down Expand Up @@ -40,7 +40,8 @@
"store:s" => \$store,
"database:s" => \$acedbpath,
"outfile:s" => \$outfile,
"wsversion=s" => \$ws_version
"wsversion=s" => \$ws_version,
"gtf=s" => \$gtf_file
);

if ( $store ) {
Expand All @@ -61,6 +62,8 @@

my @genes;

my $locs = get_location_data($db, $gtf_file);

my $it = $db->fetch_many(-query => $query);

while (my $obj = $it->next) {
Expand Down Expand Up @@ -133,7 +136,7 @@
my @baseXrefs = qw(gene gene/expression gene/references);
push @baseXrefs, 'gene/phenotypes' if ($obj->Allele and $obj->Allele->Phenotype) || ($obj->RNAi_result and $obj->RNAi_result->Phenotype);
push @baseXrefs, 'gene/expression_images' if ($obj->Expr_pattern && grep {$_->Picture} $obj->Expr_pattern);

for my $baseXref (@baseXrefs) {
push @xrefs, {
referenced_curie => 'WB:' . $obj->name,
Expand All @@ -153,6 +156,16 @@
};
}

# Ensembl xrefs, only for genomically placed genes
if (defined $locs and exists $locs->{$obj->name}) {
push @xrefs, {
referenced_curie => 'ENSEMBL:' . $obj->name,
page_area => 'default',
display_name => 'ENSEMBL:' . $obj->name,
prefix => 'ENSEMBL'
};
}



my $is_obsolete = $obj->Status && $obj->Status->name eq 'Dead' ? JSON::true : JSON::false;
Expand Down Expand Up @@ -341,4 +354,67 @@ sub get_paper {
return $publication_id;
}



sub get_location_data {
my ($acedb, $gtf) = @_;

# get the assembly name for the canonical bioproject
my ($assembly_name, $fh, %locs);

my $species_obj = $acedb->fetch(-class => 'Species', -name => $wormbase->full_name);
my @seq_col = $species_obj->at('Assembly');

foreach my $seq_col_name (@seq_col) {
my ($bioproj);

my $seq_col = $seq_col_name->fetch;
my $this_assembly_name = $seq_col->Name;

my @db = $seq_col->at('Origin.DB_info.Database');
foreach my $db (@db) {
if ($db->name eq 'NCBI_BioProject') {
$bioproj = $db->right->right->name;
}
}

if (defined $bioproj and $wormbase->ncbi_bioproject eq $bioproj) {
$assembly_name = $this_assembly_name->name;
last;
}
}

if (not defined $assembly_name) {
die "Could not find name of current assembly for " . $wormbase->species() . "\n";
}

# parse the GTF
if ($gtf =~ /\.gz$/) {
open( $fh, "gunzip -c $gtf |")
or die "Could not open gunzip stream GTF file $gtf\n";
} else {
open( $fh, $gtf)
or die "Could not open GTF file $gtf\n";
}

while(<$fh>) {
/^\#/ and next;

my @l = split(/\t/, $_);
next if $l[2] ne "gene";

my ($wbgid) = $l[8] =~ /gene_id\s+\"(\S+)\"/;

$locs{$wbgid} = {
assembly => $assembly_name,
chromosome => $l[0],
startPosition => $l[3] + 0,
endPosition => $l[4] + 0,
strand => $l[6],
};
}

return \%locs;
}

1;

0 comments on commit addf3b9

Please sign in to comment.