diff --git a/modules/Bio/EnsEMBL/Variation/DBSQL/PhenotypeFeatureAdaptor.pm b/modules/Bio/EnsEMBL/Variation/DBSQL/PhenotypeFeatureAdaptor.pm index 0e9a0a0aa..872596b41 100644 --- a/modules/Bio/EnsEMBL/Variation/DBSQL/PhenotypeFeatureAdaptor.pm +++ b/modules/Bio/EnsEMBL/Variation/DBSQL/PhenotypeFeatureAdaptor.pm @@ -117,6 +117,41 @@ sub _is_significant_constraint { } +=head2 _filter_dna_type + + Arg [1] : string $constraint + Arg [1] : string $dna_type + Example : $self->_filter_dna_type($constraint, $dna_type) + Description: Internal method to add a constraint on the column "DNA_type". + Phenotype features from ClinVar have DNA_type 'Germline' or 'Somatic'. + For non-human or other human sources DNA_type is NULL. + This method add a constraint to return a specific type of DNA and/or null. + Returntype : string + Exceptions : none + Caller : internal + Status : stable + +=cut + +sub _filter_dna_type { + my $self = shift; + my $constraint = shift; + my $dna_type = shift; + + my $dna_constraint; + + if($dna_type eq "Germline") { + $dna_constraint = qq{ (pf.DNA_type='$dna_type' OR pf.DNA_type is NULL) }; + } + else { + $dna_constraint = qq{ pf.DNA_type='$dna_type' }; + } + + $constraint .= (defined($constraint)) ? " AND$dna_constraint" : $dna_constraint; + + return $constraint; +} + =head2 _is_class_constraint Arg [1] : string $constraint @@ -153,7 +188,8 @@ sub _fetch_all_by_object { my $self = shift; my $object = shift; my $type = shift; - + my $dna_type = shift; # dna_type to include + $type ||= (split '::', ref($object))[-1]; throw("$type is not a valid object type, valid types are: ".(join ", ", sort keys %TYPES)) unless defined $type and defined($TYPES{$type}); @@ -163,7 +199,12 @@ sub _fetch_all_by_object { $constraint = $self->_is_significant_constraint($constraint); # Add the constraint for phenotype class $constraint = $self->_is_class_constraint($constraint); - + + # Filter phenotype features by their DNA_type (when searching for germline also includes null DNA_type) + if(defined $dna_type) { + $constraint = $self->_filter_dna_type($constraint, $dna_type); + } + return $self->generic_fetch($constraint); } @@ -400,7 +441,7 @@ sub fetch_all_by_Slice_with_ontology_accession { Arg [1] : Bio::EnsEMBL::Variation::Variation $var Example : my @pfs = @{$pfa->fetch_all_by_Variation($var)}; - Description: Retrieves all PhenotypeFeatures for a given variation. + Description: Retrieves all PhenotypeFeatures with 'Germline' DNA type for a given variation. Returntype : reference to list Bio::EnsEMBL::Variation::PhenotypeFeature Exceptions : throw on bad argument Caller : general @@ -416,7 +457,32 @@ sub fetch_all_by_Variation { throw('Bio::EnsEMBL::Variation::Variation arg expected'); } - return $self->_fetch_all_by_object($var, 'Variation'); + # Only include phenotype features with DNA_type 'Germline' or null + return $self->_fetch_all_by_object($var, 'Variation', 'Germline'); +} + +=head2 fetch_all_somatic_by_Variation + + Arg [1] : Bio::EnsEMBL::Variation::Variation $var + Example : my @pfs = @{$pfa->fetch_all_somatic_by_Variation($var)}; + Description: Retrieves all PhenotypeFeatures with 'Somatic' DNA type for a given variation. + Returntype : reference to list Bio::EnsEMBL::Variation::PhenotypeFeature + Exceptions : throw on bad argument + Caller : general + Status : stable + +=cut + +sub fetch_all_somatic_by_Variation { + my $self = shift; + my $var = shift; + + if(!ref($var) || !$var->isa('Bio::EnsEMBL::Variation::Variation')) { + throw('Bio::EnsEMBL::Variation::Variation arg expected'); + } + + # Only include phenotype features with DNA_type 'Somatic' + return $self->_fetch_all_by_object($var, 'Variation', 'Somatic'); } @@ -1183,7 +1249,8 @@ sub get_clinsig_alleles_by_location { AND pf.seq_region_id = ? AND pf.seq_region_start >= ? AND pf.seq_region_end <= ? - AND pf.source_id = ? + AND pf.source_id = ? + AND (DNA_type = 'Germline' OR DNA_type is NULL) AND EXISTS(select value from phenotype_feature_attrib where phenotype_feature_id = pf.phenotype_feature_id && attrib_type_id = 483) GROUP BY pf.phenotype_feature_id @@ -1215,6 +1282,76 @@ sub get_clinsig_alleles_by_location { return $hash; } +sub get_somatic_clin_impact_by_location { + my $self = shift; + my $seq_region_id = shift; + my $seq_region_start = shift; + my $seq_region_end = shift; + my $source_id = shift; + throw("Cannot fetch attributes without seq region information") unless defined($seq_region_id) && defined($seq_region_start) && defined($seq_region_end); + + my $extra_sql = $self->_is_significant_constraint(); + # Add the constraint for phenotype class + $extra_sql = $self->_is_class_constraint($extra_sql); + + my $sth = $self->dbc->prepare(qq{ + SELECT + CONCAT(pf.seq_region_id, ':', pf.seq_region_start, '-', pf.seq_region_end), + CONCAT_WS('; ', + CONCAT('id=', pf.object_id), CONCAT('pf_id=', pf.phenotype_feature_id), + GROUP_CONCAT(IF(at.code in ('somatic_clin_sig', 'oncogenic_clin_sig'), at.code, NULL), "=", concat('', pfa.value, '') SEPARATOR '; '), + CONCAT('phenotype=', p.description) + ) AS attribute + + FROM + phenotype p, + phenotype_feature pf + + LEFT JOIN phenotype_feature_attrib pfa + ON pf.phenotype_feature_id = pfa.phenotype_feature_id + LEFT JOIN attrib_type `at` + ON pfa.attrib_type_id = at.attrib_type_id + + WHERE pf.phenotype_id = p.phenotype_id + AND pf.seq_region_id = ? + AND pf.seq_region_start >= ? + AND pf.seq_region_end <= ? + AND pf.source_id = ? + AND pf.DNA_type = 'Somatic' + + GROUP BY pf.phenotype_feature_id + ORDER BY pf.seq_region_id, pf.seq_region_start, pf.seq_region_end + }); + + $sth->bind_param(1, $seq_region_id, SQL_VARCHAR); + $sth->bind_param(2, $seq_region_start, SQL_VARCHAR); + $sth->bind_param(3, $seq_region_end, SQL_VARCHAR); + $sth->bind_param(4, $source_id, SQL_VARCHAR); + $sth->execute(); + + my $pf_id; + my $output_string; + my $hash; + $sth->bind_columns(\$pf_id, \$output_string); + + while ($sth->fetch){ + $hash->{$pf_id} = [] if !defined($hash->{$pf_id}); + my $internal_hash; + + # Example: $output_string => "id=rs1555760738; pf_id=266451087; somatic_clin_sig=Tier III - Unknown" + foreach my $id (split/\;/,$output_string){ + my ($key, $value) = split /\=/, $id; + $key =~ s/^\s+|\s+$//g; + + $internal_hash->{$key} = $value; + } + push(@{$hash->{$pf_id}}, $internal_hash); + } + + $sth->finish(); + + return $hash; +} # stub method used by web @@ -1530,6 +1667,7 @@ sub _obj_from_row { '_source_id' => $row->{source_id}, '_source_name' => $row->{name}, 'is_significant' => $row->{is_significant}, + 'dna_type' => $row->{dna_type}, } ); @@ -1591,8 +1729,9 @@ sub store{ seq_region_id, seq_region_start, seq_region_end, - seq_region_strand - ) VALUES (?,?,?,?,?,?,?,?,?,?) + seq_region_strand, + dna_type + ) VALUES (?,?,?,?,?,?,?,?,?,?,?) }); $sth->execute( @@ -1605,7 +1744,8 @@ sub store{ defined($pf->{slice}) ? $pf->slice()->get_seq_region_id() : undef, defined($pf->{start}) ? $pf->{start} :undef, defined($pf->{end}) ? $pf->{end} : undef, - defined($pf->{strand})? $pf->{strand} : undef + defined($pf->{strand})? $pf->{strand} : undef, + defined($pf->{dna_type})? $pf->{dna_type} : undef ); $sth->finish; diff --git a/modules/Bio/EnsEMBL/Variation/PhenotypeFeature.pm b/modules/Bio/EnsEMBL/Variation/PhenotypeFeature.pm index 3b7ebe6dc..a4412afe6 100644 --- a/modules/Bio/EnsEMBL/Variation/PhenotypeFeature.pm +++ b/modules/Bio/EnsEMBL/Variation/PhenotypeFeature.pm @@ -82,7 +82,7 @@ use Exporter; use vars qw(@EXPORT_OK @ISA); our @ISA = ('Bio::EnsEMBL::Feature', 'Exporter'); -@EXPORT_OK = qw(%TYPES); +@EXPORT_OK = qw(%TYPES %DNA_TYPES); # define valid object types # this must correspond to the types defined in the type column of @@ -96,6 +96,11 @@ our %TYPES = ( 'RegulatoryFeature' => 1, ); +our %DNA_TYPES = ( + 'Germline' => 1, + 'Somatic' => 1 +); + =head2 new Arg [-dbID] : @@ -157,12 +162,12 @@ sub new { my $class = ref($caller) || $caller; my $self = $class->SUPER::new(@_); - my ($dbID,$adaptor,$phenotype_id,$phenotype,$type,$object,$object_id,$source_name,$source_id,$source,$study,$study_id,$is_significant,$attribs, $ontology_accessions) = + my ($dbID,$adaptor,$phenotype_id,$phenotype,$type,$object,$object_id,$source_name,$source_id,$source,$study,$study_id,$is_significant,$dna_type,$attribs, $ontology_accessions) = rearrange([qw( dbID ADAPTOR _PHENOTYPE_ID PHENOTYPE TYPE OBJECT _OBJECT_ID SOURCE_NAME _SOURCE_ID SOURCE STUDY _STUDY_ID - IS_SIGNIFICANT + IS_SIGNIFICANT DNA_TYPE ATTRIBS ONTOLOGY_ACCESSIONS )], @_); @@ -207,6 +212,7 @@ sub new { $self->{type} = $type; $self->{is_significant} = $is_significant; + $self->{dna_type} = $dna_type || undef; $self->{attribs} = $attribs || {}; $self->{ontology_accessions} = $ontology_accessions || undef; @@ -510,6 +516,30 @@ sub type { return $self->{'type'}; } +=head2 dna_type + + Arg [1] : string $type (optional) + The new value to set the clinical significance type attribute to + Example : $type = $obj->dna_type() + Description: Getter/Setter for the object dna_type of the PhenotypeFeature. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub dna_type { + my $self = shift; + my $type = shift; + + if(defined($type)) { + throw("$type is not a valid object type, valid types are: ".(join ", ", sort %DNA_TYPES)) unless defined($DNA_TYPES{$type}); + $self->{'dna_type'} = $type; + } + + return $self->{'dna_type'}; +} =head2 is_significant @@ -938,6 +968,48 @@ sub clinical_significance { return defined($self->get_all_attributes->{'clinvar_clin_sig'}) ? $self->get_all_attributes->{'clinvar_clin_sig'} : undef; } +=head2 somatic_classification + + Example : $somatic_classification = $obj->somatic_classification() + Description: Getter for the somatic_clin_sig attribute. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub somatic_classification { + my $self = shift; + + my $classification = undef; + + if(defined $self->get_all_attributes->{'somatic_clin_sig'}) { + $classification = $self->get_all_attributes->{'somatic_clin_sig'}; + } + + return $classification; +} + +=head2 oncogenicity_classification + + Example : $oncogenicity = $obj->oncogenicity_classification() + Description: Getter/setter for the oncogenic_clin_sig attribute. + Returntype : string + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub oncogenicity_classification { + my $self = shift; + my $new = shift; + + $self->_set_attribute('oncogenic_clin_sig', $new) if defined($new); + + return defined($self->get_all_attributes->{'oncogenic_clin_sig'}) ? $self->get_all_attributes->{'oncogenic_clin_sig'} : undef; +} =head2 external_id diff --git a/modules/Bio/EnsEMBL/Variation/Utils/Config.pm b/modules/Bio/EnsEMBL/Variation/Utils/Config.pm index df5092bbc..6c4f78f29 100644 --- a/modules/Bio/EnsEMBL/Variation/Utils/Config.pm +++ b/modules/Bio/EnsEMBL/Variation/Utils/Config.pm @@ -87,6 +87,19 @@ our @clinvar_clinical_significance_types = ( 'established risk allele' ); +our @clinvar_clinical_significance_somatic = ( + 'Tier I - Strong', + 'Tier II - Potential', + 'Tier III - Uncertain significance', + 'Tier III - Unknown', + 'Tier IV - Benign/likely benign', + 'Oncogenic', + 'Likely oncogenic', + 'Uncertain significance', + 'Likely benign ', + 'Benign' +); + our @dgva_clinical_significance_types = ( 'Not tested', 'Benign', @@ -1243,6 +1256,16 @@ our @ATTRIB_TYPES = ( name => 'ClinVar clinical significance', description => 'The clinical significance of a variant as reported by ClinVar', }, + { + code => 'somatic_clin_sig', + name => 'ClinVar somatic significance', + description => 'The somatic classification of a variant as reported by ClinVar', + }, + { + code => 'oncogenic_clin_sig', + name => 'ClinVar somatic classification of oncogenicity', + description => 'The somatic classification of oncogenicity of a variant as reported by ClinVar', + }, { code => 'prot_func_analysis', name => 'Protein function analysis ', @@ -1363,6 +1386,7 @@ our %ATTRIBS = ( 'dbsnp_clin_sig' => \@dbsnp_clinical_significance_types, 'dgva_clin_sig' => \@dgva_clinical_significance_types, 'clinvar_clin_sig' => \@clinvar_clinical_significance_types, + 'clinvar_somatic' => \@clinvar_clinical_significance_somatic, 'polyphen_prediction' => ['probably damaging', 'possibly damaging', 'benign', 'unknown'], 'sift_prediction' => ['tolerated', 'deleterious', 'tolerated - low confidence', 'deleterious - low confidence'], 'prot_func_analysis' => [qw(sift polyphen_humvar polyphen_humdiv)], diff --git a/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm b/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm index a91f2687c..4bedb3852 100644 --- a/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm +++ b/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm @@ -191,6 +191,7 @@ our @EXTRA_HEADERS = ( { flag => 'flag_pick_allele',cols => ['PICK'] }, { flag => 'variant_class', cols => ['VARIANT_CLASS']}, { flag => 'minimal', cols => ['MINIMISED']}, + { flag => 'clinvar_somatic_classification', cols => ['CLINVAR_SOMATIC_CLASSIFICATION']}, # gene-related { flag => 'symbol', cols => ['SYMBOL','SYMBOL_SOURCE','HGNC_ID'] }, @@ -309,6 +310,7 @@ our %COL_DESCS = ( 'GENE_PHENO' => 'Indicates if gene is associated with a phenotype, disease or trait', 'MINIMISED' => 'Alleles in this variant have been converted to minimal representation before consequence calculation', 'HGVS_OFFSET' => 'Indicates by how many bases the HGVS notations for this variant have been shifted', + 'CLINVAR_SOMATIC_CLASSIFICATION' => 'ClinVar somatic classification of the dbSNP variant', ); our @REG_FEAT_TYPES = qw( diff --git a/modules/Bio/EnsEMBL/Variation/Variation.pm b/modules/Bio/EnsEMBL/Variation/Variation.pm index ab6dbb5e3..fc0461bac 100755 --- a/modules/Bio/EnsEMBL/Variation/Variation.pm +++ b/modules/Bio/EnsEMBL/Variation/Variation.pm @@ -1276,6 +1276,17 @@ sub get_all_PhenotypeFeatures { } +sub get_all_PhenotypeFeatures_Somatic { + my $self = shift; + + #Assert the adaptor reference + assert_ref($self->adaptor(),'Bio::EnsEMBL::Variation::DBSQL::BaseAdaptor'); + + # Get the annotations from the database + return $self->adaptor->db->get_PhenotypeFeatureAdaptor()->fetch_all_somatic_by_Variation($self); + +} + =head2 display_consequence Arg [1] : (optional) String $term_type diff --git a/modules/t/phenotypeFeature.t b/modules/t/phenotypeFeature.t index 10c4df5cf..465e25636 100644 --- a/modules/t/phenotypeFeature.t +++ b/modules/t/phenotypeFeature.t @@ -230,8 +230,12 @@ ok($pf2->source()->name eq $source_name, "pf 'source' (using argument)"); # Test associated studies my $pfs3 = $pf_adaptor->fetch_all_by_object_id('rs2299222'); -my $asso_study = $pfs3->[0]->associated_studies->[0]; -ok($asso_study->name eq 'asso_study', 'associated_studies'); +for my $pheno_feat (@$pfs3) { + if(!$pheno_feat->somatic_classification) { + my $asso_study = $pheno_feat->associated_studies->[0]; + ok($asso_study->name eq 'asso_study', 'associated_studies'); + } +} # Test project_fullname my $project = 'Full name of the project'; diff --git a/modules/t/phenotypeFeatureAdaptor.t b/modules/t/phenotypeFeatureAdaptor.t index d9b762b6c..8ab3a680b 100644 --- a/modules/t/phenotypeFeatureAdaptor.t +++ b/modules/t/phenotypeFeatureAdaptor.t @@ -39,9 +39,9 @@ my $sl = $sla->fetch_by_region('chromosome', 7, 86442403, 86442405); # fetch_all_by_object_id my $pfs = $pfa->fetch_all_by_object_id('rs2299222'); -ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 1 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_object_id"); +ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 2 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_object_id"); $pfs = $pfa->fetch_all_by_object_id('rs2299222', 'Variation'); -ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 1 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_object_id + type"); +ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 2 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_object_id + type"); throws_ok { $pfa->fetch_all_by_object_id('rs2299222', 'Variant'); } qr/is not a valid object type, valid types are/, ' > Throw on wrong object type'; # fetch_all_by_object_id_accession_type @@ -57,7 +57,7 @@ throws_ok { $pfa->fetch_all_by_object_id('rs2299222', 'Variant'); } qr/is not a # fetch_all_by_Slice_type $pfs = $pfa->fetch_all_by_Slice_type($sl, 'Variation'); -ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 1 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_Slice_type"); +ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 2 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_Slice_type"); # fetch_all_by_Variation my $va = $vdba->get_VariationAdaptor(); @@ -66,10 +66,13 @@ $pfs = $pfa->fetch_all_by_Variation($v); ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 1 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_Variation"); throws_ok { $pfa->fetch_all_by_Variation('Variant'); } qr/Variation arg expected/, ' > Throw on wrong argument'; +# fetch_all_somatic_by_Variation +my $somatic_pfs = $pfa->fetch_all_somatic_by_Variation($v); +ok(ref($somatic_pfs) eq 'ARRAY' && scalar @$somatic_pfs == 1 && $somatic_pfs->[0]->object_id eq 'rs2299222', "fetch_all_somatic_by_Variation"); # fetch_all_by_Variation_list $pfs = $pfa->fetch_all_by_Variation_list([$v]); -ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 1 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_Variation_list"); +ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 2 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_Variation_list"); throws_ok { $pfa->fetch_all_by_Variation_list(['Variant']); } qr/Variation arg expected/, ' > Throw on wrong argument'; throws_ok { $pfa->fetch_all_by_Variation_list([Bio::EnsEMBL::Variation::Variation->new()]); } qr/Variation arg must have defined name/, ' > Throw on wrong argument'; @@ -89,7 +92,7 @@ throws_ok { $pfa->fetch_all_by_Gene('gene'); } qr/Gene arg expected/, ' > Throw # fetch_all_by_VariationFeature_list $pfs = $pfa->fetch_all_by_VariationFeature_list($v->get_all_VariationFeatures); -ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 1 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_VariationFeature_list"); +ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 2 && $pfs->[0]->object_id eq 'rs2299222', "fetch_all_by_VariationFeature_list"); throws_ok { $pfa->fetch_all_by_VariationFeature_list(['VariationFeature']); } qr/VariationFeature arg expected/, ' > Throw on wrong argument'; throws_ok { $pfa->fetch_all_by_VariationFeature_list([Bio::EnsEMBL::Variation::VariationFeature->new()]); } qr/VariationFeatures in list must have defined names/, ' > Throw on wrong argument'; @@ -147,12 +150,14 @@ ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 1 && $pfs->[0]->object_id eq 'rs2299 $pfa->use_phenotype_classes('trait,non_specified,tumour'); $sl_oa = $sla->fetch_by_region('chromosome', 18, 721588, 86442450); $pfs = $pfa->fetch_all_by_Slice_with_ontology_accession($sl_oa); + ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 3 && (grep {$_->object_id eq 'rs2299298'} @$pfs) && (grep {$_->phenotype_class_id == 663 } @$pfs), "fetch_all_by_Slice_accession_type - phenotype class all "); $pfa->clear_cache(); $pfa->use_phenotype_classes('trait'); $pfs = $pfa->fetch_all_by_Slice_with_ontology_accession($sl_oa); + ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 2 && (grep {$_->object_id eq 'ENSG00000176105'} @$pfs) && (grep {$_->phenotype_class_id == 665} @$pfs), "fetch_all_by_Slice_accession_type - phenotype class - trait"); @@ -256,7 +261,7 @@ ok($count && $count->{'Variation'} == 1 && $count->{'StructuralVariation'} == 2 # fetch_all $pfs = $pfa->fetch_all(); -ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 7 && (grep {$_->object_id eq 'rs2299222'} @$pfs), "fetch_all"); +ok(ref($pfs) eq 'ARRAY' && scalar @$pfs == 8 && (grep {$_->object_id eq 'rs2299222'} @$pfs), "fetch_all"); # store my $pf = $pfs->[0]; diff --git a/modules/t/test-genome-DBs/homo_sapiens/variation/attrib_type.txt b/modules/t/test-genome-DBs/homo_sapiens/variation/attrib_type.txt index a30d63ac0..033868908 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/variation/attrib_type.txt +++ b/modules/t/test-genome-DBs/homo_sapiens/variation/attrib_type.txt @@ -41,3 +41,4 @@ 534 dbnsfp_ma_pred dbNSFP mutation assessor prediction dbNSFP mutation assessor prediction 542 citation_source Citation source Variant citation data source 544 phenotype_type Phenotype type Type of the phenotype information +577 somatic_clin_sig ClinVar somatic classification The somatic classification of a variant as reported by ClinVar diff --git a/modules/t/test-genome-DBs/homo_sapiens/variation/meta.txt b/modules/t/test-genome-DBs/homo_sapiens/variation/meta.txt index aa4bf17b7..bfb6ef9b6 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/variation/meta.txt +++ b/modules/t/test-genome-DBs/homo_sapiens/variation/meta.txt @@ -1,5 +1,5 @@ 1 \N schema_type variation -2 \N schema_version 114 +2 \N schema_version 115 3 \N patch patch_73_74_a.sql|schema version 4 \N patch patch_73_74_b.sql|Add doi and UCSC id to publication table 5 \N patch patch_73_74_c.sql|Add clinical_significance to variation_feature table @@ -140,3 +140,5 @@ 147 \N patch patch_112_113_a.sql|schema version 148 \N patch patch_112_113_b.sql|Update meta_key length 149 \N patch patch_113_114_a.sql|schema version +150 \N patch patch_114_115_a.sql|schema version +151 \N patch patch_114_115_b.sql|Add column DNA_type to phenotype_feature diff --git a/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype.txt b/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype.txt index e58d79332..0c7c4a5cc 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype.txt +++ b/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype.txt @@ -2,3 +2,4 @@ 2 \N \N COSMIC:tumour_site:skin 664 3 \N \N BRUGADA SYNDROME 665 4 \N \N ClinVar: phenotype not specified 663 +5 \N \N Acute myeloid leukemia 664 \ No newline at end of file diff --git a/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature.txt b/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature.txt index e3c71fd7f..2259ab408 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature.txt +++ b/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature.txt @@ -1,7 +1,8 @@ -1 1 1 4237 Variation rs2299222 1 27506 86442404 86442404 1 -2 1 1 4237 Gene ENSG00000176105 1 27512 721588 812327 -1 -3 1 11 4246 StructuralVariation esv2751608 1 27512 4170411 4288923 1 -4 2 26 \N Variation COSM946275 1 27513 25744274 25744274 1 -5 1 32 4247 StructuralVariation esv93078 1 27523 7823440 8819373 1 -6 3 32 \N Variation rs2299299 1 27513 86442404 86442404 1 -7 4 32 \N Variation rs2299298 1 27512 86442404 86442404 1 +1 1 1 4237 Variation rs2299222 1 27506 86442404 86442404 1 \N +2 1 1 4237 Gene ENSG00000176105 1 27512 721588 812327 -1 \N +3 1 11 4246 StructuralVariation esv2751608 1 27512 4170411 4288923 1 \N +4 2 26 \N Variation COSM946275 1 27513 25744274 25744274 1 \N +5 1 32 4247 StructuralVariation esv93078 1 27523 7823440 8819373 1 \N +6 3 32 \N Variation rs2299299 1 27513 86442404 86442404 1 \N +7 4 32 \N Variation rs2299298 1 27512 86442404 86442404 1 Germline +8 5 32 \N Variation rs2299222 1 27506 86442404 86442404 1 Somatic diff --git a/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature_attrib.txt b/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature_attrib.txt index a5a93bd8f..d1a733753 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature_attrib.txt +++ b/modules/t/test-genome-DBs/homo_sapiens/variation/phenotype_feature_attrib.txt @@ -2,3 +2,4 @@ 6 483 YES1 1 13 YES1 6 13 NOS1 +8 577 Tier I - Strong:diagnostic:supports diagnosis \ No newline at end of file diff --git a/modules/t/test-genome-DBs/homo_sapiens/variation/table.sql b/modules/t/test-genome-DBs/homo_sapiens/variation/table.sql index 3098ecb83..3884e3eeb 100644 --- a/modules/t/test-genome-DBs/homo_sapiens/variation/table.sql +++ b/modules/t/test-genome-DBs/homo_sapiens/variation/table.sql @@ -186,7 +186,7 @@ CREATE TABLE `meta` ( PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`), KEY `species_value_idx` (`species_id`,`meta_value`) -) ENGINE=MyISAM AUTO_INCREMENT=150 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=152 DEFAULT CHARSET=latin1; CREATE TABLE `meta_coord` ( `table_name` varchar(40) NOT NULL, @@ -239,12 +239,14 @@ CREATE TABLE `phenotype_feature` ( `seq_region_start` int(11) unsigned DEFAULT NULL, `seq_region_end` int(11) unsigned DEFAULT NULL, `seq_region_strand` tinyint(4) DEFAULT NULL, + `DNA_type` enum('Germline','Somatic') DEFAULT NULL, PRIMARY KEY (`phenotype_feature_id`), KEY `phenotype_idx` (`phenotype_id`), KEY `object_idx` (`object_id`,`type`), KEY `type_idx` (`type`), KEY `pos_idx` (`seq_region_id`,`seq_region_start`,`seq_region_end`), - KEY `source_idx` (`source_id`) + KEY `source_idx` (`source_id`), + KEY `dna_type_idx` (`DNA_type`) ) ENGINE=MyISAM AUTO_INCREMENT=8 DEFAULT CHARSET=latin1; CREATE TABLE `phenotype_feature_attrib` ( diff --git a/scripts/import/clinvar_post_process.pl b/scripts/import/clinvar_post_process.pl index fa71e060c..207086f39 100644 --- a/scripts/import/clinvar_post_process.pl +++ b/scripts/import/clinvar_post_process.pl @@ -195,11 +195,13 @@ sub update_variation{ print "Starting update_variation...\n" if $DEBUG == 1; + # Fetch 'clinvar_clin_sig' which is the germline classification my $clinvar_ext_sth = $dbh->prepare(qq[ select variation.variation_id, phenotype_feature_attrib.value from variation, phenotype_feature, phenotype_feature_attrib, source, attrib_type where source.name = ? and phenotype_feature.source_id = source.source_id and phenotype_feature.phenotype_feature_id = phenotype_feature_attrib.phenotype_feature_id + and phenotype_feature.dna_type = 'Germline' and phenotype_feature_attrib.attrib_type_id = attrib_type.attrib_type_id and attrib_type.code = 'clinvar_clin_sig' and variation.name = phenotype_feature.object_id diff --git a/scripts/import/import_clinvar_xml b/scripts/import/import_clinvar_xml index ddb379f07..4842742b8 100644 --- a/scripts/import/import_clinvar_xml +++ b/scripts/import/import_clinvar_xml @@ -53,6 +53,8 @@ use Date::Manip::Date; use XML::LibXML::Reader; use XML::Hash::XS qw(); use Text::ParseWords; +use Scalar::Util 'blessed'; +use List::MoreUtils qw(uniq); use Bio::EnsEMBL::Variation::Utils::Sequence qw( get_hgvs_alleles SO_variation_class); use Bio::EnsEMBL::Variation::Utils::SpecialChar qw(replace_char decode_text); @@ -155,9 +157,13 @@ my $haplotype_type = "Haplotype"; ## the phenotype list includes a clean version of the descriptions my %phenotype_cache = %{get_phenotype_cache()}; -## accepted clinsig values +## accepted clinsig values (germline) my $def_clinsig = $ATTRIBS{clinvar_clin_sig}; -my %known_clinsig = map { $_ => 1 } @{$def_clinsig} ; +my %known_clinsig = map { $_ => 1 } @{$def_clinsig}; + +## accepted clinsig values (somatic + somatic oncogenicity) +my $def_clinsig_somatic = $ATTRIBS{clinvar_somatic}; +my %known_clinsig_somatic = map { $_ => 1 } @{$def_clinsig_somatic}; ## handle incomplete runs - takes list of ClinVar RC accessions my %done; @@ -229,27 +235,36 @@ sub process_clinvar_set { return undef; } - ## clinical significance - $record{Desc} = get_clinsig ($set->{ReferenceClinVarAssertion}->{Classifications}->{GermlineClassification}); - # Skip invalid clinical significance - if(!defined $record{Desc} || $record{Desc} =~ "no classifications from unflagged records" || $record{Desc} eq "") { + ## clinical significance, confidence of assertation and date of assertion + $record{Classifications} = get_clinsig($set->{ReferenceClinVarAssertion}->{Classifications}); + + # Skip invalid clinical significance (germline) + if (defined $record{Classifications}->{germline} && + ($record{Classifications}->{germline}->{clin_sign} =~ "no classifications from unflagged records" || + $record{Classifications}->{germline}->{clin_sign} eq "")) { return undef; } - ## confidence of assertation - $record{Status} = $set->{ReferenceClinVarAssertion}->{Classifications}->{GermlineClassification}->{ReviewStatus}; - - ## date of assertion - $record{DateLastEvaluated} = $set->{ReferenceClinVarAssertion}->{Classifications}->{GermlineClassification}->{Description}->{DateLastEvaluated}; - ## check for somatic, Autosomal dominant, etc. $record{inheritance_type} = get_inheritance($set->{ReferenceClinVarAssertion}->{AttributeSet}); + # Some entries do not have mode of inheritance + # Try to get the somatic status from the sample origin + if(!$record{inheritance_type}) { + my $sample_origin = get_sample_origin($set->{ReferenceClinVarAssertion}->{ObservedIn}); + if(defined $sample_origin && $sample_origin eq "somatic") { + $record{inheritance_type} = "Somatic mutation"; + } + } + ## trait info (using Acc for error logging) ($record{disease}, $record{ontology_accession}) = get_disease($set->{ReferenceClinVarAssertion}->{TraitSet}->{Trait}, $record{Acc}); ## extract arrayref of PMIDs for citations + # The germline classification has the citations under ObservedData + # The somatic classification has the citations under ClinVarAssertion - Classification - "SomaticClinicalImpact" $record{citations} = get_citations($set->{ReferenceClinVarAssertion}->{ObservedIn}, "ObservedData"); + $record{citations_somatic} = get_citations($set->{ClinVarAssertion}, "Classification"); ## Variant name, HGVS, OMIM id and location if(defined $set->{ReferenceClinVarAssertion}->{GenotypeSet} && $DEBUG == 1) { @@ -271,7 +286,7 @@ sub process_clinvar_set { $record{clinvar_variant_id} = $set->{ReferenceClinVarAssertion}->{MeasureSet}->{Acc}; $record{feature_info} = get_feature($set->{ReferenceClinVarAssertion}->{MeasureSet}->{Measure}, $record{Acc},$set->{ReferenceClinVarAssertion}->{MeasureSet}->{Type}); - $record{submitters} = get_submitters($set->{ClinVarAssertion}, $record{feature_info}); + ($record{submitters}, $record{submitters_somatic}) = get_submitters($set->{ClinVarAssertion}, $record{feature_info}); if (defined $structvar ){ if( defined $record{feature_info}->{dbVar} && $record{feature_info}->{dbVar}->[0] =~/\d+/ ){ @@ -370,29 +385,127 @@ sub get_accession{ return $accession; } -## clean clinical significance -sub get_clinsig{ +## Fetch and clean clinical significance +## There are three types of classification: +## - GermlineClassification +## - SomaticClinicalImpact +## - OncogenicityClassification +## A single submission can have more than one type of classification. +sub get_clinsig { + my $classifications = shift; - my $ClinicalSignificance = shift; my $desc; + my $clinical_impact_assertion; + my $clinical_impact_clin_sig; + my $status; + my $date; + my $type; + my %result_by_type; # clinical significance by type + + # Check by type of classification + if (defined $classifications->{GermlineClassification}) { + $type = "germline"; + # Clinical significance - conflicting interpretations need an explanation + if ($classifications->{GermlineClassification}->{Description}->{content} =~/Conflicting interpretations of pathogenicity/) { + ## Description = 'conflicting data from submitters' - the values are in the explanation + defined $classifications->{GermlineClassification}->{Explanation} ? + $desc = "\L$classifications->{GermlineClassification}->{Explanation}->{content}" : + print "warning: conflicting ClinicalSignificance but no Explanation\n"; + $desc |=''; + $desc =~ s/\(\d+\)//g; ## remove bracketed counts + $desc =~ s/\;/\,/g; ## switch to comma delimited for set + } + else { + $desc = "\L$classifications->{GermlineClassification}->{Description}->{content}"; + } + + # Remove empty space from beginning and end of the description + $desc =~ s/^\s*(.*?)\s*$/$1/; + + my %germline_data = ( + "clin_sign" => $desc, + "status" => $classifications->{GermlineClassification}->{ReviewStatus}, + "date" => $classifications->{GermlineClassification}->{Description}->{DateLastEvaluated} + ); - if ($ClinicalSignificance->{Description}->{content} =~/Conflicting interpretations of pathogenicity/){ - ## Description = 'conflicting data from submitters' - the values are in the explanation - defined $ClinicalSignificance->{Explanation} ? - $desc = "\L$ClinicalSignificance->{Explanation}->{content}" : - print "warning: conflicting ClinicalSignificance but no Explanation\n"; - $desc |=''; - $desc =~ s/\(\d+\)//g; ## remove bracketed counts - $desc =~ s/\;/\,/g; ## switch to comma delimited for set + $result_by_type{$type} = \%germline_data; } - else{ - $desc = "\L$ClinicalSignificance->{Description}->{content}"; + + # Somatic classification + if (defined $classifications->{SomaticClinicalImpact}) { + my @somatic_desc; + my @dates; + my @clinical_assertions; + my @clinical_impact_cs; + $type = "somatic"; + # {SomaticClinicalImpact}->{Description} can be array or hash + if (ref($classifications->{SomaticClinicalImpact}->{Description}) eq "ARRAY") { + for my $description (@{$classifications->{SomaticClinicalImpact}->{Description}}) { + # clinical significance + my $tmp_desc = $description->{content}; + # clinical impact assertion type + if($description->{ClinicalImpactAssertionType}) { + $tmp_desc .= ":" . $description->{ClinicalImpactAssertionType}; + } + # clinical impact clinical significance + if($description->{ClinicalImpactClinicalSignificance}) { + $tmp_desc .= ":" . $description->{ClinicalImpactClinicalSignificance}; + } + push @somatic_desc, $tmp_desc; + + # date last evaluated + push @dates, $description->{DateLastEvaluated}; + } + $desc = join(",", @somatic_desc); + $date = join(",", @dates); + } + else { + $desc = $classifications->{SomaticClinicalImpact}->{Description}->{content}; + if($classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactAssertionType}) { + $desc .= ":" . $classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactAssertionType}; + } + if($classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactClinicalSignificance}) { + $desc .= ":" . $classifications->{SomaticClinicalImpact}->{Description}->{ClinicalImpactClinicalSignificance}; + } + $date = $classifications->{SomaticClinicalImpact}->{Description}->{DateLastEvaluated}; + } + + # Remove empty space from beginning and end of the description + $desc =~ s/^\s*(.*?)\s*$/$1/; + + my %somatic_data = ( + "clin_sign" => $desc, + "status" => $classifications->{SomaticClinicalImpact}->{ReviewStatus}, + "date" => $date + ); + + $result_by_type{$type} = \%somatic_data; + } + + # Oncogenic classification + if (defined $classifications->{OncogenicityClassification}) { + $type = "oncogenicity"; + $desc = $classifications->{OncogenicityClassification}->{Description}->{content}; + + # Remove empty space from beginning and end of the description + $desc =~ s/^\s*(.*?)\s*$/$1/; + + my %oncogenic_data = ( + "clin_sign" => $desc, + "status" => $classifications->{OncogenicityClassification}->{ReviewStatus}, + "date" => $classifications->{OncogenicityClassification}->{Description}->{DateLastEvaluated} + ); + + $result_by_type{$type} = \%oncogenic_data; } - return $desc; + + return \%result_by_type; } =head2 get_inheritance -- mode of inheritance attribute holds somatic status +Mode of inheritance attribute holds somatic status. +The 'ModeOfInheritance' depends on the submitter, +most of the submitters do not provide a value. =cut sub get_inheritance{ @@ -407,9 +520,30 @@ sub get_inheritance{ $moi = $attribute->{Attribute}->{content}; } + return $moi; } +=head2 get_sample_origin +Method to fetch the origin of the sample: +germline, somatic +=cut +sub get_sample_origin { + my $observed_in = shift; + + my $origin; + + my $attributes = to_array($observed_in); + + foreach my $attribute(@{$attributes}){ + next unless defined $attribute->{Sample} && defined $attribute->{Sample}->{Origin}; + + $origin = $attribute->{Sample}->{Origin}; + } + + return $origin; +} + =head2 get_feature - get variant/ structural variant info + location on required assembly @@ -521,7 +655,7 @@ sub get_feature{ $attrib->{Attribute}->{Type} =~ /HGVS,\s+genomic,\s+top\s+level/ ); # Make sure the chr matches the hgvs chromosome - if($feature{accession} =~ $attrib->{Attribute}->{Accession}) { + if(defined $feature{accession} && $feature{accession} =~ $attrib->{Attribute}->{Accession}) { $feature{hgvs_g} = $attrib->{Attribute}->{Change}; } elsif(defined $feature{ypar} && $feature{ypar}{accession} =~ $attrib->{Attribute}->{Accession}) { @@ -659,6 +793,10 @@ sub get_citations{ my $observed_data = to_array($observed_in->{$label}); foreach my $obs( @{$observed_data} ){ + if($label eq "Classification" && $obs->{GermlineClassification}) { + next; + } + if ($obs->{Citation}){ my $citations = to_array($obs->{Citation}); foreach my $cit(@{$citations}){ @@ -671,7 +809,9 @@ sub get_citations{ } } - return \@citations; + my @citations_uniq = uniq @citations; + + return \@citations_uniq; } =head2 get_submitters @@ -685,12 +825,26 @@ sub get_submitters{ my $mainXrefs = shift; my @submitters; + my @submitters_somatic; my $assertions = to_array($structure); foreach my $assert(@{$assertions}){ - push @submitters, $assert->{ClinVarSubmissionID}->{submitter}; + my $submitter_id = $assert->{ClinVarSubmissionID}->{submitter}; + my $submitter_type; + + if($assert->{Classification}->{GermlineClassification}) { + push @submitters, $submitter_id; + $submitter_type = "germline"; + } + else { + push @submitters_somatic, $submitter_id; + $submitter_type = "somatic"; + } + # if submitter OMIM then get OMIM allelic variant ID and corresponding rsID from reference record - next unless $assert->{ClinVarSubmissionID}->{submitter} eq $omimstr; + # TODO update to support all submitter types + next unless $submitter_id eq $omimstr && $submitter_type eq "germline"; + if (defined $assert->{ExternalID} && $assert->{ExternalID}->{DB} eq $omimstr) { my $omimAlleleID = $assert->{ExternalID}->{ID}; @@ -724,10 +878,10 @@ sub get_submitters{ push @{$mainXrefs->{'dbSNP'}}, @{$mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'}{'rs'}} if defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'} && defined $mainXrefs->{'measureXrefs'}{$tmpMeasure}{'dbSNP'}{'rs'}; } # make sure the list contains unique rsIDs to deal with haplotypes/records encountered with multiple sets using same rsID - @{$mainXrefs->{'dbSNP'}} = keys {map { $_ => 1 } @{$mainXrefs->{'dbSNP'}} } if defined $mainXrefs->{'dbSNP'}; + @{$mainXrefs->{'dbSNP'}} = keys %{ { map { $_ => 1 } @{$mainXrefs->{'dbSNP'}} } } if defined $mainXrefs->{'dbSNP'}; } - return \@submitters; + return \@submitters, \@submitters_somatic; } sub to_array{ @@ -763,7 +917,7 @@ sub import{ my $alt_allele; ## disease associated allele from HGVS my $feat; ## use stored *variation_features where possible - if(defined $record->{feature_info}->{dbSNP} && !defined $structvar){ + if(defined $record->{feature_info}->{dbSNP}){ foreach my $rs (@{ $record->{feature_info}->{dbSNP} }){ my $rsID = 'rs'.$rs; # for haplotypes (multiple rsIDs) each rsID has it's own hgvs_g string @@ -780,26 +934,18 @@ sub import{ } } - elsif(!defined $structvar && $try_match ){ + elsif($try_match ){ + print "try to match ", $record->{Acc}, "\n" if $DEBUG == 1; ($feature_object, $feat, $alt_allele) = get_variant($record, undef); if (! defined $feature_object){ print "no variation found for $record->{Acc}, $record->{clinvar_variant_id}\n" if $DEBUG == 1; return undef; } - add_clinvar_data($feature_object, $feat, $alt_allele, $record); } - elsif(defined $record->{feature_info}->{dbVar} && defined $structvar){ - ($feature_object, $feat) = get_structural_variant($record); - - if ( defined $feature_object){ - ## add phenotype_feature & attrib - import_phenotype_feature($record, $feature_object, 'StructuralVariation', $feat, $alt_allele); - } - } elsif ($DEBUG == 1){ - print "Can't import ". $record->{Acc} ." as no dbSNP or location or SV\n" if $DEBUG == 1; + elsif ($DEBUG == 1){ + print "Can't import ". $record->{Acc} ." as no dbSNP or location\n" if $DEBUG == 1; } - } sub add_clinvar_data{ @@ -820,6 +966,7 @@ sub add_clinvar_data{ add_synonyms($variation_object, $record->{feature_info}{$omimstr}, $omim_source) if defined $record->{feature_info}{$omimstr}; ## add phenotype_feature & attrib (there may not be an alt_allele) + import_phenotype_feature($record, $variation_object, 'Variation', $feat, $alt_allele); } @@ -844,16 +991,19 @@ sub import_phenotype_feature{ $record->{disease} = $default_pheno if $record->{disease} eq "not specified"; ## check for new clin_sig - my $found = check_known_clinsig($record->{Desc}); - if ( ! $found ){ - warn "clin_sig not known: >". $record->{Desc} ."< feature: ". $feature_object->name ."\n"; + my ($found, $term) = check_known_clinsig($record->{Classifications}); + + if (!$found && $term) { + warn "clin_sig not known: >". $term ."< feature: ". $feature_object->name ."\n"; } # Remove special characters from the phenotype description and submitter name $record->{disease} = decode_text( $record->{disease}); $record->{disease} = replace_char( $record->{disease}); + + # TODO: fix foreach my $sub (@{$record->{submitters}}){ - $sub = replace_char( $sub); + $sub = replace_char($sub); } ## look for existing or enter new phenotype object @@ -866,61 +1016,114 @@ sub import_phenotype_feature{ return; } - my %attribs; - $attribs{review_status} = $record->{Status}; - $attribs{external_id} = $record->{Acc}; - $attribs{clinvar_clin_sig} = $record->{Desc} if $record->{Desc} ne ''; #avoids empty entry if explanation is missing for conflicting evidence - $attribs{risk_allele} = $alt_allele if defined $alt_allele && $alt_allele ne "-" && $alt_allele ne ''; - $attribs{associated_gene} = $record->{feature_info}->{gene} if defined $record->{feature_info}->{gene}; - $attribs{MIM} = $record->{feature_info}->{MIM} if defined $record->{feature_info}->{MIM}; - $attribs{pubmed_id} = join(",", @{$record->{citations}}) if $record->{citations} && exists $record->{citations}->[0]; - if (defined $attribs{pubmed_id} && length($attribs{pubmed_id}) > 255) { - $attribs{pubmed_id} = substr($attribs{pubmed_id}, 0, 255); - $attribs{pubmed_id} = substr($attribs{pubmed_id}, 0,rindex($attribs{pubmed_id}, ",")); - } - $attribs{inheritance_type} = $record->{inheritance_type} if defined $record->{inheritance_type}; - $attribs{DateLastEvaluated} = $record->{DateLastEvaluated} if defined $record->{DateLastEvaluated}; - $attribs{variation_names} = join(",", @{$record->{feature_info}->{haplo}->{$feature_object->name}}) if defined $record->{feature_info}->{haplo} && defined $record->{feature_info}->{haplo}->{$feature_object->name}; - my %submitter_ids; - foreach my $sub (@{$record->{submitters}}){ - ##enter submitter unless already available - unless ($submitters->{$sub}){ - $submitters->{$sub} = add_submitter($sub); - print "Added submitter $sub id " . $submitters->{$sub} ." for sub\n" if $DEBUG == 1; + for my $classification (keys %{$record->{Classifications}}) { + my %attribs; + my %submitter_ids; + my $dna_type = undef; + + # Germline classification + if ($classification eq "germline") { + # avoids empty entry if explanation is missing for conflicting evidence + $attribs{clinvar_clin_sig} = $record->{Classifications}->{germline}->{clin_sign}; + $dna_type = "Germline"; + $attribs{review_status} = $record->{Classifications}->{germline}->{status}; + + if(defined $record->{Classifications}->{germline}->{date}){ + $attribs{DateLastEvaluated} = $record->{Classifications}->{germline}->{date}; + } + } + # Somatic classification + elsif ($classification eq "somatic") { + # avoids empty entry if explanation is missing for conflicting evidence + $attribs{somatic_clin_sig} = $record->{Classifications}->{somatic}->{clin_sign}; + $dna_type = "Somatic"; + $attribs{somatic_status} = $record->{Classifications}->{somatic}->{status}; + + if(defined $record->{Classifications}->{somatic}->{date}){ + $attribs{somatic_date} = $record->{Classifications}->{somatic}->{date}; + } + } + # Oncogenicity classification + # In the phenotype feature this classification has DNA_type "Somatic" + elsif ($classification eq "oncogenicity") { + # avoids empty entry if explanation is missing for conflicting evidence + $attribs{oncogenic_clin_sig} = $record->{Classifications}->{oncogenicity}->{clin_sign}; + $dna_type = "Somatic"; + $attribs{oncogenic_status} = $record->{Classifications}->{oncogenicity}->{status}; + + if($record->{Classifications}->{oncogenicity}->{date}) { + $attribs{oncogenic_date} = $record->{Classifications}->{oncogenicity}->{date}; + } + } + else { + print "Warning: no classification type\n"; } - $submitter_ids{ $submitters->{$sub} } = 1; - } - $attribs{submitter_id} = join(",", keys %submitter_ids) if (keys %submitter_ids) >0; - - #if OMIM is a submitter save the variation in the ph_omim variation_set_variation - if ($type ne 'StructuralVariation' && - exists $submitters->{$omimstr} && - exists $submitter_ids{$submitters->{$omimstr}} && - ! defined $omimVarSet{$feature_object->dbID()}){ - update_variation_set($feature_object->dbID()); - } - foreach my $feature (@{$feat}){ #feat is the VariationFeature which contains the genome coordonates for this variation - print "entering phenotype_feature type : $type & object id: ". $feature_object->dbID() . ", position " .$feature->seq_region_start() . "-". $feature->seq_region_end() . "\n" if $DEBUG == 1; - - ## add evidence and variation_set_id to variation_feature - update_variation_feature($feature, $attribs{clinvar_clin_sig}); - - my $phenofeat = Bio::EnsEMBL::Variation::PhenotypeFeature->new( - -slice => $feature->slice(), - -start => $feature->seq_region_start(), - -strand => $feature->seq_region_strand(), - -end => $feature->seq_region_end(), - -phenotype => $pheno, - -is_significant => 1, - -type => $type, - -object => $feature_object, - -source => $source, - -attribs => \%attribs - ); - $pheno_feat_adaptor->store($phenofeat); + $attribs{external_id} = $record->{Acc}; + $attribs{risk_allele} = $alt_allele if defined $alt_allele && $alt_allele ne "-" && $alt_allele ne ''; + $attribs{associated_gene} = $record->{feature_info}->{gene} if defined $record->{feature_info}->{gene}; + $attribs{MIM} = $record->{feature_info}->{MIM} if defined $record->{feature_info}->{MIM}; + + # Prepare the citations + # Germline and somatic have publications in different places + my $citations_data = $classification eq "germline" ? $record->{citations} : $record->{citations_somatic}; + $attribs{pubmed_id} = join(",", @{$citations_data}) if $citations_data && exists $citations_data->[0]; + + if (defined $attribs{pubmed_id} && length($attribs{pubmed_id}) > 255) { + $attribs{pubmed_id} = substr($attribs{pubmed_id}, 0, 255); + $attribs{pubmed_id} = substr($attribs{pubmed_id}, 0,rindex($attribs{pubmed_id}, ",")); + } + + $attribs{inheritance_type} = $record->{inheritance_type} if defined $record->{inheritance_type}; + $attribs{variation_names} = join(",", @{$record->{feature_info}->{haplo}->{$feature_object->name}}) if defined $record->{feature_info}->{haplo} && defined $record->{feature_info}->{haplo}->{$feature_object->name}; + + # Submitters + my $submitters_data = $classification eq "germline" ? $record->{submitters} : $record->{submitters_somatic}; + foreach my $sub (@{$submitters_data}){ + ##enter submitter unless already available + unless ($submitters->{$sub}){ + $submitters->{$sub} = add_submitter($sub); + print "Added submitter $sub id " . $submitters->{$sub} ." for sub\n" if $DEBUG == 1; + } + $submitter_ids{ $submitters->{$sub} } = 1; + } + $attribs{submitter_id} = join(",", keys %submitter_ids) if (keys %submitter_ids) >0; + + #if OMIM is a submitter save the variation in the ph_omim variation_set_variation + if (exists $submitters->{$omimstr} && + exists $submitter_ids{$submitters->{$omimstr}} && + ! defined $omimVarSet{$feature_object->dbID()}){ + update_variation_set($feature_object->dbID()); + } + + # feat is the VariationFeature which contains the genome coordinates for this variation + foreach my $feature (@{$feat}){ + print "entering phenotype_feature type : $type & object id: ". $feature_object->dbID() . ", position " .$feature->seq_region_start() . "-". $feature->seq_region_end() . "\n" if $DEBUG == 1; + + ## add evidence and variation_set_id to variation_feature + if ($classification eq "germline") { + update_variation_feature($feature, $attribs{clinvar_clin_sig}); + } + + my $phenofeat = Bio::EnsEMBL::Variation::PhenotypeFeature->new( + -slice => $feature->slice(), + -start => $feature->seq_region_start(), + -strand => $feature->seq_region_strand(), + -end => $feature->seq_region_end(), + -phenotype => $pheno, + -is_significant => 1, + -type => $type, + -object => $feature_object, + -source => $source, + -dna_type => $dna_type, + -attribs => \%attribs + ); + $pheno_feat_adaptor->store($phenofeat); + + } } + } @@ -1076,6 +1279,7 @@ sub add_submitter{ - look up or enter variation - returns variation & variation_feature objects & associated allele (from HGVS) + - TODO: return $ref_allele =cut sub get_variant{ @@ -1232,7 +1436,7 @@ sub get_variant_by_slice_allele{ return undef unless defined $cv_allele; - if(!defined $record->{feature_info}->{end} || $record->{feature_info}->{start} != $record->{feature_info}->{end}) { + if(!defined $record->{feature_info}->{start} || !defined $record->{feature_info}->{end} || ($record->{feature_info}->{start} != $record->{feature_info}->{end})) { print "too long for slice look up: ", $record->{feature_info}->{hgvs_g}, "\n" if $DEBUG == 1; return undef; } @@ -1734,26 +1938,52 @@ sub clean_old_data{ print "Old data deleted!\n"; } +# Check if the clinical significance is known for each type of classification sub check_known_clinsig { - my $current_clinsig = shift; - - my $tmpClinsig = $current_clinsig; - $tmpClinsig =~ s/\//\,/ ; # convert 'pathogenic/likely pathogenic' to 'pathogenic,likely pathogenic' - $tmpClinsig =~ s/\,\s+/\,/ ; # convert 'likely benign, other' to 'likely benign,other' - #replace 'conflicting interpretations of pathogenicity' with 'uncertain significance' - #for the purpose of variation, variation_feature clin_sig entry - $tmpClinsig =~ s/conflicting interpretations of pathogenicity/uncertain significance/g; - #similar replace of 'association not found' to 'other' - $tmpClinsig =~ s/association not found/other/g; - #remove comma from accepted values - $tmpClinsig =~ s/pathogenic,low penetrance/pathogenic low penetrance/; - my @tmpClinsigs = split(",", $tmpClinsig); - my $found = 1; - foreach my $clinsig (@tmpClinsigs) { - $found = 0 if ( ! defined $known_clinsig{ $clinsig } ) ; + my $current_classifications = shift; + + my $found; + my $term = undef; + + foreach my $type (keys %{$current_classifications}) { + my $tmpClinsig = $current_classifications->{$type}->{clin_sign}; + + # Specific checks for germline classification + if ($type eq "germline") { + $tmpClinsig =~ s/\//\,/ ; # convert 'pathogenic/likely pathogenic' to 'pathogenic,likely pathogenic' + $tmpClinsig =~ s/\,\s+/\,/ ; # convert 'likely benign, other' to 'likely benign,other' + #replace 'conflicting interpretations of pathogenicity' with 'uncertain significance' + #for the purpose of variation, variation_feature clin_sig entry + $tmpClinsig =~ s/conflicting interpretations of pathogenicity/uncertain significance/g; + #similar replace of 'association not found' to 'other' + $tmpClinsig =~ s/association not found/other/g; + #remove comma from accepted values + $tmpClinsig =~ s/pathogenic,low penetrance/pathogenic low penetrance/; + + my @tmpClinsigs = split(/,\s*|;\s*/, $tmpClinsig); + my $found = 1; + foreach my $clinsig (@tmpClinsigs) { + if (!defined $known_clinsig{$clinsig}) { + $found = 0; + $term = $clinsig; + } + } + } + # Somatic classifications and somatic oncogenicity classifications + elsif ($type eq "somatic" || $type eq "oncogenicity") { + my @tmpClinsigs = split(/,\s*|;\s*/, $tmpClinsig); + my $found = 1; + foreach my $clinsig (@tmpClinsigs) { + my @clinsig_clean = split(/:/, $clinsig); + if (!defined $known_clinsig_somatic{$clinsig_clean[0]}) { + $found = 0; + $term = $clinsig_clean[0]; + } + } + } } - return $found; + return $found, $term; } sub get_phenotype_cache { diff --git a/sql/attrib_entries.sql b/sql/attrib_entries.sql index cfcc96bd1..88ac448b8 100644 --- a/sql/attrib_entries.sql +++ b/sql/attrib_entries.sql @@ -331,8 +331,8 @@ INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (542, 'citation_source', 'Citation source', 'Variant citation data source'); INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (544, 'phenotype_type', 'Phenotype type', 'Type of the phenotype information'); INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (545,'external_plant_links','external_plant_links','Links to external plant related sources'); -INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (546,'coord_system_tag','coord_system_tag','A way to store information about seq_region\'s virtual coord_system names for single level assemblies. For example, we have an assembly with toplevel seq_regions (\"primary_assembly\") only, and we want to store the initial type of seq_regions like contig, scaffold, supercontig, chromosome, etc.'); -INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (547,'sequence_location','sequence_location','To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): \"apicoplast_chromosome\", \"chloroplast_chromosome\", \"chromoplast_chromosome\", \"cyanelle_chromosome\", \"leucoplast_chromosome\", \"macronuclear_chromosome\", \"micronuclear_chromosome\", \"mitochondrial_chromosome\", \"nuclear_chromosome\".'); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (546,'coord_system_tag','coord_system_tag',"A way to store information about seq_region\'s virtual coord_system names for single level assemblies. For example, we have an assembly with toplevel seq_regions (\"primary_assembly\") only, and we want to store the initial type of seq_regions like contig, scaffold, supercontig, chromosome, etc."); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (547,'sequence_location','sequence_location',"To identify sequence locations / cellular compartments that DNA sequence comes from.Values are supposed to be SO compliant (children of the plastid_sequence SO:0000740 and nuclear_sequence SO:0000738 ): \"apicoplast_chromosome\", \"chloroplast_chromosome\", \"chromoplast_chromosome\", \"cyanelle_chromosome\", \"leucoplast_chromosome\", \"macronuclear_chromosome\", \"micronuclear_chromosome\", \"mitochondrial_chromosome\", \"nuclear_chromosome\"."); INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (548,'BRC4_seq_region_name','BRC4_seq_region_name','seq_region name for the BRC4 project'); INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (549,'EBI_seq_region_name','EBI_seq_region_name','EBI seq_region name for the BRC4 project data roundtripping'); INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (550, 'MANE_Plus_Clinical', 'MANE Plus Clinical', 'Transcripts in the MANE Plus Clinical set are additional transcripts per locus necessary to support clinical variant reporting, for example transcripts containing known Pathogenic or Likely Pathogenic clinical variants not reportable using the MANE Select set. Note there may be additional clinically relevant transcripts in the wider RefSeq and Ensembl/GENCODE sets but not yet in MANE.'); @@ -362,6 +362,12 @@ INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (574, 'sha512t24u_pep', 'sha512 Checksum for translation sequences ', 'sha512 Checksum for translation sequences'); INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (575, 'md5_pep', 'md5 Checksum for translation sequences', 'md5 Checksum for translation sequences '); INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (576, 'gencode_primary', 'GENCODE primary annotation', 'Gencode Primary replaces Gencode Basic for protein coding genes as the set of the most biologically relevant transcripts. Protein coding transcripts are compared to MANE Select or to Ensembl Canonical where MANE Select is not yet defined. The features (exons and introns) that are novel to the reference transcript are filtered and assessed for biological relevance using conservation and expression data. The transcripts having the filtered features are retained for the set. For noncoding genes, the Gencode Primary set is identical to the Gencode Basic set.'); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (577, 'somatic_clin_sig', 'ClinVar somatic classification', 'The somatic classification of a variant as reported by ClinVar'); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (578, 'oncogenic_clin_sig', 'ClinVar somatic classification of oncogenicity', 'The somatic classification of oncogenicity of a variant as reported by ClinVar'); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (579, 'somatic_status', 'ClinVar somatic review_status', 'ClinVar review_status for assertation for somatic classification'); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (580, 'oncogenic_status', 'ClinVar somatic oncogenicity review_status', 'ClinVar review_status for assertation for somatic classification of oncogenicity'); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (581, 'somatic_date', 'Date last evaluated for ClinVar evidence', 'The most recent date on which evidence was evaluated and this conclusion drawn for ClinVar somatic classification'); +INSERT IGNORE INTO attrib_type (attrib_type_id, code, name, description) VALUES (582, 'oncogenic_date', 'Date last evaluated for ClinVar evidence', 'The most recent date on which evidence was evaluated and this conclusion drawn for ClinVar somatic classification of oncogenicity'); INSERT IGNORE INTO attrib (attrib_id, attrib_type_id, value) VALUES (1, 469, 'SO:0001483'); INSERT IGNORE INTO attrib (attrib_id, attrib_type_id, value) VALUES (2, 470, 'SNV'); diff --git a/sql/patch_114_115_a.sql b/sql/patch_114_115_a.sql new file mode 100644 index 000000000..714485201 --- /dev/null +++ b/sql/patch_114_115_a.sql @@ -0,0 +1,20 @@ +-- Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +-- Copyright [2016-2024] EMBL-European Bioinformatics Institute +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + + # update the schema_version entry in the meta table + UPDATE meta SET meta_value = '115' WHERE meta_key = 'schema_version'; + + # patch identifier + INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'patch', 'patch_114_115_a.sql|schema version'); diff --git a/sql/patch_114_115_b.sql b/sql/patch_114_115_b.sql new file mode 100644 index 000000000..8be3ced8f --- /dev/null +++ b/sql/patch_114_115_b.sql @@ -0,0 +1,22 @@ +-- Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute +-- Copyright [2016-2024] EMBL-European Bioinformatics Institute +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +# Add column to phenotype_feature +ALTER TABLE phenotype_feature ADD DNA_type enum('Germline', 'Somatic') DEFAULT NULL; +# Create index +CREATE INDEX dna_type_idx on phenotype_feature (DNA_type); + +# patch identifier +INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'patch', 'patch_114_115_b.sql|Add column DNA_type to phenotype_feature'); diff --git a/sql/table.sql b/sql/table.sql index ad3da2e73..7633983fa 100755 --- a/sql/table.sql +++ b/sql/table.sql @@ -348,6 +348,7 @@ CREATE TABLE allele ( @column seq_region_start The start position of the feature on the @link seq_region. @column seq_region_end The end position of the feature on the @link seq_region. @column seq_region_strand The orientation of the feature on the @link seq_region. +@column DNA_type The type of DNA, 'Germline' or 'Somatic'. @see variation @see phenotype @@ -367,12 +368,14 @@ CREATE TABLE IF NOT EXISTS `phenotype_feature` ( `seq_region_start` INT(11) UNSIGNED DEFAULT NULL, `seq_region_end` INT(11) UNSIGNED DEFAULT NULL, `seq_region_strand` TINYINT(4) DEFAULT NULL, + `DNA_type` ENUM('Germline', 'Somatic') DEFAULT NULL, PRIMARY KEY (`phenotype_feature_id`), KEY `phenotype_idx` (`phenotype_id`), KEY `object_idx` (`object_id`,`type`), KEY `type_idx` (`type`), KEY `pos_idx` (`seq_region_id`,`seq_region_start`,`seq_region_end`), - KEY `source_idx` (`source_id`) + KEY `source_idx` (`source_id`), + KEY `dna_type_idx` (`DNA_type`) ); @@ -1835,10 +1838,11 @@ CREATE TABLE meta ( # Add schema type and schema version to the meta table. -INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'schema_type', 'variation'), (NULL, 'schema_version', '114'); +INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'schema_type', 'variation'), (NULL, 'schema_version', '115'); # Patch IDs for new release -INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'patch', 'patch_113_114_a.sql|schema version'); +INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'patch', 'patch_114_115_a.sql|schema version'); +INSERT INTO meta (species_id, meta_key, meta_value) VALUES (NULL, 'patch', 'patch_114_115_b.sql|Add column DNA_type to phenotype_feature'); /** @header Failed tables