Skip to content

Commit

Permalink
Update pathology extraction template (#472)
Browse files Browse the repository at this point in the history
  • Loading branch information
caufieldjh authored Nov 8, 2024
2 parents 1eea78d + 2534aa3 commit ce5c592
Show file tree
Hide file tree
Showing 2 changed files with 273 additions and 30 deletions.
170 changes: 155 additions & 15 deletions src/ontogpt/templates/pathology.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,33 @@
from __future__ import annotations

import re
import sys
from datetime import (
datetime,
date,
datetime,
time
)
from decimal import Decimal
from enum import Enum
import re
import sys
from typing import (
Any,
ClassVar,
Dict,
List,
Literal,
Dict,
Optional,
Union
)

from pydantic import (
BaseModel,
ConfigDict,
Field,
RootModel,
field_validator
)


metamodel_version = "None"
version = "None"

Expand Down Expand Up @@ -101,18 +105,68 @@ class SeverityLevel(str, Enum):
Not_Specified = "Not Specified"


class PathologyClassification(str, Enum):
class PathologyClassificationOne(str, Enum):
"""
The final classification of the overall pathology. This uses a system of five categories, identified numerically.
"""
The final classification of the overall pathology.
# No significant pathological abnormality was observed.
number_1 = "1"
# Neoplastic malignant growth was observed.
number_2 = "2"
# Dysplastic pathology was observed (i.e., abnormal or atypical cell growth and/or appearance).
number_3 = "3"
# Proliferative non-neoplastic pathology was observed.
number_4 = "4"
# Inflammatory or other non-proliferative abnormalities were observed.
number_5 = "5"


class PathologyClassificationTwo(str, Enum):
"""
# The final classification of the overall pathology is unclear.
Unclear = "Unclear"
# The final classification of the overall pathology is benign.
Benign = "Benign"
# The final classification of the overall pathology is malignant.
Malignant = "Malignant"
# The final classification of the overall pathology is inflammation.
Inflammation = "Inflammation"
The final classification of the overall pathology. This must be a code, "2" or "2a".
"""
# No significant pathological abnormality was observed.
number_1 = "1"
# Neoplastic malignant pathology was observed.
number_2 = "2"
# Carcinoma was observed.
number_2a = "2a"
# Sarcoma was observed.
number_2b = "2b"
# Lymphoma was observed.
number_2c = "2c"
# Other neoplastic malignant growth was observed.
number_2d = "2d"
# Dysplastic pathology was observed (i.e., abnormal or atypical cell growth and/or appearance).
number_3 = "3"
# High grade dysplasia was observed.
number_3a = "3a"
# Low grade dysplasia was observed.
number_3b = "3b"
# Proliferative non-neoplastic pathology was observed.
number_4 = "4"
# Inflammatory or other non-proliferative abnormalities were observed.
number_5 = "5"
# Acute or active inflammation was observed.
number_5a = "5a"
# Chronic inflammation was observed.
number_5b = "5b"
# Eosinophils were present.
number_5c = "5c"
# Granulomas / histiocytes / macrophages were present.
number_5d = "5d"
# Organisms (Bacterial, Viral, Parasitic, Fungal) were present.
number_5e = "5e"
# Collagen abnormalities were observed.
number_5f = "5f"
# Vessel abnormalities were observed.
number_5g = "5g"
# Apoptosis was observed.
number_5h = "5h"
# Mast cells were observed.
number_5i = "5i"
# Amyloid was observed.
number_5j = "5j"



Expand Down Expand Up @@ -144,6 +198,23 @@ class NamedEntity(ConfiguredBaseModel):
'value': 'AnnotationProperty, AnnotationAssertion'}},
'domain_of': ['NamedEntity'],
'slot_uri': 'rdfs:label'} })
original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['This is determined during grounding and normalization',
'But is based on the full input text'],
'domain_of': ['NamedEntity']} })

@field_validator('original_spans')
def pattern_original_spans(cls, v):
    """Check that every ``original_spans`` entry looks like ``"<start>:<end>"``.

    Accepts either a list of span strings or a single span string. Each
    string must match ``^\\d+:\\d+$`` (two runs of digits separated by a
    colon). Non-string list elements and values of any other type
    (including ``None``) are passed through without being checked.

    Returns:
        The value ``v`` unchanged.

    Raises:
        ValueError: If a span string does not match the expected pattern.
    """
    pattern = re.compile(r"^\d+:\d+$")
    if isinstance(v, list):
        for element in v:
            # Test the element, not the enclosing list: checking ``v`` here
            # is always False for a list, so no element was ever validated.
            if isinstance(element, str) and not pattern.match(element):
                raise ValueError(f"Invalid original_spans format: {element}")
    elif isinstance(v, str):
        if not pattern.match(v):
            raise ValueError(f"Invalid original_spans format: {v}")
    return v


class CompoundExpression(ConfiguredBaseModel):
Expand Down Expand Up @@ -204,6 +275,23 @@ class RelationshipType(NamedEntity):
'value': 'AnnotationProperty, AnnotationAssertion'}},
'domain_of': ['NamedEntity'],
'slot_uri': 'rdfs:label'} })
original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['This is determined during grounding and normalization',
'But is based on the full input text'],
'domain_of': ['NamedEntity']} })

@field_validator('original_spans')
def pattern_original_spans(cls, v):
    """Check that every ``original_spans`` entry looks like ``"<start>:<end>"``.

    Accepts either a list of span strings or a single span string. Each
    string must match ``^\\d+:\\d+$`` (two runs of digits separated by a
    colon). Non-string list elements and values of any other type
    (including ``None``) are passed through without being checked.

    Returns:
        The value ``v`` unchanged.

    Raises:
        ValueError: If a span string does not match the expected pattern.
    """
    pattern = re.compile(r"^\d+:\d+$")
    if isinstance(v, list):
        for element in v:
            # Test the element, not the enclosing list: checking ``v`` here
            # is always False for a list, so no element was ever validated.
            if isinstance(element, str) and not pattern.match(element):
                raise ValueError(f"Invalid original_spans format: {element}")
    elif isinstance(v, str):
        if not pattern.match(v):
            raise ValueError(f"Invalid original_spans format: {v}")
    return v


class Publication(ConfiguredBaseModel):
Expand Down Expand Up @@ -238,7 +326,8 @@ class PathologyReport(ConfiguredBaseModel):
risks: Optional[List[Union[Risk, str]]] = Field(None, description="""A semicolon-delimited list of risks for development of more severe pathologies, along with what they are a risk for. Format each in parentheses as \"risk factor (potential pathology)\". If not specified, this value must be \"Not Specified\".""", json_schema_extra = { "linkml_meta": {'alias': 'risks',
'any_of': [{'range': 'Risk'}, {'range': 'string'}],
'domain_of': ['PathologyReport']} })
overall_classification: Optional[PathologyClassification] = Field(None, description="""The final classification of the overall pathology. This must be one of the following: \"Unclear\", \"Benign\", \"Malignant\", or \"Inflammation\".""", json_schema_extra = { "linkml_meta": {'alias': 'overall_classification', 'domain_of': ['PathologyReport']} })
overall_classification_one: Optional[PathologyClassificationOne] = Field(None, description="""The final classification of the overall pathology. This must be a single number, corresponding to one of the following: 1 if No significant pathological abnormality, 2 if Neoplastic malignant, 3 if Dysplastic, 4 if Proliferative non-neoplastic, or 5 if Inflammatory and other non-proliferative abnormalities.""", json_schema_extra = { "linkml_meta": {'alias': 'overall_classification_one', 'domain_of': ['PathologyReport']} })
overall_classification_two: Optional[PathologyClassificationTwo] = Field(None, description="""The final classification of the overall pathology. This must be a code, like the following (i.e., \"2\" or \"2a\" are acceptable): 1. No significant pathological abnormality 2. Neoplastic malignant pathology 2a. Carcinoma 2b. Sarcoma 2c. Lymphoma 2d. Other neoplastic malignant growth 3. Dysplastic 3a. High grade dysplasia 3b. Low grade dysplasia 4. Proliferative non-neoplastic pathology 5. Inflammatory and other non-proliferative abnormalities 5a. Acute or active inflammation 5b. Chronic inflammation 5c. Eosinophils present 5d. Granulomas / histiocytes / macrophages present 5e. Organisms (Bacterial, Viral, Parasitic, Fungal) present 5f. Collagen abnormalities 5g. Vessel abnormalities 5h. Apoptosis 5i. Mast cells 5j. Amyloid""", json_schema_extra = { "linkml_meta": {'alias': 'overall_classification_two', 'domain_of': ['PathologyReport']} })


class PathologyStatement(ConfiguredBaseModel):
Expand Down Expand Up @@ -289,6 +378,23 @@ class Diagnosis(NamedEntity):
'value': 'AnnotationProperty, AnnotationAssertion'}},
'domain_of': ['NamedEntity'],
'slot_uri': 'rdfs:label'} })
original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['This is determined during grounding and normalization',
'But is based on the full input text'],
'domain_of': ['NamedEntity']} })

@field_validator('original_spans')
def pattern_original_spans(cls, v):
    """Check that every ``original_spans`` entry looks like ``"<start>:<end>"``.

    Accepts either a list of span strings or a single span string. Each
    string must match ``^\\d+:\\d+$`` (two runs of digits separated by a
    colon). Non-string list elements and values of any other type
    (including ``None``) are passed through without being checked.

    Returns:
        The value ``v`` unchanged.

    Raises:
        ValueError: If a span string does not match the expected pattern.
    """
    pattern = re.compile(r"^\d+:\d+$")
    if isinstance(v, list):
        for element in v:
            # Test the element, not the enclosing list: checking ``v`` here
            # is always False for a list, so no element was ever validated.
            if isinstance(element, str) and not pattern.match(element):
                raise ValueError(f"Invalid original_spans format: {element}")
    elif isinstance(v, str):
        if not pattern.match(v):
            raise ValueError(f"Invalid original_spans format: {v}")
    return v


class AnatomicalEntity(NamedEntity):
Expand All @@ -307,6 +413,23 @@ class AnatomicalEntity(NamedEntity):
'value': 'AnnotationProperty, AnnotationAssertion'}},
'domain_of': ['NamedEntity'],
'slot_uri': 'rdfs:label'} })
original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['This is determined during grounding and normalization',
'But is based on the full input text'],
'domain_of': ['NamedEntity']} })

@field_validator('original_spans')
def pattern_original_spans(cls, v):
    """Check that every ``original_spans`` entry looks like ``"<start>:<end>"``.

    Accepts either a list of span strings or a single span string. Each
    string must match ``^\\d+:\\d+$`` (two runs of digits separated by a
    colon). Non-string list elements and values of any other type
    (including ``None``) are passed through without being checked.

    Returns:
        The value ``v`` unchanged.

    Raises:
        ValueError: If a span string does not match the expected pattern.
    """
    pattern = re.compile(r"^\d+:\d+$")
    if isinstance(v, list):
        for element in v:
            # Test the element, not the enclosing list: checking ``v`` here
            # is always False for a list, so no element was ever validated.
            if isinstance(element, str) and not pattern.match(element):
                raise ValueError(f"Invalid original_spans format: {element}")
    elif isinstance(v, str):
        if not pattern.match(v):
            raise ValueError(f"Invalid original_spans format: {v}")
    return v


class Risk(ConfiguredBaseModel):
Expand Down Expand Up @@ -336,6 +459,23 @@ class Qualifier(NamedEntity):
'value': 'AnnotationProperty, AnnotationAssertion'}},
'domain_of': ['NamedEntity'],
'slot_uri': 'rdfs:label'} })
original_spans: Optional[List[str]] = Field(None, description="""The coordinates of the original text span from which the named entity was extracted, inclusive. For example, \"10:25\" means the span starting from the 10th character and ending with the 25th character. The first character in the text has index 0. Newlines are treated as single characters. Multivalued as there may be multiple spans for a single text.""", json_schema_extra = { "linkml_meta": {'alias': 'original_spans',
'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}},
'comments': ['This is determined during grounding and normalization',
'But is based on the full input text'],
'domain_of': ['NamedEntity']} })

@field_validator('original_spans')
def pattern_original_spans(cls, v):
    """Check that every ``original_spans`` entry looks like ``"<start>:<end>"``.

    Accepts either a list of span strings or a single span string. Each
    string must match ``^\\d+:\\d+$`` (two runs of digits separated by a
    colon). Non-string list elements and values of any other type
    (including ``None``) are passed through without being checked.

    Returns:
        The value ``v`` unchanged.

    Raises:
        ValueError: If a span string does not match the expected pattern.
    """
    pattern = re.compile(r"^\d+:\d+$")
    if isinstance(v, list):
        for element in v:
            # Test the element, not the enclosing list: checking ``v`` here
            # is always False for a list, so no element was ever validated.
            if isinstance(element, str) and not pattern.match(element):
                raise ValueError(f"Invalid original_spans format: {element}")
    elif isinstance(v, str):
        if not pattern.match(v):
            raise ValueError(f"Invalid original_spans format: {v}")
    return v


# Model rebuild
Expand Down
133 changes: 118 additions & 15 deletions src/ontogpt/templates/pathology.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,41 @@ classes:
- range: Risk
- range: string
multivalued: true
overall_classification:
overall_classification_one:
description: >-
The final classification of the overall pathology. This must be one of
the following: "Unclear", "Benign", "Malignant", or "Inflammation".
range: PathologyClassification
The final classification of the overall pathology. This must be a
single number, corresponding to one of
the following: 1 if No significant pathological abnormality, 2 if
Neoplastic malignant, 3 if Dysplastic, 4 if Proliferative
non-neoplastic, or 5 if Inflammatory and other non-proliferative
abnormalities.
range: PathologyClassificationOne
overall_classification_two:
description: >-
The final classification of the overall pathology. This must be a
code, like the following (i.e., "2" or "2a" are acceptable):
1. No significant pathological abnormality
2. Neoplastic malignant pathology
2a. Carcinoma
2b. Sarcoma
2c. Lymphoma
2d. Other neoplastic malignant growth
3. Dysplastic
3a. High grade dysplasia
3b. Low grade dysplasia
4. Proliferative non-neoplastic pathology
5. Inflammatory and other non-proliferative abnormalities
5a. Acute or active inflammation
5b. Chronic inflammation
5c. Eosinophils present
5d. Granulomas / histiocytes / macrophages present
5e. Organisms (Bacterial, Viral, Parasitic, Fungal) present
5f. Collagen abnormalities
5g. Vessel abnormalities
5h. Apoptosis
5i. Mast cells
5j. Amyloid
range: PathologyClassificationTwo

PathologyStatement:
description: >-
Expand Down Expand Up @@ -189,20 +219,93 @@ enums:
Not Specified:
description: >-
The severity of the pathology is not specified.
PathologyClassification:
PathologyClassificationOne:
description: >-
The final classification of the overall pathology.
The final classification of the overall pathology. This uses
a system of five categories, identified numerically.
permissible_values:
Unclear:
1:
description: >-
The final classification of the overall pathology is unclear.
Benign:
No significant pathological abnormality was observed.
2:
description: >-
The final classification of the overall pathology is benign.
Malignant:
Neoplastic malignant growth was observed.
3:
description: >-
The final classification of the overall pathology is malignant.
Inflammation:
Dysplastic pathology was observed (i.e., abnormal or atypical cell
growth and/or appearance).
4:
description: >-
The final classification of the overall pathology is inflammation.
Proliferative non-neoplastic pathology was observed.
5:
description: >-
Inflammatory or other non-proliferative abnormalities were observed.
PathologyClassificationTwo:
description: >-
The final classification of the overall pathology. This must be a
code, "2" or "2a".
permissible_values:
1:
description: >-
No significant pathological abnormality was observed.
2:
description: >-
Neoplastic malignant pathology was observed.
2a:
description: >-
Carcinoma was observed.
2b:
description: >-
Sarcoma was observed.
2c:
description: >-
Lymphoma was observed.
2d:
description: >-
Other neoplastic malignant growth was observed.
3:
description: >-
Dysplastic pathology was observed (i.e., abnormal or atypical cell
growth and/or appearance).
3a:
description: >-
High grade dysplasia was observed.
3b:
description: >-
Low grade dysplasia was observed.
4:
description: >-
Proliferative non-neoplastic pathology was observed.
5:
description: >-
Inflammatory or other non-proliferative abnormalities were observed.
5a:
description: >-
Acute or active inflammation was observed.
5b:
description: >-
Chronic inflammation was observed.
5c:
description: >-
Eosinophils were present.
5d:
description: >-
Granulomas / histiocytes / macrophages were present.
5e:
description: >-
Organisms (Bacterial, Viral, Parasitic, Fungal) were present.
5f:
description: >-
Collagen abnormalities were observed.
5g:
description: >-
Vessel abnormalities were observed.
5h:
description: >-
Apoptosis was observed.
5i:
description: >-
Mast cells were observed.
5j:
description: >-
Amyloid was observed.

0 comments on commit ce5c592

Please sign in to comment.