Merge pull request #50 from berkeleybop/robot-helper-work

Adding a component for processing, seeded with pre-processing
berkeleybop · Mar 19, 2024 · 08630c3 · 08630c3
2 parents 9ef12bb + b46f41d
commit 08630c3
Show file tree

Hide file tree

Showing 3 changed files with 214 additions and 0 deletions.
diff --git a/src/ontology/artificial-intelligence-ontology.Makefile b/src/ontology/artificial-intelligence-ontology.Makefile
@@ -23,6 +23,14 @@ aio-component.owl: aio-src.tsv
 	  annotate --annotation-file aio-annotations.ttl \
 	  -o $@
 
+# Build a component ontology from the ROBOT template CSV of the same name and
+# annotate the result with aio-annotations.ttl. The annotation file is declared
+# as a prerequisite because the recipe reads it: without that, editing it would
+# not trigger a rebuild. The CSV stays first so that $< still names the template.
+components/%.owl: components/%.csv aio-annotations.ttl
+	robot template \
+	  --add-prefix 'AIO: https://w3id.org/aio/' \
+	  --add-prefix 'oio: http://www.geneontology.org/formats/oboInOwl#' \
+	  -t $< \
+	  annotate --annotation-file aio-annotations.ttl \
+	  -o $@
+
 # hacky step for now - but we should treat as a proper component
 aio-edit.owl: aio-component.owl
 	cp $< $@

diff --git a/src/ontology/components/processing.csv b/src/ontology/components/processing.csv
@@ -0,0 +1,14 @@
+identifier,name,parent,description,related_concepts
+ID,LABEL,SC %,A rdfs:comment,A oboInOwl:hasRelatedSynonym SPLIT=|
+AIO:DataPreparation,data preparation,,"Techniques focused on preparing raw data for training, including cleaning, normalization, and tokenization.",
+AIO:ModelEfficiency,model efficiency,,"Techniques aimed at making models more efficient, such as knowledge distillation.",
+AIO:TrainingStrategies,training strategies,,"Specific strategies or methodologies employed during model training.",
+AIO:DataEnhancement,data enhancement,,"Methods that enhance the training data or its representation, including augmentation and feature extraction.",
+AIO:Distillation,distillation,model efficiency,"Knowledge distillation involves training a smaller model to replicate the behavior of a larger model, aiming to compress the knowledge into a more compact form without significant loss of performance.",Knowledge compression|Teacher-student model
+AIO:TokenizationAndVocabularyReduction,tokenization and vocabulary reduction,data preparation,"Breaking down text data into manageable pieces called tokens and reducing the model's vocabulary to streamline processing.",Tokenization|Vocabulary size reduction
+AIO:CleaningAndNormalization,cleaning and normalization,data preparation,"Removing irrelevant data, correcting typos, and standardizing text to reduce noise and ensure consistency in the data.",Data cleaning|Text normalization
+AIO:SubwordSegmentation,subword segmentation,data preparation,"Utilizing techniques like Byte Pair Encoding (BPE) or SentencePiece to break down words into smaller units, allowing the model to handle a wide range of vocabulary with a fixed-size list.",Byte Pair Encoding|SentencePiece
+AIO:DataAugmentation,data augmentation,data enhancement,"Expanding the training dataset artificially by modifying existing data points to improve the model's robustness and generalization ability.",Paraphrasing|Synonym replacement
+AIO:CurriculumLearning,curriculum learning,training strategies,"Training the model on simpler tasks or easier data first, then gradually introducing more complex tasks to improve learning efficiency and performance.",Sequential learning|Complexity grading
+AIO:TransferLearning,transfer learning,training strategies,"Starting the training from a model already trained on a related task to reduce training time and improve performance on tasks with limited data.",Pretrained models|Adaptation
+AIO:FeatureExtraction,feature extraction,data enhancement,"Extracting specific features or patterns from the text before training to guide the model's learning process, including syntactic information or semantic embeddings.",Syntactic information|Semantic embeddings
diff --git a/src/ontology/components/processing.owl b/src/ontology/components/processing.owl
@@ -0,0 +1,192 @@
+<?xml version="1.0"?>
+<rdf:RDF xmlns="http://www.w3.org/2002/07/owl#"
+     xml:base="http://www.w3.org/2002/07/owl"
+     xmlns:dc="http://purl.org/dc/elements/1.1/"
+     xmlns:owl="http://www.w3.org/2002/07/owl#"
+     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+     xmlns:xml="http://www.w3.org/XML/1998/namespace"
+     xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
+     xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+     xmlns:terms="http://purl.org/dc/terms/"
+     xmlns:oboInOwl="http://www.geneontology.org/formats/oboInOwl#">
+    <Ontology>
+        <dc:description>This ontology models classes and relationships describing deep learning networks, their component layers and activation functions, as well as potential biases.</dc:description>
+        <dc:title>Artificial Intelligence Ontology</dc:title>
+        <terms:license rdf:resource="http://creativecommons.org/licenses/by/4.0/"/>
+    </Ontology>
+
+
+
+    <!-- 
+    ///////////////////////////////////////////////////////////////////////////////////////
+    //
+    // Annotation properties
+    //
+    ///////////////////////////////////////////////////////////////////////////////////////
+     -->
+
+
+
+
+    <!-- http://purl.org/dc/elements/1.1/description -->
+
+    <AnnotationProperty rdf:about="http://purl.org/dc/elements/1.1/description"/>
+
+
+
+    <!-- http://purl.org/dc/elements/1.1/title -->
+
+    <AnnotationProperty rdf:about="http://purl.org/dc/elements/1.1/title"/>
+
+
+
+    <!-- http://purl.org/dc/terms/license -->
+
+    <AnnotationProperty rdf:about="http://purl.org/dc/terms/license"/>
+
+
+
+    <!-- http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym -->
+
+    <AnnotationProperty rdf:about="http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym"/>
+
+
+
+    <!-- 
+    ///////////////////////////////////////////////////////////////////////////////////////
+    //
+    // Classes
+    //
+    ///////////////////////////////////////////////////////////////////////////////////////
+     -->
+
+
+
+
+    <!-- https://w3id.org/aio/CleaningAndNormalization -->
+
+    <Class rdf:about="https://w3id.org/aio/CleaningAndNormalization">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/DataPreparation"/>
+        <oboInOwl:hasRelatedSynonym>Data cleaning|Text normalization</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Removing irrelevant data, correcting typos, and standardizing text to reduce noise and ensure consistency in the data.</rdfs:comment>
+        <rdfs:label>cleaning and normalization</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/CurriculumLearning -->
+
+    <Class rdf:about="https://w3id.org/aio/CurriculumLearning">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/TrainingStrategies"/>
+        <oboInOwl:hasRelatedSynonym>Sequential learning|Complexity grading</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Training the model on simpler tasks or easier data first, then gradually introducing more complex tasks to improve learning efficiency and performance.</rdfs:comment>
+        <rdfs:label>curriculum learning</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/DataAugmentation -->
+
+    <Class rdf:about="https://w3id.org/aio/DataAugmentation">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/DataEnhancement"/>
+        <oboInOwl:hasRelatedSynonym>Paraphrasing|Synonym replacement</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Expanding the training dataset artificially by modifying existing data points to improve the model&apos;s robustness and generalization ability.</rdfs:comment>
+        <rdfs:label>data augmentation</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/DataEnhancement -->
+
+    <Class rdf:about="https://w3id.org/aio/DataEnhancement">
+        <oboInOwl:hasRelatedSynonym>Methods that enhance the training data or its representation, including augmentation and feature extraction.</oboInOwl:hasRelatedSynonym>
+        <rdfs:label>data enhancement</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/DataPreparation -->
+
+    <Class rdf:about="https://w3id.org/aio/DataPreparation">
+        <oboInOwl:hasRelatedSynonym>Techniques focused on preparing raw data for training, including cleaning, normalization, and tokenization.</oboInOwl:hasRelatedSynonym>
+        <rdfs:label>data preparation</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/Distillation -->
+
+    <Class rdf:about="https://w3id.org/aio/Distillation">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/ModelEfficiency"/>
+        <oboInOwl:hasRelatedSynonym>Knowledge compression|Teacher-student model</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Knowledge distillation involves training a smaller model to replicate the behavior of a larger model, aiming to compress the knowledge into a more compact form without significant loss of performance.</rdfs:comment>
+        <rdfs:label>distillation</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/FeatureExtraction -->
+
+    <Class rdf:about="https://w3id.org/aio/FeatureExtraction">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/DataEnhancement"/>
+        <oboInOwl:hasRelatedSynonym>Syntactic information|Semantic embeddings</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Extracting specific features or patterns from the text before training to guide the model&apos;s learning process, including syntactic information or semantic embeddings.</rdfs:comment>
+        <rdfs:label>feature extraction</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/ModelEfficiency -->
+
+    <Class rdf:about="https://w3id.org/aio/ModelEfficiency">
+        <oboInOwl:hasRelatedSynonym>Techniques aimed at making models more efficient, such as knowledge distillation.</oboInOwl:hasRelatedSynonym>
+        <rdfs:label>model efficiency</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/SubwordSegmentation -->
+
+    <Class rdf:about="https://w3id.org/aio/SubwordSegmentation">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/DataPreparation"/>
+        <oboInOwl:hasRelatedSynonym>Byte Pair Encoding|SentencePiece</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Utilizing techniques like Byte Pair Encoding (BPE) or SentencePiece to break down words into smaller units, allowing the model to handle a wide range of vocabulary with a fixed-size list.</rdfs:comment>
+        <rdfs:label>subword segmentation</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/TokenizationAndVocabularyReduction -->
+
+    <Class rdf:about="https://w3id.org/aio/TokenizationAndVocabularyReduction">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/DataPreparation"/>
+        <oboInOwl:hasRelatedSynonym>Tokenization|Vocabulary size reduction</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Breaking down text data into manageable pieces called tokens and reducing the model&apos;s vocabulary to streamline processing.</rdfs:comment>
+        <rdfs:label>tokenization and vocabulary reduction</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/TrainingStrategies -->
+
+    <Class rdf:about="https://w3id.org/aio/TrainingStrategies">
+        <oboInOwl:hasRelatedSynonym>Specific strategies or methodologies employed during model training.</oboInOwl:hasRelatedSynonym>
+        <rdfs:label>training strategies</rdfs:label>
+    </Class>
+
+
+
+    <!-- https://w3id.org/aio/TransferLearning -->
+
+    <Class rdf:about="https://w3id.org/aio/TransferLearning">
+        <rdfs:subClassOf rdf:resource="https://w3id.org/aio/TrainingStrategies"/>
+        <oboInOwl:hasRelatedSynonym>Pretrained models|Adaptation</oboInOwl:hasRelatedSynonym>
+        <rdfs:comment>Starting the training from a model already trained on a related task to reduce training time and improve performance on tasks with limited data.</rdfs:comment>
+        <rdfs:label>transfer learning</rdfs:label>
+    </Class>
+</rdf:RDF>
+
+
+
+<!-- Generated by the OWL API (version 4.5.26) https://github.com/owlcs/owlapi -->
+