Skip to content

Commit

Permalink
end-to-end system
Browse files Browse the repository at this point in the history
  • Loading branch information
T-Almeida committed Apr 18, 2024
1 parent 5be6c7a commit f813050
Show file tree
Hide file tree
Showing 30 changed files with 3,070 additions and 101 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
dataset
knowledge-bases
testset

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
20 changes: 20 additions & 0 deletions OllamaAPICall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from src.utils import GenericAPICall
import requests

class OllamaAPICall(GenericAPICall):
    """LLM API wrapper that talks to an Ollama server.

    Sends a prompt to the server's ``/api/generate/`` endpoint and returns
    the raw text completion produced by the model.
    """

    # Model name and decoding options used for every request.
    MODEL_NAME = "nous-hermes2-mixtral:latest"

    def __init__(self, address) -> None:
        """
        Parameters
        ----------
        address : str
            Base URL of the Ollama server, e.g. ``http://localhost:11434``.
        """
        super().__init__(address)
        self.endpoint = f"{address}/api/generate/"

    def run(self, prompt, timeout=300):
        """Send *prompt* to the model and return the generated text.

        Parameters
        ----------
        prompt : str
            The full prompt to complete.
        timeout : float, optional
            Seconds to wait for the server before giving up. The original
            code had no timeout, so an unresponsive server hung the whole
            pipeline forever.

        Returns
        -------
        str
            The model's completion text.

        Raises
        ------
        requests.HTTPError
            If the server answers with a non-2xx status code.
        """
        response = requests.post(
            self.endpoint,
            json={
                "model": self.MODEL_NAME,
                "prompt": prompt,
                "options": {
                    "temperature": 0,    # deterministic decoding
                    "num_predict": 200,  # cap on generated tokens
                    "num_gpu": 33,       # layers offloaded to GPU
                },
                "stream": False,         # single JSON body instead of a chunked stream
            },
            timeout=timeout,
        )
        # Fail loudly on HTTP errors instead of raising a confusing
        # KeyError when 'response' is missing from an error payload.
        response.raise_for_status()
        return response.json()["response"]
Empty file added dataset/.keep
Empty file.
Empty file added knowledge-bases/.keep
Empty file.
82 changes: 67 additions & 15 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@

import argparse
from src import grouping_args
from src.utils import load_biocjson


if __name__ == "__main__":

Expand All @@ -10,41 +13,90 @@

configs = parser.add_argument_group('Global settings', 'This settings are related with the location of the files and directories.')

configs.add_argument('-t', '--tagger', default=False, action='store_true', \
configs.add_argument('-t', '--tagger', dest="use_tagger", default=False, action='store_true', \
help='(default: False)')
configs.add_argument('-l', '--linker', default=False, action='store_true', \
configs.add_argument('-l', '--linker', dest="use_linker", default=False, action='store_true', \
help='(default: False)')
configs.add_argument('-e', '--extractor', default=False, action='store_true', \
configs.add_argument('-e', '--extractor', dest="use_extractor", default=False, action='store_true', \
help='(default: False)')

### tagger options
tagger_configs = parser.add_argument_group('Tagger settings', 'This settings are related to the indexer module.')
tagger_configs.add_argument('--tagger.model_checkpoint', dest='tagger_model_checkpoint', \
type=str, nargs='+', default=None, \
help='The tagger model cfg path')
tagger_configs.add_argument('--tagger.checkpoint', \
type=str, default="trained_models/tagger/BioLinkBERT-large-dense-60-2-unk-P0.25-0.75-42-full/checkpoint-1200", \
help='')
tagger_configs.add_argument('--tagger.batch_size', \
type=int, default=8, \
help='')
tagger_configs.add_argument('--tagger.output_file', \
type=str, default="outputs/tagger/predicts.json", \
help='')

## linker options
linker_configs = parser.add_argument_group('Linker settings', 'This settings are related to the normalizer module.')
linker_configs.add_argument('--linker.llm_api', dest='linker_llm_api', \
linker_configs.add_argument('--linker.llm_api.module', \
default=None, \
help='')
linker_configs.add_argument('--linker.llm_api.address', \
default=None, \
help='')
linker_configs.add_argument('--linker.kb_folder', \
default="knowledge-bases/", \
help='')
linker_configs.add_argument('--linker.dataset_folder', \
default="dataset/", \
help='')
linker_configs.add_argument('--linker.output_file', \
default="outputs/linker/predicts.json", \
help='')

# extractor options
extractor_configs = parser.add_argument_group('Extractor settings', 'This settings are related to the extractor module.')
extractor_configs.add_argument('--extractor.write_path', dest='indexer_write_path', \
type=str, default=None, \
extractor_configs.add_argument('--extractor.output_file', \
type=str, default="outputs/extractor/predicts.json", \
help='The extractor outputs path')
extractor_configs.add_argument('--extractor.checkpoint', \
type=str, default="trained_models/extractor/biolinkbert-large-full-mha-both-3-32456-20-mask-False/checkpoint-17340", \
help='')
extractor_configs.add_argument('--extractor.batch_size', \
type=int, default=128, \
help='')

args = parser.parse_args()

args = grouping_args(parser.parse_args())

if not args.tagger and \
not args.linker and \
not args.extractor:
print(args)
if not args.use_tagger and \
not args.use_linker and \
not args.use_extractor:
# by default lets assume that we want to run the full pipeline!
args.tagger, args.linker, args.extractor = True, True, True
args.use_tagger, args.use_linker, args.use_extractor = True, True, True

if (args.tagger, args.linker, args.extractor) in {(True, False, True)}:
if (args.use_tagger, args.use_linker, args.use_extractor) in {(True, False, True)}:
print("It is not possible to run the extractor after the annotator module in this pipeline. Any other configuration is valid. ")
exit()

pipeline = []

if args.use_tagger:
from src.tagger import Tagger
pipeline.append(Tagger(**args.tagger.get_kwargs()))

if args.use_linker:
from src.linker import Linker
pipeline.append(Linker(**args.linker.get_kwargs()))

if args.use_extractor:
from src.extractor import Extractor
pipeline.append(Extractor(**args.extractor.get_kwargs()))

print("Pipeline built")
print(pipeline)

print("Running")
input_file = args.source_file
for module in pipeline:
input_file = module.run(input_file)



Empty file added outputs/extractor/.keep
Empty file.
Empty file added outputs/linker/.keep
Empty file.
Empty file added outputs/tagger/.keep
Empty file.
182 changes: 182 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import argparse

class Params:
    """
    A container class used to store a group of CLI parameters.

    Groups may be nested: adding the parameter path ``["tagger", "batch_size"]``
    creates a child ``Params`` instance reachable as ``self.tagger`` that
    holds ``batch_size``.

    Attributes
    ----------
    params_keys : List[str]
        Names of all parameters (and sub-groups) stored on this object.
    """
    def __init__(self):
        self.params_keys = []

    def __str__(self):
        attr_str = ", ".join([f"{param}={getattr(self,param)}" for param in self.params_keys])
        return f"Params({attr_str})"

    def __repr__(self):
        return self.__str__()

    def add_parameter(self, parameter_name, parameter_value):
        """
        Adds at runtime a parameter with its respective value to this object.

        Parameters
        ----------
        parameter_name : List[str]
            Path of the parameter, e.g. ``["group", "name"]``; intermediate
            components become nested ``Params`` groups. (The original
            docstring said ``str``, but the code indexes and slices it as a
            list of path components.)
        parameter_value : object
            Value of the variable.
        """
        head = parameter_name[0]
        if len(parameter_name) > 1:
            if head in self.params_keys:
                # Sub-group already exists: descend into it.
                getattr(self, head).add_parameter(parameter_name[1:], parameter_value)
            else:
                # Create the sub-group and register it exactly once.
                new_params = Params()
                new_params.add_parameter(parameter_name[1:], parameter_value)
                setattr(self, head, new_params)
                self.params_keys.append(head)
        else:
            # Leaf value: overwrite any previous value, but never append a
            # duplicate key. (The original appended unconditionally, so
            # re-adding the same parameter corrupted `params_keys`.)
            setattr(self, head, parameter_value)
            if head not in self.params_keys:
                self.params_keys.append(head)

    def get_kwargs(self) -> dict:
        """
        Gets all of the parameters stored inside as python keyword arguments.

        Returns
        ----------
        dict
            Dictionary with variable names as keys and their values; nested
            ``Params`` groups become nested dicts. A result holding a single
            nested dict is "simplified" to that inner dict.
        """
        kwargs = {}
        for var_name in self.params_keys:
            value = getattr(self, var_name)
            if isinstance(value, Params):
                value = value.get_kwargs()

            kwargs[var_name] = value

        # Is it a nested dict with only one key? If so, try to simplify by
        # returning the inner dict directly (ignores the outer wrapper).
        if len(kwargs) == 1:
            key = next(iter(kwargs))
            if isinstance(kwargs[key], dict):
                return kwargs[key]

        return kwargs

    def get_kwargs_without_defaults(self):
        """
        Like :meth:`get_kwargs`, but keeps only the arguments that were
        explicitly typed on the terminal (as recorded by ``CLIRecorder``).
        """
        cli_recorder = CLIRecorder()
        kwargs = self.get_kwargs()
        # Remove the arguments that were not specified on the terminal.
        return {k: v for k, v in kwargs.items() if k in cli_recorder}


class Singleton(type):
    """
    Metaclass that turns every class using it into a process-wide singleton
    (recipe adapted from the Python Cookbook).
    """
    _instances = {}

    def __call__(cls, *args, **kwargs):
        # EAFP: return the cached instance, building it only on first use.
        try:
            return cls._instances[cls]
        except KeyError:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
            return instance

class CLIRecorder(metaclass=Singleton):
    """Singleton registry of the option names actually typed on the CLI.

    Only the last component of a dotted option name is stored, so e.g.
    ``linker.output_file`` is recorded as ``output_file``.
    """

    def __init__(self):
        self.args = set()

    def add_arg(self, arg):
        # Keep only the leaf name of a dotted argument path.
        *_, leaf = arg.split(".")
        self.args.add(leaf)

    def __contains__(self, value):
        return value in self.args

class RecordArgument(argparse.Action):
    """argparse action that remembers which options were explicitly given.

    It stores the value exactly like the default "store" action, but also
    registers the destination name with the global ``CLIRecorder`` singleton
    so defaults can later be told apart from user-supplied values.
    """

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        # Single-value options only; reject any nargs override up front.
        if nargs is not None:
            raise ValueError("nargs not allowed")
        super().__init__(option_strings, dest, **kwargs)
        self.cli_args = CLIRecorder()

    def __call__(self, parser, namespace, values, option_string=None):
        # Record the option's appearance on the command line, then mimic
        # the default "store" behavior.
        self.cli_args.add_arg(self.dest)
        setattr(namespace, self.dest, values)


def grouping_args(args):
    """
    Group dotted CLI options into nested ``Params`` objects.

    Optional arguments follow the format ``--<group name>.<variable name>
    <variable value>``; every namespace attribute whose name contains a dot
    is moved into a ``Params`` instance stored under the group name. For
    instance, ``indexer.posting_threshold`` and ``indexer.memory_threshold``
    both end up on the same group, accessible through ``args.indexer``.

    Parameters
    ----------
    args : argparse.Namespace
        Current namespace from argparse.

    Returns
    ----------
    argparse.Namespace
        The same namespace, modified in place after the grouping.
    """
    namespace_dict = vars(args)

    # Snapshot the key set: the dict is mutated while we walk it.
    for full_name in list(namespace_dict):
        if "." not in full_name:
            continue

        group_name, *param_path = full_name.split(".")
        if group_name not in namespace_dict:
            namespace_dict[group_name] = Params()

        namespace_dict[group_name].add_parameter(param_path, namespace_dict[full_name])
        del namespace_dict[full_name]

    return args

def cli_debug_printer(list_parameters, tab=""):
    """
    Recursively pretty-print (name, value) pairs as an indented tree.

    Parameters
    ----------
    list_parameters : Iterable[Tuple[str, object]]
        Pairs of parameter name and value; ``Params`` instances and dicts
        are rendered as nested sub-trees, anything else inline.
    tab : str
        Indentation prefix accumulated across recursion levels; grows by
        '|--- ' per level.

    Returns
    ----------
    str
        The formatted multi-line tree as a single string.
    """

    params_tree_str = ""

    # Blank out all but the last 5 characters of the prefix so only the
    # innermost '|--- ' connector is drawn (for tab == "" this stays "").
    _tab = " "*(len(tab)-5) + tab[-5:]

    for var, value in list_parameters:
        params_tree_str += f"{_tab}{var}: "
        if isinstance(value, Params):
            # Nested Params group: recurse over its kwargs, sorted by name.
            value_str = cli_debug_printer(sorted(value.get_kwargs().items()), tab+'|--- ')
            params_tree_str += f"\n{_tab}{value_str}"
        elif isinstance(value, dict):
            # Plain dict: recurse over its items, sorted by key.
            # NOTE(review): unlike the Params branch, `_tab` is not prepended
            # before the recursive output here — presumably intentional, but
            # worth confirming against the rendered output.
            value_str = cli_debug_printer(sorted(value.items()), tab+'|--- ')
            params_tree_str += f"\n{value_str}"
        else:
            # Leaf value: printed inline after the name.
            params_tree_str += f"{value}\n"
    #params_tree_str += f"\n"

    return params_tree_str
Empty file added src/core.py
Empty file.
2 changes: 1 addition & 1 deletion src/tagger/data.py → src/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import random
import math

from utils import split_chunks, RangeDict
from src.utils import split_chunks, RangeDict
from collections import defaultdict

from transformers import AutoTokenizer, DataCollatorForTokenClassification
Expand Down
Loading

0 comments on commit f813050

Please sign in to comment.