Skip to content

Commit

Permalink
end-to-end system
Browse files Browse the repository at this point in the history
  • Loading branch information
T-Almeida committed Apr 18, 2024
1 parent 5be6c7a commit f813050
Show file tree
Hide file tree
Showing 30 changed files with 3,070 additions and 101 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
dataset
knowledge-bases
testset

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
20 changes: 20 additions & 0 deletions OllamaAPICall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from src.utils import GenericAPICall
import requests

class OllamaAPICall(GenericAPICall):
    """LLM API wrapper that talks to an Ollama server.

    Sends a prompt to the server's ``/api/generate/`` endpoint and returns
    the raw text completion produced by the model.
    """

    # Model name and decoding options used for every request.
    MODEL_NAME = "nous-hermes2-mixtral:latest"

    def __init__(self, address) -> None:
        """
        Parameters
        ----------
        address : str
            Base URL of the Ollama server, e.g. ``http://localhost:11434``.
        """
        super().__init__(address)
        self.endpoint = f"{address}/api/generate/"

    def run(self, prompt, timeout=300):
        """Send *prompt* to the model and return the generated text.

        Parameters
        ----------
        prompt : str
            The full prompt to complete.
        timeout : float, optional
            Seconds to wait for the server before giving up. The original
            code had no timeout, so an unresponsive server hung the whole
            pipeline forever.

        Returns
        -------
        str
            The model's completion text.

        Raises
        ------
        requests.HTTPError
            If the server answers with a non-2xx status code.
        """
        response = requests.post(
            self.endpoint,
            json={
                "model": self.MODEL_NAME,
                "prompt": prompt,
                "options": {
                    "temperature": 0,    # deterministic decoding
                    "num_predict": 200,  # cap on generated tokens
                    "num_gpu": 33,       # layers offloaded to GPU
                },
                "stream": False,         # single JSON body instead of a chunked stream
            },
            timeout=timeout,
        )
        # Fail loudly on HTTP errors instead of raising a confusing
        # KeyError when 'response' is missing from an error payload.
        response.raise_for_status()
        return response.json()["response"]
Empty file added dataset/.keep
Empty file.
Empty file added knowledge-bases/.keep
Empty file.
82 changes: 67 additions & 15 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@

import argparse
from src import grouping_args
from src.utils import load_biocjson


if __name__ == "__main__":

Expand All @@ -10,41 +13,90 @@

configs = parser.add_argument_group('Global settings', 'This settings are related with the location of the files and directories.')

configs.add_argument('-t', '--tagger', default=False, action='store_true', \
configs.add_argument('-t', '--tagger', dest="use_tagger", default=False, action='store_true', \
help='(default: False)')
configs.add_argument('-l', '--linker', default=False, action='store_true', \
configs.add_argument('-l', '--linker', dest="use_linker", default=False, action='store_true', \
help='(default: False)')
configs.add_argument('-e', '--extractor', default=False, action='store_true', \
configs.add_argument('-e', '--extractor', dest="use_extractor", default=False, action='store_true', \
help='(default: False)')

### tagger options
tagger_configs = parser.add_argument_group('Tagger settings', 'This settings are related to the indexer module.')
tagger_configs.add_argument('--tagger.model_checkpoint', dest='tagger_model_checkpoint', \
type=str, nargs='+', default=None, \
help='The tagger model cfg path')
tagger_configs.add_argument('--tagger.checkpoint', \
type=str, default="trained_models/tagger/BioLinkBERT-large-dense-60-2-unk-P0.25-0.75-42-full/checkpoint-1200", \
help='')
tagger_configs.add_argument('--tagger.batch_size', \
type=int, default=8, \
help='')
tagger_configs.add_argument('--tagger.output_file', \
type=str, default="outputs/tagger/predicts.json", \
help='')

## linker options
linker_configs = parser.add_argument_group('Linker settings', 'This settings are related to the normalizer module.')
linker_configs.add_argument('--linker.llm_api', dest='linker_llm_api', \
linker_configs.add_argument('--linker.llm_api.module', \
default=None, \
help='')
linker_configs.add_argument('--linker.llm_api.address', \
default=None, \
help='')
linker_configs.add_argument('--linker.kb_folder', \
default="knowledge-bases/", \
help='')
linker_configs.add_argument('--linker.dataset_folder', \
default="dataset/", \
help='')
linker_configs.add_argument('--linker.output_file', \
default="outputs/linker/predicts.json", \
help='')

# extractor options
extractor_configs = parser.add_argument_group('Extractor settings', 'This settings are related to the extractor module.')
extractor_configs.add_argument('--extractor.write_path', dest='indexer_write_path', \
type=str, default=None, \
extractor_configs.add_argument('--extractor.output_file', \
type=str, default="outputs/extractor/predicts.json", \
help='The extractor outputs path')
extractor_configs.add_argument('--extractor.checkpoint', \
type=str, default="trained_models/extractor/biolinkbert-large-full-mha-both-3-32456-20-mask-False/checkpoint-17340", \
help='')
extractor_configs.add_argument('--extractor.batch_size', \
type=int, default=128, \
help='')

args = parser.parse_args()

args = grouping_args(parser.parse_args())

if not args.tagger and \
not args.linker and \
not args.extractor:
print(args)
if not args.use_tagger and \
not args.use_linker and \
not args.use_extractor:
# by default lets assume that we want to run the full pipeline!
args.tagger, args.linker, args.extractor = True, True, True
args.use_tagger, args.use_linker, args.use_extractor = True, True, True

if (args.tagger, args.linker, args.extractor) in {(True, False, True)}:
if (args.use_tagger, args.use_linker, args.use_extractor) in {(True, False, True)}:
print("It is not possible to run the extractor after the annotator module in this pipeline. Any other configuration is valid. ")
exit()

pipeline = []

if args.use_tagger:
from src.tagger import Tagger
pipeline.append(Tagger(**args.tagger.get_kwargs()))

if args.use_linker:
from src.linker import Linker
pipeline.append(Linker(**args.linker.get_kwargs()))

if args.use_extractor:
from src.extractor import Extractor
pipeline.append(Extractor(**args.extractor.get_kwargs()))

print("Pipeline built")
print(pipeline)

print("Running")
input_file = args.source_file
for module in pipeline:
input_file = module.run(input_file)



Empty file added outputs/extractor/.keep
Empty file.
Empty file added outputs/linker/.keep
Empty file.
Empty file added outputs/tagger/.keep
Empty file.
182 changes: 182 additions & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import argparse

class Params:
    """
    A container class used to store a group of CLI parameters.

    Groups may be nested: adding the parameter path ``["tagger", "batch_size"]``
    creates a child ``Params`` instance reachable as ``self.tagger`` that
    holds ``batch_size``.

    Attributes
    ----------
    params_keys : List[str]
        Names of all parameters (and sub-groups) stored on this object.
    """
    def __init__(self):
        self.params_keys = []

    def __str__(self):
        attr_str = ", ".join([f"{param}={getattr(self,param)}" for param in self.params_keys])
        return f"Params({attr_str})"

    def __repr__(self):
        return self.__str__()

    def add_parameter(self, parameter_name, parameter_value):
        """
        Adds at runtime a parameter with its respective value to this object.

        Parameters
        ----------
        parameter_name : List[str]
            Path of the parameter, e.g. ``["group", "name"]``; intermediate
            components become nested ``Params`` groups. (The original
            docstring said ``str``, but the code indexes and slices it as a
            list of path components.)
        parameter_value : object
            Value of the variable.
        """
        head = parameter_name[0]
        if len(parameter_name) > 1:
            if head in self.params_keys:
                # Sub-group already exists: descend into it.
                getattr(self, head).add_parameter(parameter_name[1:], parameter_value)
            else:
                # Create the sub-group and register it exactly once.
                new_params = Params()
                new_params.add_parameter(parameter_name[1:], parameter_value)
                setattr(self, head, new_params)
                self.params_keys.append(head)
        else:
            # Leaf value: overwrite any previous value, but never append a
            # duplicate key. (The original appended unconditionally, so
            # re-adding the same parameter corrupted `params_keys`.)
            setattr(self, head, parameter_value)
            if head not in self.params_keys:
                self.params_keys.append(head)

    def get_kwargs(self) -> dict:
        """
        Gets all of the parameters stored inside as python keyword arguments.

        Returns
        ----------
        dict
            Dictionary with variable names as keys and their values; nested
            ``Params`` groups become nested dicts. A result holding a single
            nested dict is "simplified" to that inner dict.
        """
        kwargs = {}
        for var_name in self.params_keys:
            value = getattr(self, var_name)
            if isinstance(value, Params):
                value = value.get_kwargs()

            kwargs[var_name] = value

        # Is it a nested dict with only one key? If so, try to simplify by
        # returning the inner dict directly (ignores the outer wrapper).
        if len(kwargs) == 1:
            key = next(iter(kwargs))
            if isinstance(kwargs[key], dict):
                return kwargs[key]

        return kwargs

    def get_kwargs_without_defaults(self):
        """
        Like :meth:`get_kwargs`, but keeps only the arguments that were
        explicitly typed on the terminal (as recorded by ``CLIRecorder``).
        """
        cli_recorder = CLIRecorder()
        kwargs = self.get_kwargs()
        # Remove the arguments that were not specified on the terminal.
        return {k: v for k, v in kwargs.items() if k in cli_recorder}


class Singleton(type):
    """
    Metaclass that turns every class using it into a process-wide singleton
    (recipe adapted from the Python Cookbook).
    """
    _instances = {}

    def __call__(cls, *args, **kwargs):
        # EAFP: return the cached instance, building it only on first use.
        try:
            return cls._instances[cls]
        except KeyError:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
            return instance

class CLIRecorder(metaclass=Singleton):
    """Singleton registry of the option names actually typed on the CLI.

    Only the last component of a dotted option name is stored, so e.g.
    ``linker.output_file`` is recorded as ``output_file``.
    """

    def __init__(self):
        self.args = set()

    def add_arg(self, arg):
        # Keep only the leaf name of a dotted argument path.
        *_, leaf = arg.split(".")
        self.args.add(leaf)

    def __contains__(self, value):
        return value in self.args

class RecordArgument(argparse.Action):
    """argparse action that remembers which options were explicitly given.

    It stores the value exactly like the default "store" action, but also
    registers the destination name with the global ``CLIRecorder`` singleton
    so defaults can later be told apart from user-supplied values.
    """

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        # Single-value options only; reject any nargs override up front.
        if nargs is not None:
            raise ValueError("nargs not allowed")
        super().__init__(option_strings, dest, **kwargs)
        self.cli_args = CLIRecorder()

    def __call__(self, parser, namespace, values, option_string=None):
        # Record the option's appearance on the command line, then mimic
        # the default "store" behavior.
        self.cli_args.add_arg(self.dest)
        setattr(namespace, self.dest, values)


def grouping_args(args):
    """
    Group dotted CLI options into nested ``Params`` objects.

    Optional arguments follow the format ``--<group name>.<variable name>
    <variable value>``; every namespace attribute whose name contains a dot
    is moved into a ``Params`` instance stored under the group name. For
    instance, ``indexer.posting_threshold`` and ``indexer.memory_threshold``
    both end up on the same group, accessible through ``args.indexer``.

    Parameters
    ----------
    args : argparse.Namespace
        Current namespace from argparse.

    Returns
    ----------
    argparse.Namespace
        The same namespace, modified in place after the grouping.
    """
    namespace_dict = vars(args)

    # Snapshot the key set: the dict is mutated while we walk it.
    for full_name in list(namespace_dict):
        if "." not in full_name:
            continue

        group_name, *param_path = full_name.split(".")
        if group_name not in namespace_dict:
            namespace_dict[group_name] = Params()

        namespace_dict[group_name].add_parameter(param_path, namespace_dict[full_name])
        del namespace_dict[full_name]

    return args

def cli_debug_printer(list_parameters, tab=""):
    """
    Recursively pretty-print (name, value) pairs as an indented tree.

    Parameters
    ----------
    list_parameters : Iterable[Tuple[str, object]]
        Pairs of parameter name and value; ``Params`` instances and dicts
        are rendered as nested sub-trees, anything else inline.
    tab : str
        Indentation prefix accumulated across recursion levels; grows by
        '|--- ' per level.

    Returns
    ----------
    str
        The formatted multi-line tree as a single string.
    """

    params_tree_str = ""

    # Blank out all but the last 5 characters of the prefix so only the
    # innermost '|--- ' connector is drawn (for tab == "" this stays "").
    _tab = " "*(len(tab)-5) + tab[-5:]

    for var, value in list_parameters:
        params_tree_str += f"{_tab}{var}: "
        if isinstance(value, Params):
            # Nested Params group: recurse over its kwargs, sorted by name.
            value_str = cli_debug_printer(sorted(value.get_kwargs().items()), tab+'|--- ')
            params_tree_str += f"\n{_tab}{value_str}"
        elif isinstance(value, dict):
            # Plain dict: recurse over its items, sorted by key.
            # NOTE(review): unlike the Params branch, `_tab` is not prepended
            # before the recursive output here — presumably intentional, but
            # worth confirming against the rendered output.
            value_str = cli_debug_printer(sorted(value.items()), tab+'|--- ')
            params_tree_str += f"\n{value_str}"
        else:
            # Leaf value: printed inline after the name.
            params_tree_str += f"{value}\n"
    #params_tree_str += f"\n"

    return params_tree_str
Empty file added src/core.py
Empty file.
2 changes: 1 addition & 1 deletion src/tagger/data.py → src/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import random
import math

from utils import split_chunks, RangeDict
from src.utils import split_chunks, RangeDict
from collections import defaultdict

from transformers import AutoTokenizer, DataCollatorForTokenClassification
Expand Down
Loading

0 comments on commit f813050

Please sign in to comment.