Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding sssom support #109

Merged
merged 20 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
koza-env/
examples/ctd/
test_df.csv
test_lut.txt
test.py

# Default output / Generated / Unpacked data files
output/
tests/resources/source-files/string.tsv*


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -133,7 +140,3 @@ dmypy.json

# IDE
.idea

# output directories
output/
test-output/
1,219 changes: 878 additions & 341 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "koza"
version = "0.3.1"
version = "0.4.0"
description = "Data transformation framework for LinkML data models"
authors = [
"The Monarch Initiative <[email protected]>",
Expand All @@ -24,6 +24,8 @@ ordered-set = ">=4.1.0"
typer = "^0.7.0"
typer-cli = "^0.0.13"
loguru = "*"
#sssom = "*"
sssom = { git = "https://github.com/glass-ships/sssom-py.git", branch = "develop" }
glass-ships marked this conversation as resolved.
Show resolved Hide resolved

[tool.poetry.dev-dependencies]
pytest = ">=6.0.0"
Expand Down
42 changes: 14 additions & 28 deletions src/koza/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@
import sys
from pathlib import Path
from typing import Dict, Union

import yaml

# For validation
from pydantic.error_wrappers import ValidationError
from linkml_validator.validator import Validator
from koza.converter.kgx_converter import KGXConverter
from pydantic.error_wrappers import ValidationError

from koza.converter.kgx_converter import KGXConverter
from koza.utils.exceptions import MapItemException, NextRowException
from koza.io.writer.jsonl_writer import JSONLWriter
from koza.io.writer.tsv_writer import TSVWriter
Expand All @@ -21,9 +19,6 @@
from koza.model.source import Source
from koza.model.translation_table import TranslationTable

# import logging
# logger = logging.getLogger(__name__)


class KozaApp:
"""
Expand All @@ -47,11 +42,9 @@ def __init__(
self._map_registry: Dict[str, Source] = {}
self._map_cache: Dict[str, Dict] = {}
self.curie_cleaner: CurieCleaner = CurieCleaner()
self.writer: KozaWriter = self._get_writer(
source.config.name, source.config.node_properties, source.config.edge_properties
)
self.writer: KozaWriter = self._get_writer()
self.logger = logger

if schema:
self.validator = Validator(schema=schema)
self.converter = KGXConverter()
Expand All @@ -63,10 +56,6 @@ def __init__(
map_file_config.transform_code = (str(Path(map_file).parent / Path(map_file).stem) + '.py')
self._map_registry[map_file_config.name] = Source(map_file_config)

self.writer = self._get_writer(
source.config.name, source.config.node_properties, source.config.edge_properties
)

def get_map(self, map_name: str):
    """
    Return the cached lookup map stored under ``map_name``.

    :param map_name: key into ``self._map_cache``
    :return: the cached entry for that name
    :raises KeyError: if nothing is cached under ``map_name``
    """
    # Return the entry directly instead of binding it to a local called
    # ``map``, which would shadow the builtin of the same name.
    return self._map_cache[map_name]
Expand Down Expand Up @@ -153,45 +142,42 @@ def next_row():
raise NextRowException

def write(self, *entities):
    """
    Validate ``entities`` when a validator is configured, then hand them to the writer.

    :param entities: the objects to write; they are forwarded to
        ``self.writer.write`` as a single tuple
    """
    # ``self.validator`` is only set when a schema was passed to the constructor
    if hasattr(self, 'validator'):
        (nodes, edges) = self.converter.convert(entities)

        # tsv and jsonl output are validated in exactly the same way, so both
        # formats share one code path; any other format is not validated here
        if self.output_format in (OutputFormat.tsv, OutputFormat.jsonl):
            for node in nodes or ():
                self.validator.validate(obj=node, target_class="NamedThing", strict=True)
            for edge in edges or ():
                self.validator.validate(obj=edge, target_class="Association", strict=True)

    self.writer.write(entities)

def _get_writer(self, name=None, node_properties=None, edge_properties=None) -> Union[TSVWriter, JSONLWriter, None]:
    """
    Build the writer that matches ``self.output_format``.

    The three parameters are optional overrides. Any that is left as ``None``
    is read from ``self.source.config``, so the method can be called either
    with no arguments or with explicit name/property values.

    :param name: source name passed to the writer
    :param node_properties: node properties passed to the writer
    :param edge_properties: edge properties passed to the writer
    :return: a TSVWriter for tsv output, a JSONLWriter for jsonl output,
        otherwise ``None``
    """
    config = self.source.config
    writer_params = [
        self.output_dir,
        config.name if name is None else name,
        config.node_properties if node_properties is None else node_properties,
        config.edge_properties if edge_properties is None else edge_properties,
        config.sssom_config,
    ]
    if self.output_format == OutputFormat.tsv:
        return TSVWriter(*writer_params)
    if self.output_format == OutputFormat.jsonl:
        return JSONLWriter(*writer_params)

    # No writer is built for any other output format
    return None

def _load_map(self, map_file: Source):

# map_file = Source(map_file_config)

if not isinstance(map_file.config, MapFileConfig):
raise ValueError(f"Error loading map: {map_file.config.name} is not a MapFileConfig")

Expand Down
12 changes: 5 additions & 7 deletions src/koza/cli_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

from pathlib import Path
from typing import Dict, Optional, Union
from typing import Dict, List, Optional, Union
import yaml

from koza.app import KozaApp
Expand Down Expand Up @@ -58,8 +58,6 @@ def transform_source(
verbose: bool = None,
log: bool = False,
):
# set_log_config(logging.INFO if (verbose is None) else logging.DEBUG if (verbose == True) else logging.WARNING)
# logger = get_logger(verbose, filename = f"logs/{Path(source).name}.log" if log else None)
logger = get_logger(name = Path(source).name if log else None, verbose = verbose)

with open(source, 'r') as source_fh:
Expand All @@ -73,16 +71,16 @@ def transform_source(
source_config.transform_code = str(Path(source).parent / Path(source).stem) + '.py'

koza_source = Source(source_config, row_limit)

logger.debug(f"Source created: {koza_source.config.name}")
translation_table = get_translation_table(
global_table if global_table else source_config.global_table,
local_table if local_table else source_config.local_table,
logger
)

source_koza = set_koza_app(koza_source, translation_table, output_dir, output_format, schema, logger)
source_koza.process_maps()
source_koza.process_sources()
koza_app = set_koza_app(koza_source, translation_table, output_dir, output_format, schema, logger)
koza_app.process_maps()
koza_app.process_sources()
glass-ships marked this conversation as resolved.
Show resolved Hide resolved


def validate_file(
Expand Down
11 changes: 6 additions & 5 deletions src/koza/converter/kgx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class KGXConverter:

"""

def convert(self, entities: Iterable) -> Tuple[dict, dict]:
def convert(self, entities: Iterable) -> Tuple[list, list]:

nodes = []
edges = []
Expand All @@ -26,12 +26,13 @@ def convert(self, entities: Iterable) -> Tuple[dict, dict]:
edges.append(self.convert_association(entity))

# if entity has id and name, but not subject/object/predicate, treat as node
elif all(hasattr(entity, attr) for attr in ["id", "name"]) and not all(
hasattr(entity, attr) for attr in ["subject", "object", "predicate"]
):
elif (
all(hasattr(entity, attr) for attr in ["id", "name"])
and not all(hasattr(entity, attr) for attr in ["subject", "object", "predicate"])
):
nodes.append(self.convert_node(entity))

# otherwise, not a
# otherwise, not a valid entity
else:
raise ValueError(
f"Cannot convert {entity}: Can only convert NamedThing or Association entities to KGX compatible dictionaries"
Expand Down
11 changes: 9 additions & 2 deletions src/koza/io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@

import requests

##### Helper Functions for Reader classes #####

######################
### Reader Helpers ###
######################

def open_resource(resource: Union[str, PathLike]) -> IO[str]:
"""
Expand Down Expand Up @@ -83,7 +86,10 @@ def check_data(entry, path) -> bool:
else:
tag = ppart.pop(0)

##### Helper functions for Writer classes #####

######################
### Writer Helpers ###
######################

# Biolink 2.0 "Knowledge Source" association slots,
# including the deprecated 'provided_by' slot
Expand Down Expand Up @@ -246,3 +252,4 @@ def is_null(item: Any) -> bool:
"""
null_values = {None, "", " "}
return item in null_values

16 changes: 8 additions & 8 deletions src/koza/io/writer/jsonl_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ def __init__(

os.makedirs(output_dir, exist_ok=True)
if node_properties:
self.nodes_file = open(f"{output_dir}/{source_name}_nodes.jsonl", "w")
self.nodeFH = open(f"{output_dir}/{source_name}_nodes.jsonl", "w")
if edge_properties:
self.edges_file = open(f"{output_dir}/{source_name}_edges.jsonl", "w")
self.edgeFH = open(f"{output_dir}/{source_name}_edges.jsonl", "w")

def write(self, entities: Iterable):

Expand All @@ -33,15 +33,15 @@ def write(self, entities: Iterable):
if nodes:
for n in nodes:
node = json.dumps(n, ensure_ascii=False)
self.nodes_file.write(node + '\n')
self.nodeFH.write(node + '\n')

if edges:
for e in edges:
edge = json.dumps(e, ensure_ascii=False)
self.edges_file.write(edge + '\n')
self.edgeFH.write(edge + '\n')

def finalize(self):
    """
    Close every node/edge output handle this writer has open.
    """
    # The handles are only opened when node/edge properties were given, so
    # any of them may be missing. Looking up and closing the same attribute
    # name avoids the 'edge_file' / 'edges_file' mismatch that left the edge
    # handle open. Both attribute spellings used by this writer are covered.
    for handle_name in ('nodes_file', 'edges_file', 'nodeFH', 'edgeFH'):
        handle = getattr(self, handle_name, None)
        if handle is not None:
            handle.close()
Loading