Skip to content

Commit

Permalink
Merge pull request #27 from eriknovak/tests
Browse files Browse the repository at this point in the history
Add new and fix older tests
  • Loading branch information
eriknovak authored Dec 11, 2024
2 parents 26bdc85 + cdea9c5 commit 01e0adc
Show file tree
Hide file tree
Showing 11 changed files with 387 additions and 166 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/unittests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ on:
jobs:
test:
runs-on: ${{ matrix.os }}
continue-on-error: true
timeout-minutes: 30
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
Expand Down
6 changes: 5 additions & 1 deletion anonipy/anonymize/extractors/multi_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ def __init__(self, extractors: List[ExtractorInterface]):
extractors: The list of extractors to use.
"""
if len(extractors) == 0:
raise ValueError("At least one extractor must be provided.")
if not all(isinstance(e, ExtractorInterface) for e in extractors):
raise ValueError("All extractors must be instances of ExtractorInterface.")

self.extractors = extractors

Expand Down Expand Up @@ -125,7 +129,7 @@ def _merge_entities(
if len(extractor_outputs) == 0:
return []
if len(extractor_outputs) == 1:
return extractor_outputs[1]
return extractor_outputs[0][1]

joint_entities = self._filter_entities(
list(
Expand Down
6 changes: 5 additions & 1 deletion anonipy/anonymize/extractors/ner_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,11 @@ def __init__(
self.gliner_model = gliner_model
self.spacy_style = spacy_style
self.labels = self._prepare_labels(labels)
self.pipeline = self._prepare_pipeline()

with warnings.catch_warnings():
# TODO: remove once the GLiNER package includes the fix (improper file closing)
warnings.filterwarnings("ignore", category=ResourceWarning)
self.pipeline = self._prepare_pipeline()

def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]:
"""Extract the entities from the text.
Expand Down
164 changes: 164 additions & 0 deletions anonipy/anonymize/pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import os
import warnings
from typing import Union, List

from .extractors import ExtractorInterface, MultiExtractor
from .strategies import StrategyInterface
from ..utils.file_system import open_file, write_file


# =====================================
# Pipeline class
# =====================================


class Pipeline:
    """A class for anonymizing files using a pipeline of extractors and strategies.

    Every file found under the input directory is read, passed through the
    extractor, and the extracted entities are handed to the strategy. The
    anonymized text is written to the output directory under a generated
    name of the form ``file<N>_anony<ext>``.

    Examples:
        >>> from anonipy.anonymize.pipeline import Pipeline
        >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH)
        >>> strategy = RedactionStrategy()
        >>> pipeline = Pipeline(extractor, strategy)
        >>> pipeline.anonymize("/path/to/input_dir", "/path/to/output_dir", flatten=True)

    Attributes:
        extractor (ExtractorInterface, MultiExtractor): The extractor to use for
            entity extraction. A list passed to the constructor is wrapped in a
            MultiExtractor.
        strategy (StrategyInterface): The strategy to use for anonymization.

    Methods:
        anonymize(input_dir, output_dir, flatten=False):
            Anonymize files in the input directory and save the anonymized
            files to the output directory.

    """

    def __init__(
        self,
        extractor: Union[ExtractorInterface, MultiExtractor, List[ExtractorInterface]],
        strategy: StrategyInterface,
    ):
        """Initialize the pipeline.

        Examples:
            >>> from anonipy.anonymize.pipeline import Pipeline
            >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH)
            >>> strategy = RedactionStrategy()
            >>> pipeline = Pipeline(extractor, strategy)

        Args:
            extractor: The extractor to use for entity extraction. Either a
                single extractor, a MultiExtractor, or a list of extractors
                (which is wrapped in a MultiExtractor).
            strategy: The strategy to use for anonymization.

        Raises:
            ValueError: If the extractor or the strategy is not of a supported type.

        """

        if isinstance(extractor, (ExtractorInterface, MultiExtractor)):
            self.extractor = extractor
        elif isinstance(extractor, list):
            # The list itself is validated by the MultiExtractor constructor.
            self.extractor = MultiExtractor(extractor)
        else:
            raise ValueError(
                "Extractor must be an ExtractorInterface or a list of ExtractorInterface."
            )

        if not isinstance(strategy, StrategyInterface):
            raise ValueError("Strategy must be a StrategyInterface.")

        self.strategy = strategy

    def anonymize(self, input_dir: str, output_dir: str, flatten: bool = False) -> dict:
        """Anonymize files in the input directory and save the anonymized files to the output directory.

        Files that cannot be read, are empty, yield no entities, or raise an
        error while being processed are skipped with a warning.

        Args:
            input_dir: The path to the input directory containing files to be anonymized.
            output_dir: The path to the output directory where anonymized files will be saved.
            flatten: Whether to flatten the output directory structure. Defaults to False.

        Raises:
            ValueError: If the input directory does not exist or if the input and output directories are the same.

        Returns:
            A dictionary mapping the original file paths to the anonymized file paths.
            Both sides are prefixed with the last component of the respective directory.

        """

        if not os.path.exists(input_dir):
            raise ValueError(f"Input directory '{input_dir}' does not exist.")

        output_abs = os.path.abspath(output_dir)
        if os.path.abspath(input_dir) == output_abs:
            raise ValueError("Input and output directories cannot be the same.")

        os.makedirs(output_dir, exist_ok=True)

        # normpath strips trailing separators (and normalizes "/" on Windows),
        # so the label is the directory name even for paths like "data/in/".
        input_label = os.path.basename(os.path.normpath(input_dir))
        output_label = os.path.basename(os.path.normpath(output_dir))

        anonymized_files_count = 1
        file_name_mapping = {}

        for root, dirs, files in os.walk(input_dir):
            # Never descend into the output directory when it is nested inside
            # the input directory; otherwise the files written by this run
            # would be picked up and anonymized again.
            dirs[:] = sorted(
                d for d in dirs if os.path.abspath(os.path.join(root, d)) != output_abs
            )

            # Sorted so that the generated file numbering is reproducible.
            for file_name in sorted(files):
                file_path = os.path.join(root, file_name)

                try:
                    anonymized_text = self._anonymize_file(file_path)
                except Exception as e:
                    # Best effort: one problematic file must not stop the run.
                    warnings.warn(f"Problems while processing file {file_path}: {e}")
                    continue

                if anonymized_text is None:
                    continue

                _, ext = os.path.splitext(file_name)
                output_file_name = f"file{anonymized_files_count}_anony{ext}"
                anonymized_files_count += 1

                relative_path = os.path.relpath(file_path, input_dir)

                if flatten:
                    output_file_path = os.path.join(output_dir, output_file_name)
                else:
                    # Mirror the sub-directory layout of the input directory.
                    output_file_path = os.path.join(
                        output_dir, os.path.dirname(relative_path), output_file_name
                    )
                    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

                write_file(anonymized_text, output_file_path)

                file_path_before = os.path.join(input_label, relative_path)
                file_path_after = os.path.relpath(output_file_path, output_dir)
                file_name_mapping[file_path_before] = os.path.join(
                    output_label, file_path_after
                )

        return file_name_mapping

    def _anonymize_file(self, file_path: str) -> Union[str, None]:
        """Anonymize a single file.

        Args:
            file_path: The path to the file to be anonymized.

        Returns:
            The anonymized text or None if the file is empty or if entity extraction fails.

        """

        original_text = open_file(file_path)
        if original_text is None or not original_text.strip():
            warnings.warn(
                f"Skipping file {file_path}: Failed to read or file is empty."
            )
            return None

        _, entities = self.extractor(original_text)

        # Covers both a None result and an empty list of entities.
        if not entities:
            warnings.warn(
                f"Skipping file {file_path}: Entity extraction returned None."
            )
            return None

        anonymized_text, _ = self.strategy.anonymize(original_text, entities)

        return anonymized_text
149 changes: 0 additions & 149 deletions anonipy/pipeline.py

This file was deleted.

22 changes: 22 additions & 0 deletions test/resources/subfolder/example02.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Medical Record

Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789

Examination Procedure:

John Doe underwent a routine physical examination. The procedure included measuring vital signs
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
test. The patient also reported occasional headaches and dizziness, prompting a neurological
assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.

Next Examination Date:

15-11-2024
Loading

0 comments on commit 01e0adc

Please sign in to comment.