Merge pull request #27 from eriknovak/tests

Add new and fix older tests
eriknovak · Dec 11, 2024 · 01e0adc · 01e0adc
2 parents 26bdc85 + cdea9c5
commit 01e0adc
Show file tree

Hide file tree

Showing 11 changed files with 387 additions and 166 deletions.
diff --git a/.github/workflows/unittests.yaml b/.github/workflows/unittests.yaml
@@ -9,6 +9,8 @@ on:
 jobs:
   test:
     runs-on: ${{ matrix.os }}
+    continue-on-error: true
+    timeout-minutes: 30
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]

diff --git a/anonipy/anonymize/extractors/multi_extractor.py b/anonipy/anonymize/extractors/multi_extractor.py
@@ -59,6 +59,10 @@ def __init__(self, extractors: List[ExtractorInterface]):
             extractors: The list of extractors to use.
 
         """
+        if len(extractors) == 0:
+            raise ValueError("At least one extractor must be provided.")
+        if not all(isinstance(e, ExtractorInterface) for e in extractors):
+            raise ValueError("All extractors must be instances of ExtractorInterface.")
 
         self.extractors = extractors
 
@@ -125,7 +129,7 @@ def _merge_entities(
         if len(extractor_outputs) == 0:
             return []
         if len(extractor_outputs) == 1:
-            return extractor_outputs[1]
+            return extractor_outputs[0][1]
 
         joint_entities = self._filter_entities(
             list(

diff --git a/anonipy/anonymize/extractors/ner_extractor.py b/anonipy/anonymize/extractors/ner_extractor.py
@@ -86,7 +86,11 @@ def __init__(
         self.gliner_model = gliner_model
         self.spacy_style = spacy_style
         self.labels = self._prepare_labels(labels)
-        self.pipeline = self._prepare_pipeline()
+
+        with warnings.catch_warnings():
+            # TODO: remove once the GLiNER package includes the fix (improper file closing)
+            warnings.filterwarnings("ignore", category=ResourceWarning)
+            self.pipeline = self._prepare_pipeline()
 
     def __call__(self, text: str, *args, **kwargs) -> Tuple[Doc, List[Entity]]:
         """Extract the entities from the text.

diff --git a/anonipy/anonymize/pipeline.py b/anonipy/anonymize/pipeline.py
@@ -0,0 +1,164 @@
+import os
+import warnings
+from typing import Union, List
+
+from .extractors import ExtractorInterface, MultiExtractor
+from .strategies import StrategyInterface
+from ..utils.file_system import open_file, write_file
+
+
+# =====================================
+# Pipeline class
+# =====================================
+
+
+class Pipeline:
+    """A class for anonymizing files using a pipeline of extractors and strategies.
+
+    Examples:
+        >>> from anonipy.anonymize.pipeline import Pipeline
+        >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH)
+        >>> strategy = RedactionStrategy()
+        >>> pipeline = Pipeline(extractor, strategy)
+        >>> pipeline.anonymize("/path/to/input_dir", "/path/to/output_dir", flatten=True)
+
+    Attributes:
+        extractor (ExtractorInterface, MultiExtractor, List[ExtractorInterface]): The extractor to use for entity extraction.
+        strategy (StrategyInterface): The strategy to use for anonymization.
+
+    Methods:
+        anonymize(input_dir, output_dir, flatten=False):
+            Anonymize files in the input directory and save the anonymized files to the output directory.
+
+    """
+
+    def __init__(
+        self,
+        extractor: Union[ExtractorInterface, MultiExtractor, List[ExtractorInterface]],
+        strategy: StrategyInterface,
+    ):
+        """Initialize the pipeline.
+
+        Examples:
+            >>> from anonipy.anonymize.pipeline import Pipeline
+            >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH)
+            >>> strategy = RedactionStrategy()
+            >>> pipeline = Pipeline(extractor, strategy)
+
+        Args:
+            extractor: The extractor to use for entity extraction.
+            strategy: The strategy to use for anonymization.
+
+        """
+
+        if isinstance(extractor, ExtractorInterface) or isinstance(
+            extractor, MultiExtractor
+        ):
+            self.extractor = extractor
+        elif isinstance(extractor, list):
+            self.extractor = MultiExtractor(extractor)
+        else:
+            raise ValueError(
+                "Extractor must be an ExtractorInterface or a list of ExtractorInterface."
+            )
+
+        if not isinstance(strategy, StrategyInterface):
+            raise ValueError("Strategy must be a StrategyInterface.")
+
+        self.strategy = strategy
+
+    def anonymize(self, input_dir: str, output_dir: str, flatten: bool = False) -> dict:
+        """Anonymize files in the input directory and save the anonymized files to the output directory.
+
+        Args:
+            input_dir: The path to the input directory containing files to be anonymized.
+            output_dir: The path to the output directory where anonymized files will be saved.
+            flatten: Whether to flatten the output directory structure. Defaults to False.
+
+        Raises:
+            ValueError: If the input directory does not exist or if the input and output directories are the same.
+
+        Returns:
+            A dictionary mapping the original file paths to the anonymized file paths.
+
+        """
+
+        if not os.path.exists(input_dir):
+            raise ValueError(f"Input directory '{input_dir}' does not exist.")
+
+        if os.path.abspath(input_dir) == os.path.abspath(output_dir):
+            raise ValueError("Input and output directories cannot be the same.")
+
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir, exist_ok=True)
+
+        anonymized_files_count = 1
+        file_name_mapping = {}
+
+        for root, _, files in os.walk(input_dir):
+            for file_name in files:
+                file_path = os.path.join(root, file_name)
+
+                try:
+                    anonymized_text = self._anonymize_file(file_path)
+                    if anonymized_text is None:
+                        continue
+                except Exception as e:
+                    warnings.warn(f"Problems while processing file {file_path}: {e}")
+                    continue
+
+                _, ext = os.path.splitext(file_name)
+                output_file_name = f"file{anonymized_files_count}_anony{ext}"
+                anonymized_files_count += 1
+
+                relative_path = os.path.relpath(file_path, input_dir)
+
+                if flatten:
+                    output_file_path = os.path.join(output_dir, output_file_name)
+                else:
+                    output_file_path = os.path.join(
+                        output_dir, os.path.dirname(relative_path), output_file_name
+                    )
+                    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
+
+                write_file(anonymized_text, output_file_path)
+
+                file_path_before = os.path.join(
+                    input_dir.split(os.sep)[-1], relative_path
+                )
+                file_path_after = os.path.relpath(output_file_path, output_dir)
+                file_name_mapping[file_path_before] = os.path.join(
+                    output_dir.split(os.sep)[-1], file_path_after
+                )
+
+        return file_name_mapping
+
+    def _anonymize_file(self, file_path: str) -> Union[str, None]:
+        """Anonymize a single file.
+
+        Args:
+            file_path: The path to the file to be anonymized.
+
+        Returns:
+            The anonymized text or None if the file is empty or if entity extraction fails.
+
+        """
+
+        original_text = open_file(file_path)
+        if original_text is None or not original_text.strip():
+            warnings.warn(
+                f"Skipping file {file_path}: Failed to read or file is empty."
+            )
+            return None
+
+        _, entities = self.extractor(original_text)
+
+        if not entities:
+            warnings.warn(
+                f"Skipping file {file_path}: Entity extraction returned None."
+            )
+            return None
+
+        anonymized_text, _ = self.strategy.anonymize(original_text, entities)
+
+        return anonymized_text
diff --git a/anonipy/pipeline.py b/anonipy/pipeline.py
diff --git a/test/resources/subfolder/example02.txt b/test/resources/subfolder/example02.txt
@@ -0,0 +1,22 @@
+Medical Record
+
+Patient Name: John Doe
+Date of Birth: 15-01-1985
+Date of Examination: 20-05-2024
+Social Security Number: 123-45-6789
+
+Examination Procedure:
+
+John Doe underwent a routine physical examination. The procedure included measuring vital signs
+(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
+test. The patient also reported occasional headaches and dizziness, prompting a neurological
+assessment and an MRI scan to rule out any underlying issues.
+
+Medication Prescribed:
+
+Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
+Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
+
+Next Examination Date:
+
+15-11-2024