-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #27 from eriknovak/tests
Add new and fix older tests
- Loading branch information
Showing
11 changed files
with
387 additions
and
166 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
import os | ||
import warnings | ||
from typing import Union, List | ||
|
||
from .extractors import ExtractorInterface, MultiExtractor | ||
from .strategies import StrategyInterface | ||
from ..utils.file_system import open_file, write_file | ||
|
||
|
||
# ===================================== | ||
# Pipeline class | ||
# ===================================== | ||
|
||
|
||
class Pipeline:
    """Anonymize every file in a directory tree using an extractor and a strategy.

    Examples:
        >>> from anonipy.pipeline import Pipeline
        >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH)
        >>> strategy = RedactionStrategy()
        >>> pipeline = Pipeline(extractor, strategy)
        >>> pipeline.anonymize("/path/to/input_dir", "/path/to/output_dir", flatten=True)

    Attributes:
        extractor (ExtractorInterface, MultiExtractor): The extractor used for
            entity extraction. A list passed to the constructor is wrapped in
            a `MultiExtractor`.
        strategy (StrategyInterface): The strategy used for anonymization.

    Methods:
        anonymize(input_dir, output_dir, flatten=False):
            Anonymize files in the input directory and save the anonymized
            files to the output directory.

    """

    def __init__(
        self,
        extractor: "Union[ExtractorInterface, MultiExtractor, List[ExtractorInterface]]",
        strategy: "StrategyInterface",
    ):
        """Initialize the pipeline.

        Examples:
            >>> from anonipy.pipeline import Pipeline
            >>> extractor = NERExtractor(labels, lang=LANGUAGES.ENGLISH)
            >>> strategy = RedactionStrategy()
            >>> pipeline = Pipeline(extractor, strategy)

        Args:
            extractor: The extractor to use for entity extraction. A list of
                extractors is wrapped in a `MultiExtractor`.
            strategy: The strategy to use for anonymization.

        Raises:
            ValueError: If `extractor` is neither an extractor nor a list, or
                if `strategy` is not a `StrategyInterface`.

        """

        if isinstance(extractor, (ExtractorInterface, MultiExtractor)):
            self.extractor = extractor
        elif isinstance(extractor, list):
            self.extractor = MultiExtractor(extractor)
        else:
            raise ValueError(
                "Extractor must be an ExtractorInterface or a list of ExtractorInterface."
            )

        if not isinstance(strategy, StrategyInterface):
            raise ValueError("Strategy must be a StrategyInterface.")

        self.strategy = strategy

    def anonymize(self, input_dir: str, output_dir: str, flatten: bool = False) -> dict:
        """Anonymize files in the input directory and save them to the output directory.

        Files are visited in sorted order, so the `file<N>_anony<ext>` names
        are reproducible between runs. Files for which `_anonymize_file`
        returns None or raises are skipped (a warning is emitted for raised
        exceptions) and do not consume a number.

        Args:
            input_dir: The path to the input directory containing files to be anonymized.
            output_dir: The path to the output directory where anonymized files will be saved.
            flatten: Whether to flatten the output directory structure. Defaults to False.

        Raises:
            ValueError: If the input path is not an existing directory or if
                the input and output directories are the same.

        Returns:
            A dictionary mapping the original file paths to the anonymized
            file paths. Keys are prefixed with the input directory's name and
            values with the output directory's name.

        """

        # `isdir` rather than `exists`: a regular file is not a valid input tree.
        if not os.path.isdir(input_dir):
            raise ValueError(f"Input directory '{input_dir}' does not exist.")

        output_root = os.path.abspath(output_dir)
        if os.path.abspath(input_dir) == output_root:
            raise ValueError("Input and output directories cannot be the same.")

        os.makedirs(output_dir, exist_ok=True)

        input_label = self._dir_label(input_dir)
        output_label = self._dir_label(output_dir)

        next_index = 1
        file_name_mapping = {}

        for root, dirs, files in os.walk(input_dir):
            # Editing `dirs` in place steers os.walk: skip the output directory
            # when it is nested inside the input tree (otherwise freshly written
            # files would be anonymized again) and fix the traversal order.
            dirs[:] = sorted(
                name
                for name in dirs
                if os.path.abspath(os.path.join(root, name)) != output_root
            )

            for file_name in sorted(files):
                file_path = os.path.join(root, file_name)

                try:
                    anonymized_text = self._anonymize_file(file_path)
                except Exception as e:
                    # Best effort: one unreadable file must not abort the run.
                    warnings.warn(f"Problems while processing file {file_path}: {e}")
                    continue

                if anonymized_text is None:
                    continue

                _, ext = os.path.splitext(file_name)
                output_file_name = f"file{next_index}_anony{ext}"
                next_index += 1

                relative_path = os.path.relpath(file_path, input_dir)

                if flatten:
                    output_file_path = os.path.join(output_dir, output_file_name)
                else:
                    # Mirror the input sub-directory layout below output_dir.
                    output_file_path = os.path.join(
                        output_dir, os.path.dirname(relative_path), output_file_name
                    )
                    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

                write_file(anonymized_text, output_file_path)

                file_path_before = os.path.join(input_label, relative_path)
                file_path_after = os.path.relpath(output_file_path, output_dir)
                file_name_mapping[file_path_before] = os.path.join(
                    output_label, file_path_after
                )

        return file_name_mapping

    @staticmethod
    def _dir_label(path: str) -> str:
        """Return the last component of a directory path.

        The path is normalized first, so a trailing separator (``"data/in/"``)
        still yields ``"in"`` instead of an empty string.

        Args:
            path: The directory path.

        Returns:
            The name of the directory the path points to.

        """

        return os.path.basename(os.path.normpath(path))

    def _anonymize_file(self, file_path: str) -> Union[str, None]:
        """Anonymize a single file.

        Args:
            file_path: The path to the file to be anonymized.

        Returns:
            The anonymized text, or None if the file could not be read, is
            empty or whitespace-only, or if no entities were extracted.

        """

        original_text = open_file(file_path)
        if original_text is None or not original_text.strip():
            warnings.warn(
                f"Skipping file {file_path}: Failed to read or file is empty."
            )
            return None

        # The extractor returns a pair; only the second item (entities) is used.
        _, entities = self.extractor(original_text)

        # Covers both None and an empty collection of entities.
        if not entities:
            warnings.warn(
                f"Skipping file {file_path}: Entity extraction returned None."
            )
            return None

        anonymized_text, _ = self.strategy.anonymize(original_text, entities)

        return anonymized_text
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Medical Record | ||
|
||
Patient Name: John Doe | ||
Date of Birth: 15-01-1985 | ||
Date of Examination: 20-05-2024 | ||
Social Security Number: 123-45-6789 | ||
|
||
Examination Procedure: | ||
|
||
John Doe underwent a routine physical examination. The procedure included measuring vital signs | ||
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress | ||
test. The patient also reported occasional headaches and dizziness, prompting a neurological | ||
assessment and an MRI scan to rule out any underlying issues. | ||
|
||
Medication Prescribed: | ||
|
||
Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief. | ||
Lisinopril 10 mg Take one tablet daily to manage high blood pressure. | ||
|
||
Next Examination Date: | ||
|
||
15-11-2024 |
Oops, something went wrong.