Merge pull request #11 from jcomedouteau/#2137-anonymise-patient-id-2
#2137 anonymise patient id using a new column
michaelkain authored Jan 20, 2025
2 parents 07552da + 5dbca9f commit 3843d66
Showing 3 changed files with 43 additions and 36 deletions.
1 change: 0 additions & 1 deletion anonymization_fields.tsv
@@ -14,7 +14,6 @@ Admitting Diagnoses Description (0008,1080)
 Derivation Description (0008,2111)
 Patient's religious Preference (0010,21F0)
 Patient's Birth Time (0010,0032)
-Other Patient Ids (0010,1000)
 Other Patient Names (0010,1001)
 Patient's Birth Name (0010,1005)
 Patient's Age (0010,1010)
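The dropped row matters because the script now writes the sequence id into Other Patient IDs directly (see shanoir_downloader_check.py below) instead of blanking it through this list. For reference, the remaining rows are applied by parsing each parenthesised tag into its group and element numbers; a minimal sketch of that lookup, assuming the same '(gggg,eeee)' code format and a hypothetical DICOM file name:

import pydicom

code = '(0010,1001)'  # e.g. Other Patient Names, as listed in anonymization_fields.tsv
group, element = [int('0x' + part, base=16) for part in code[1:-1].split(',')]
ds = pydicom.dcmread('example.dcm')  # hypothetical input file
if (group, element) in ds:           # blank the field only if it is present
    ds[group, element].value = ''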
8 changes: 4 additions & 4 deletions example_input_check.csv
@@ -1,4 +1,4 @@
-sequence_id,shanoir_name,series_description
-183900,0330085,FLAIR 3D SPACE SAG
-75656,4872,Sag CUBE flair 2nex 1.2mm
-75613,0720018,3D FLAIR 1mm ISO
+sequence_id,shanoir_name,series_description,patient_id
+183900,0330085,FLAIR 3D SPACE SAG,0330085
+75656,4872,Sag CUBE flair 2nex 1.2mm,4872
+75613,0720018,3D FLAIR 1mm ISO,0720018
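A minimal sketch of how the new patient_id column is picked up per row, mirroring the fallback used further down in shanoir_downloader_check.py (the file name is just the example above; this is an illustration, not part of the commit):

import pandas

all_datasets = pandas.read_csv('example_input_check.csv', dtype=str)
all_datasets.set_index('sequence_id', inplace=True)
for sequence_id, row in all_datasets.iterrows():
    patient_id = row['patient_id'] if 'patient_id' in row else None
    # Fall back to the sequence id when no patient_id is provided
    effective_id = patient_id if patient_id is not None else sequence_id
    print(sequence_id, effective_id)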
70 changes: 39 additions & 31 deletions shanoir_downloader_check.py
@@ -13,6 +13,8 @@
import pydicom
import pandas
import numpy as np
from pydicom import Dataset

import shanoir_downloader
from py7zr import pack_7zarchive, unpack_7zarchive

@@ -73,11 +75,15 @@ def rename_path(old_path, new_path):
old_path.rename(new_path)
return new_path

def anonymize_fields(anonymization_fields, dicom_files, dicom_output_path, sequence_id):
def anonymize_fields(anonymization_fields, dicom_files, dicom_output_path, sequence_id, patient_id, shanoir_name):
for dicom_file in dicom_files:
ds = pydicom.dcmread(str(dicom_file))
ds.PatientID = sequence_id # [(0x0010, 0x0010)]
ds.PatientName = sequence_id # [(0x0010, 0x0020)]
# [(0x0010, 0x0010)]
ds.PatientID = patient_id if patient_id is not None else sequence_id
# [(0x0010, 0x0020)]
ds.PatientName = patient_id if patient_id is not None else sequence_id
# Update Other Patient IDs
ds.OtherPatientIDs = sequence_id
for index, row in anonymization_fields.iterrows():
codes = row['Code'][1:-1].split(',')
codes = [int('0x'+code, base=16) for code in codes]
@@ -88,7 +94,8 @@ def anonymize_fields(anonymization_fields, dicom_files, dicom_output_path, seque
data_element.value = ''
except KeyError as e:
pass # If the key is not found: juste ignore anonymization
ds.save_as(dicom_output_path / dicom_file.name)
file_name = dicom_file.name.replace(shanoir_name, patient_id)
ds.save_as(dicom_output_path / file_name)
return

def replace_with_sequence_id(sequence_id, dataset, tag):
@@ -148,14 +155,14 @@ def download_datasets(args, config=None, all_datasets=None):
all_datasets = pandas.read_csv(args.dataset_ids, sep=',' if args.dataset_ids.endswith('.csv') else '\t', dtype=datasets_dtype)
else:
all_datasets = pandas.read_excel(args.dataset_ids, dtype=datasets_dtype)

gpg_recipient = args.gpg_recipient or (os.environ['gpg_recipient'] if 'gpg_recipient' in os.environ else None)

if gpg_recipient is None and not args.skip_encryption:
logging.info('Warning: skipping encryption since gpg_recipient is None (even though skip_encryption is False).')

output_folder = Path(config['output_folder'])

all_datasets.set_index('sequence_id', inplace=True)
# Drop duplicates
all_datasets = all_datasets[~all_datasets.index.duplicated(keep='first')]
@@ -168,7 +175,7 @@ def download_datasets(args, config=None, all_datasets=None):
all_datasets = all_datasets[all_datasets[column_name] != value]
except Exception as e:
sys.exit(f'Error while parsing skip_columns argument: {skip_column}\n {e}')

# Create missing_datasets and downloaded_datasets tsv files
missing_datasets_path = output_folder / f'missing_datasets.tsv' if args.missing_datasets is None else Path(args.missing_datasets)
downloaded_datasets_path = output_folder / f'downloaded_datasets.tsv' if args.downloaded_datasets is None else Path(args.downloaded_datasets)
@@ -196,7 +203,7 @@ def download_datasets(args, config=None, all_datasets=None):
# Download and process datasets until there are no more datasets to process
# (all the missing datasets are unrecoverable or tried more than args.max_tries times)
while len(datasets_to_download) > 0:

# datasets_to_download is all_datasets except those already downloaded and those missing which are unrecoverable
datasets_to_download = all_datasets[~all_datasets.index.isin(downloaded_datasets.index)]
datasets_max_tries = missing_datasets[missing_datasets['n_tries'] >= args.max_tries].index
@@ -216,12 +223,13 @@ def download_datasets(args, config=None, all_datasets=None):
if now.hour >= SHANOIR_SHUTDOWN_HOUR and now.hour < SHANOIR_AVAILABLE_HOUR:
future = datetime(now.year, now.month, now.day, SHANOIR_AVAILABLE_HOUR, 0)
time.sleep((future-now).total_seconds())

sequence_id = index
shanoir_name = row['shanoir_name'] if 'shanoir_name' in row else None
series_description = row['series_description'] if 'series_description' in row else None
patient_id = row['patient_id'] if 'patient_id' in row else None

logging.info(f'Downloading dataset {sequence_id} ({n}/{len(datasets_to_download)}), shanoir name: {shanoir_name}, series description: {series_description}')
logging.info(f'Downloading dataset {sequence_id} ({n}/{len(datasets_to_download)}), shanoir name: {shanoir_name}, series description: {series_description}, patient id: {patient_id}')
n += 1

# Create the destination folder for this dataset
@@ -242,7 +250,7 @@ def download_datasets(args, config=None, all_datasets=None):
except Exception as e:
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'unknown_http_error', str(e), raw_folder, args.unrecoverable_errors, missing_datasets_path)
continue

# List the downloaded zip files
zip_files = list(destination_folder.glob('*.zip'))

@@ -255,22 +263,22 @@ def download_datasets(args, config=None, all_datasets=None):

# Extract the zip file
dicom_zip = zip_files[0]

logging.info(f' Extracting {dicom_zip}...')
dicom_folder = destination_folder.parent / f'{sequence_id}' # dicom_zip.stem
dicom_folder.mkdir(exist_ok=True)
# shutil.unpack_archive(str(dicom_zip), str(dicom_folder))

with zipfile.ZipFile(str(dicom_zip), 'r') as zip_ref:
zip_ref.extractall(str(dicom_folder))

dicom_files = list(dicom_folder.glob('*.dcm'))

# Error if there are no dicom file found
if len(dicom_files) == 0:
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'nodicom', f'No DICOM file was found in the dicom directory {dicom_folder}.', raw_folder, args.unrecoverable_errors, missing_datasets_path)
continue

patient_name_in_dicom = None
series_description_in_dicom = None
verified = None
@@ -289,19 +297,19 @@ def download_datasets(args, config=None, all_datasets=None):
message = f'Shanoir name {shanoir_name} differs in dicom: {patient_name_in_dicom}'
logging.error(f'For dataset {sequence_id}: {message}')
verified = verified_datasets is not None and len(verified_datasets[(verified_datasets.shanoir_name == shanoir_name) & (verified_datasets.patient_name_in_dicom == patient_name_in_dicom)]) > 0
# missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'content_patient_name', f'Shanoir name {patient_name} differs in dicom: {ds.PatientName}', raw_folder, args.unrecoverable_errors, missing_datasets_path)
# continue
# missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'content_patient_name', f'Shanoir name {patient_name} differs in dicom: {ds.PatientName}', raw_folder, args.unrecoverable_errors, missing_datasets_path)
# continue

if series_description_in_dicom.replace(' ', '') != series_description.replace(' ', ''): # or if ds[0x0008, 0x103E].value != series_description:
message = f'Series description {series_description} differs in dicom: {series_description_in_dicom}'
logging.error(f'For dataset {sequence_id}: {message}')
verified = verified_datasets is not None and verified is not False and len(verified_datasets[(verified_datasets.index == sequence_id) & (verified_datasets.series_description == series_description) & (verified_datasets.series_description_in_dicom == series_description_in_dicom)]) > 0
# missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'content_series_description', f'Series description {series_description} differs in dicom: {ds.SeriesDescription}', raw_folder, args.unrecoverable_errors, missing_datasets_path)
# continue
# missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'content_series_description', f'Series description {series_description} differs in dicom: {ds.SeriesDescription}', raw_folder, args.unrecoverable_errors, missing_datasets_path)
# continue
except Exception as e:
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'content_read', f'Error while reading DICOM: {e}', raw_folder, args.unrecoverable_errors, missing_datasets_path)
continue

dicom_zip_to_encrypt = dicom_zip
anonymized_dicom_folder = None
final_output = dicom_zip
@@ -311,20 +319,20 @@ def download_datasets(args, config=None, all_datasets=None):
# Anonymize
anonymized_dicom_folder = dicom_folder.parent / f'{dicom_folder.name}_anonymized'
logging.info(f' Anonymizing dataset to {anonymized_dicom_folder}...')

# extraAnonymizationRules = {}
# extraAnonymizationRules[(0x0010, 0x0020)] = functools.partial(replace_with_sequence_id, sequence_id) # Patient ID
# extraAnonymizationRules[(0x0010, 0x0010)] = functools.partial(replace_with_sequence_id, sequence_id) # Patient's Name

try:
anonymized_dicom_folder.mkdir(exist_ok=True)
# import dicomanonymizer
# dicomanonymizer.anonymize(str(dicom_folder), str(anonymized_dicom_folder), extraAnonymizationRules, True)
anonymize_fields(anonymization_fields, dicom_files, anonymized_dicom_folder, sequence_id)
anonymize_fields(anonymization_fields, dicom_files, anonymized_dicom_folder, str(sequence_id), str(patient_id), str(shanoir_name))
except Exception as e:
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'anonymization_error', str(e), raw_folder, args.unrecoverable_errors, missing_datasets_path)
continue

# Zip the anonymized dicom file
dicom_zip_to_encrypt = anonymized_dicom_folder.parent / f'{anonymized_dicom_folder.name}.7z'
logging.info(f' Compressing dataset to {dicom_zip_to_encrypt}...')
@@ -333,7 +341,7 @@ def download_datasets(args, config=None, all_datasets=None):
except Exception as e:
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'zip_compression_error', str(e), raw_folder, args.unrecoverable_errors, missing_datasets_path)
continue

final_output = dicom_zip_to_encrypt

if not args.skip_encryption and gpg_recipient is not None:
@@ -348,12 +356,12 @@ def download_datasets(args, config=None, all_datasets=None):
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'encryption_error', str(e), raw_folder, args.unrecoverable_errors, missing_datasets_path)
continue
if return_code != 0:
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'encryption_error', str(e), raw_folder, args.unrecoverable_errors, missing_datasets_path)
missing_datasets = add_missing_dataset(missing_datasets, sequence_id, 'encryption_error', "Encryption error", raw_folder, args.unrecoverable_errors, missing_datasets_path)

final_output = encrypted_dicom_zip

# Remove and rename files

# Remove zip
# if not args.keep_intermediate_files:
# shutil.rmtree(dicom_zip)
@@ -374,11 +382,11 @@ def download_datasets(args, config=None, all_datasets=None):
rename_path(anonymized_dicom_folder, processed_folder / sequence_id / anonymized_dicom_folder.name)
if not args.skip_encryption:
rename_path(dicom_zip_to_encrypt, processed_folder / sequence_id / dicom_zip_to_encrypt.name)

# Remove dicom
if not args.keep_intermediate_files:
shutil.rmtree(dicom_folder)

# Remove downloaded_archive (which should be empty)
shutil.rmtree(destination_folder)

@@ -402,4 +410,4 @@ def download_datasets(args, config=None, all_datasets=None):
if args.dataset_ids and args.search_text:
print('Both --dataset_ids and --search_text arguments were provided. The --search_text argument will be ignored.')

download_datasets(args)
download_datasets(args)
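As a rough, standalone illustration of the new behaviour (not part of the commit), this is approximately what anonymize_fields now does to each extracted file, with purely illustrative ids and paths, and omitting the per-field blanking driven by anonymization_fields.tsv:

from pathlib import Path
import pydicom

sequence_id, shanoir_name, patient_id = '183900', '0330085', 'PAT-001'  # illustrative values
dicom_folder = Path('183900')                 # folder of extracted DICOM files
output_folder = Path('183900_anonymized')
output_folder.mkdir(exist_ok=True)

for dicom_file in dicom_folder.glob('*.dcm'):
    ds = pydicom.dcmread(str(dicom_file))
    # Patient's Name (0010,0010) and Patient ID (0010,0020) take the new patient_id,
    # falling back to the sequence id when none is given
    ds.PatientName = patient_id if patient_id is not None else sequence_id
    ds.PatientID = patient_id if patient_id is not None else sequence_id
    # Other Patient IDs (0010,1000) keeps the sequence id
    ds.OtherPatientIDs = sequence_id
    # Output files are renamed so the shanoir name no longer appears
    file_name = dicom_file.name.replace(shanoir_name, patient_id)
    ds.save_as(output_folder / file_name)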
