diff --git a/CHANGELOG.md b/CHANGELOG.md index 5acd2957..dbe9dc94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are: Referenced versions in headers are tagged on Github, in parentheses are for pypi. ## [vxx](https://github.com/pydicom/deid/tree/master) (master) + - Adding option to recursively replace sequences (0.1.34) - adding pylint to clean up code (0.1.33) - removing dependency that isn't used (simplejson) (0.1.31) - updating cleaner to use pixel array (0.1.30) diff --git a/deid/dicom/actions.py b/deid/dicom/actions.py index 6b37e2c2..466fad28 100644 --- a/deid/dicom/actions.py +++ b/deid/dicom/actions.py @@ -34,6 +34,8 @@ parse_value ) +from pydicom.dataset import Dataset +from pydicom.sequence import Sequence from .tags import ( add_tag, update_tag, @@ -41,6 +43,7 @@ remove_tag ) +import re # Actions @@ -61,6 +64,11 @@ def perform_action(dicom, action, item=None, fields=None, return_seen=False): value = action.get('value') # "suid" or "var:field" action = action.get('action') # "REPLACE" + # Validate the action + if action not in valid_actions: + bot.warning('%s in not a valid choice. Defaulting to blanked.' 
% action) + action = "BLANK" + # If there is an expander applied to field, we iterate over fields = expand_field_expression(field=field, dicom=dicom, @@ -68,13 +76,33 @@ def perform_action(dicom, action, item=None, fields=None, return_seen=False): # Keep track of fields we have seen seen = [] + + # An expanded field must END with that field + expanded_regexp = "__%s$" % field + for field in fields: seen.append(field) - dicom = _perform_action(dicom=dicom, - field=field, - item=item, - action=action, - value=value) + + # Handle top level field + _perform_action(dicom=dicom, + field=field, + item=item, + action=action, + value=value) + + + # Expand sequences + if item: + expanded_fields = [x for x in item if re.search(expanded_regexp, x)] + + # FieldA__FieldB + for expanded_field in expanded_fields: + _perform_expanded_action(dicom=dicom, + expanded_field=expanded_field, + item=item, + action=action, + value=value) + if return_seen: return dicom, seen return dicom @@ -86,13 +114,7 @@ def _perform_action(dicom, field, action, value=None, item=None): and perform_addition is typically done via an addition in a config Both result in a call to this function. If an action fails or is not done, None is returned, and the calling function should handle this. - ''' - if action not in valid_actions: - bot.warning('''%s in not a valid choice [%s]. 
def _find_expanded_elements(dicom, parents, field):
    '''collect every data element with keyword `field` that sits beneath the
       chain of parent sequences, looking inside all items of each sequence.

       Parameters
       ==========
       dicom: the Dataset (or Sequence) to start the search from
       parents: list of sequence keywords to step through, outermost first
       field: keyword of the data elements to return

       Returns
       =======
       a (possibly empty) list of matching data elements
    '''
    # Normalize the starting point to a list of datasets to look through
    if isinstance(dicom, Dataset):
        datasets = [dicom]
    elif isinstance(dicom, Sequence):
        datasets = [x for x in dicom if isinstance(x, Dataset)]
    else:
        return []

    # Step down one sequence per parent, keeping every item of every sequence
    for parent in parents:
        children = []
        for dataset in datasets:
            child = dataset.get(parent)
            if isinstance(child, Sequence):
                children += [x for x in child if isinstance(x, Dataset)]
        datasets = children

    return [element
            for dataset in datasets
            for element in dataset
            if element.keyword == field]


def _perform_expanded_action(dicom, expanded_field, action, value=None, item=None):
    '''akin to _perform_action, but we expect to be dealing with an expanded
       sequence, and need to step into the Dicom data structure.

       Add, jitter, and delete are currently not supported: only BLANK and
       REPLACE change anything, every other action leaves the data as is.

       The field is expected to have the format FieldA__FieldB (or deeper,
       FieldA__FieldB__FieldC) where the last one is where we want to do
       the replacement. The action is applied to the matching element of
       every item of the sequence(s), not just one of them, so that no
       item is left with its original value.

       Parameters
       ==========
       dicom: the Dataset (or Sequence) that holds the nested field
       expanded_field: the flattened name, parents joined with "__"
       action: the action to perform, e.g., "BLANK" or "REPLACE"
       value: the value from the recipe, handed to parse_value for REPLACE
       item: the lookup of identifiers, handed to parse_value for REPLACE

       Returns
       =======
       the result of _perform_action when the field is not nested and
       dicom is a Dataset, otherwise None (elements are updated in place).
    '''
    parts = expanded_field.split('__')
    parents = parts[:-1]
    field = parts[-1]

    # Not nested and we have the dataset itself: this is a regular action
    if not parents and isinstance(dicom, Dataset):
        return _perform_action(dicom=dicom,
                               field=field,
                               item=item,
                               action=action,
                               value=value)

    elements = _find_expanded_elements(dicom, parents, field)

    # Nothing to act on (missing sequence, empty sequence, or no such field)
    if not elements:
        return

    # Blank the value
    if action == "BLANK":
        for element in elements:
            if element.VR not in ['US', 'SS']:
                element.value = ''

    # Code the value with something in the response
    elif action == "REPLACE":

        # Parsed once, so all matching elements receive the same value
        value = parse_value(item, value, field)
        if value is not None:
            for element in elements:
                element.value = value

    # elif "KEEP" --> Do nothing. Keep the original
def extract_sequence(sequence, prefix=None):
    '''flatten a pydicom.sequence.Sequence into a single lookup dictionary.
       Each Dataset found in the sequence is unwrapped, and every one of its
       elements is handed to extract_item along with the prefix. For example,
       a FieldB nested under FieldA would come back as:

       {'FieldA__FieldB': '111111'}

       Results are merged with dict.update, so if the same key is produced
       more than once, the last one seen is the one that is kept.

       Parameters
       ==========
       sequence: the sequence to extract, should be pydicom.sequence.Sequence
       prefix: the parent name, passed on to extract_item
    '''
    flattened = {}
    for dataset in sequence:

        # Anything that isn't a Dataset can't be unwrapped, warn and move on
        if not isinstance(dataset, Dataset):
            bot.warning("Unrecognized type %s in extract sequences, skipping." % type(dataset))
            continue

        for element in dataset:
            flattened.update(extract_item(element, prefix=prefix))

    return flattened
+ skip: an optional list of fields to skip + expand_sequences: if True, expand values that are sequences. ''' if skip is None: skip = [] if not isinstance(skip, list): skip = [skip] + fields = dict() contenders = dicom.dir() for contender in contenders: @@ -116,9 +160,7 @@ def get_fields(dicom, skip=None, expand_sequences=True): # Adding expanded sequences if isinstance(value, Sequence) and expand_sequences is True: - sequence_fields = extract_sequence(value, prefix=contender) - for sf in sequence_fields: - fields[sf['key']] = sf['value'] + fields.update(extract_sequence(value, prefix=contender)) else: if value not in [None, ""]: if isinstance(value, bytes): diff --git a/deid/dicom/header.py b/deid/dicom/header.py index 515d8e52..ab701192 100644 --- a/deid/dicom/header.py +++ b/deid/dicom/header.py @@ -84,10 +84,14 @@ def get_shared_identifiers(dicom_files, # We will skip PixelData skip = config['skip'] for dicom_file in dicom_files: + dicom = read_file(dicom_file, force=True) + + # Get list of fields, expanded sequences are flattened fields = get_fields(dicom, skip=skip, expand_sequences=expand_sequences) + for key, val in fields.items(): # If it's there, only keep if the same @@ -125,6 +129,7 @@ def get_identifiers(dicom_files, config=None, expand_sequences=True, skip_fields=None): + ''' extract all identifiers from a dicom image. This function returns a lookup by file name @@ -133,14 +138,14 @@ def get_identifiers(dicom_files, dicom_files: the dicom file(s) to extract from force: force reading the file (default True) config: if None, uses default in provided module folder - expand_sequences: if True, expand sequences. otherwise, skips + expand_sequences: if True, expand sequences. 
Otherwise, skips skip_fields: if not None, added fields to skip ''' bot.debug('Extracting identifiers for %s dicom' %(len(dicom_files))) if config is None: - config = "%s/config.json" %(here) + config = "%s/config.json" % here if not os.path.exists(config): bot.error("Cannot find config %s, exiting" %(config)) @@ -165,8 +170,8 @@ def get_identifiers(dicom_files, ids[dicom_file] = dict() ids[dicom_file] = get_fields(dicom, - skip=skip, - expand_sequences=expand_sequences) + skip=skip, + expand_sequences=expand_sequences) return ids @@ -242,8 +247,9 @@ def replace_identifiers(dicom_files, remove_private=True): '''replace identifiers using pydicom, can be slow when writing - and saving new files''' - + and saving new files. If you want to replace sequences, they need + to be extracted with get_identifiers and expand_sequences to True. + ''' dicom_files, recipe, config = _prepare_replace_config(dicom_files, deid=deid, config=config) diff --git a/deid/dicom/tags.py b/deid/dicom/tags.py index 83473152..10d2d942 100644 --- a/deid/dicom/tags.py +++ b/deid/dicom/tags.py @@ -101,6 +101,12 @@ def _filter_tags(tags, idx, fields=None): def remove_sequences(dicom): + '''remove sequences from a dicom by removing the associated tag. 
+ + Parameters + ========== + dicom: the loaded dicom to remove sequences + ''' for field in dicom.dir(): if isinstance(dicom.get(field), Sequence): dicom = remove_tag(dicom, field) diff --git a/deid/main/__init__.py b/deid/main/__init__.py index d010fad5..5eb14233 100644 --- a/deid/main/__init__.py +++ b/deid/main/__init__.py @@ -45,6 +45,10 @@ def get_parser(): help="use verbose logging to debug.", default=False, action='store_true') + parser.add_argument('--version', dest="version", + help="print version and exit.", + default=False, action='store_true') + parser.add_argument("--outfolder", "-o", dest='outfolder', help="full path to save output, will use temporary folder if not specified", type=str, default=None) @@ -118,7 +122,7 @@ def main(): except: sys.exit(0) - if args.command == "version": + if args.command == "version" or args.version: print(__version__) sys.exit(0) diff --git a/deid/version.py b/deid/version.py index 0af89ba0..e0fa59f7 100644 --- a/deid/version.py +++ b/deid/version.py @@ -22,7 +22,7 @@ ''' -__version__ = "0.1.33" +__version__ = "0.1.34" AUTHOR = 'Vanessa Sochat' AUTHOR_EMAIL = 'vsochat@stanford.edu' NAME = 'deid' diff --git a/docs/_docs/examples/func-replace.md b/docs/_docs/examples/func-replace.md index a8b5f1a9..6dc6235f 100644 --- a/docs/_docs/examples/func-replace.md +++ b/docs/_docs/examples/func-replace.md @@ -37,11 +37,32 @@ from deid.dicom import get_identifiers items = get_identifiers(dicom_files) ``` -The function performs an action to generate a uid, but you can also use -it to communicate with databases, APIs, or do something like +If you want to expand sequences, ask for it: + +```python +items = get_identifiers(dicom_files, expand_sequences=True) +``` + +When you expand sequences, they are flattened out in the data structure. 
+For example: + + +```python + 'ReferencedImageSequence__ReferencedSOPClassUID': '111111111111111111', + 'ReferencedImageSequence__ReferencedSOPInstanceUID': '111111111111111', + 'ReferencedPerformedProcedureStepSequence__InstanceCreationDate': '22222222', + 'ReferencedPerformedProcedureStepSequence__InstanceCreationTime': '22222222', + 'ReferencedPerformedProcedureStepSequence__InstanceCreatorUID': 'xxxxxxx', + 'ReferencedPerformedProcedureStepSequence__ReferencedSOPClassUID': 'xxxxxxxxxx', + 'ReferencedPerformedProcedureStepSequence__ReferencedSOPInstanceUID': 'xxxxxxxx', +``` + +The function we will use for the example will perform an action to generate a uid, +but you can also use it to communicate with databases, APIs, or do something like save the original (and newly generated one) in some (IRB approvied) place ## The Deid Recipe + The process of updating header values means writing a series of actions in the deid recipe, in this folder the file [deid.dicom](deid.dicom) has the following content: @@ -141,16 +162,27 @@ for item in items: ## Replace identifiers We are ready to go! Now let's generate the cleaned files! It will output to a -temporary directory. +temporary directory. ```python cleaned_files = replace_identifiers(dicom_files=dicom_files, deid=recipe, ids=items) +``` + +If your data was extracted with sequences expanded, those +same sequences will be checked for cleaning, but only if you set `strip_sequences` +to False. +```python +cleaned_files = replace_identifiers(dicom_files=dicom_files, + deid=recipe, + ids=items, + strip_sequences=False) ``` -You can load in a cleaned file to see what was done +See [here](https://github.com/pydicom/deid/tree/master/examples/dicom/header-manipulation/func-sequence-replace) for the code for the sequences replacement example. Note that expansion of sequences is not currently supported for operations that remove or add a value (ADD, REMOVE, JITTER). +You can load in a cleaned file to see what was done. 
```python from pydicom import read_file diff --git a/docs/_docs/examples/func-sequence-replace.md b/docs/_docs/examples/func-sequence-replace.md new file mode 100644 index 00000000..4a92ea88 --- /dev/null +++ b/docs/_docs/examples/func-sequence-replace.md @@ -0,0 +1,172 @@ +--- +title: Header Sequence Manipulation +category: Examples +order: 4 +--- + +The code and files for this example can be found [here](https://github.com/pydicom/deid/tree/master/examples/dicom/header-manipulation/func-sequence-replace/). +For this example, we want to replace values that are nested (in sequences). +This operation is available for deid versions 0.1.34 and later, and currently +we support `REPLACE`, and `BLANK`. + +## Imports + +We first import the functions that we need + +```python +from deid.dicom import get_identifiers, replace_identifiers +from deid.config import DeidRecipe +``` + +We are using an MR.dcm that is provided in the example's folder linked above. + + +```python +dicom_files = ['MR.dcm'] +items = get_identifiers(dicom_files) +``` + +For each item (indexed by the dicom file name), sequences +are flattened out in the data structure. 
For example: + +```python + 'ReferencedImageSequence__ReferencedSOPClassUID': '111111111111111111', + 'ReferencedImageSequence__ReferencedSOPInstanceUID': '111111111111111', + 'ReferencedPerformedProcedureStepSequence__InstanceCreationDate': '22222222', + 'ReferencedPerformedProcedureStepSequence__InstanceCreationTime': '22222222', + 'ReferencedPerformedProcedureStepSequence__InstanceCreatorUID': 'xxxxxxx', + 'ReferencedPerformedProcedureStepSequence__ReferencedSOPClassUID': 'xxxxxxxxxx', + 'ReferencedPerformedProcedureStepSequence__ReferencedSOPInstanceUID': 'xxxxxxxx', +``` + +The function we will use for the example will perform an action to generate a uid, +but you can also use it to communicate with databases, APIs, or do something like +save the original (and newly generated one) in some (IRB approved) place + +## The Deid Recipe + +The process of updating header values means writing a series of actions +in the deid recipe, in this folder the file [deid.dicom](deid.dicom) has the +following content: + +Along with a deid.dicom that asks to `REPLACE` a field with a function: + +``` +FORMAT dicom + +%header + +REPLACE InstanceCreationDate func:generate_uid +``` + +Let's create an instance of our recipe: + +```python +# Create the DeidRecipe Instance from deid.dicom +recipe = DeidRecipe('deid.dicom') +``` + +Here are a few different ways to interact: + +```python +# To see an entire (raw in a dictionary) recipe just look at +recipe.deid + +# What is the format? 
+recipe.get_actions() + +[{'action': 'REPLACE', + 'field': 'InstanceCreationDate', + 'value': 'func:generate_uid'}] + +# We can filter to an action type (not useful here, we only have one type) +recipe.get_actions(action='REPLACE') + +# or we can filter to a field +recipe.get_actions(field='InstanceCreationDate') +[{'action': 'REPLACE', + 'field': 'InstanceCreationDate', + 'value': 'func:generate_uid'}] + +# and logically, both (not useful here) +recipe.get_actions(field='PatientID', action="REMOVE") +``` + +Our recipe instance is ready to go. From the above we are saying we want to replace the +`InstanceCreationDate` field with the output from the generate_uid function, +which is expected in the item dict. Let's write that next. + +## Write Your Function + +```python +def generate_uid(item, value, field): + '''This function will generate a uuid! You can expect it to be passed + the dictionary of items extracted from the dicom (and your function) + and variables, the original value (func:generate_uid) and the field + name you are applying it to. + ''' + import uuid + prefix = field.lower().replace(' ', " ") + return prefix + "-" + str(uuid.uuid4()) + +``` + +As stated in the docstring, you can expect it to be passed the dictionary of +items extracted from the dicom (and your function) and variables, the +original value (func:generate_uid) and the field name you are applying it to. + +## Update Your Items + +How do we update the items? Remember, the action is: + +``` +REPLACE InstanceCreationDate func:generate_uid +``` + +so the key for each item in items needs to be 'generate_uid." Just do this: + +```python +for item in items: + items[item]['generate_uid'] = generate_uid +``` + +## Replace identifiers +We are ready to go! Now let's generate the cleaned files! It will output to a +temporary directory. Since we want to replace nested sequences, we need to +set `strip_sequences` to False. 
+ + +```python +cleaned_files = replace_identifiers(dicom_files=dicom_files, + deid=recipe, + ids=items, + strip_sequences=False) +``` + +Note that expansion of sequences is not currently supported for operations +that remove or add a value (ADD, REMOVE, JITTER). +You can load in a cleaned file to see what was done (a cleaned file is provided +in the example folder): + +```python +from pydicom import read_file +cleaned = read_file(cleaned_files[0]) +print(cleaned) + +# cleaned (subset of changed) +(0008, 0012) Instance Creation Date DA: 'instancecreationdate-2ad6c7f6-2264-4f9d-a3f2-ead2cf438fe1' +... +# here is a nested sequence + (0008, 0012) Instance Creation Date DA: 'instancecreationdate-7fb93a26-b2fe-446a-8899-84ac7a1fc217' +``` + +You can write to a different output folder by setting `output_folder` in +the function, and use `overwrite` to specify if files should be overwritten +that already exist. + +Full code for the +example above is [available here](https://github.com/pydicom/deid/tree/master/examples/dicom/header-manipulation/func-sequence-replace/). diff --git a/docs/_docs/examples/index.md b/docs/_docs/examples/index.md index 9e04e5c9..662d16a9 100644 --- a/docs/_docs/examples/index.md +++ b/docs/_docs/examples/index.md @@ -22,10 +22,10 @@ want a quick start overview of deid. ## Header Manipulation - - [Frame of Reference]({{ site.baseurl }}/examples/func-replace/) shows how to dynamically replace or update header values from a function. + - [Replace with Function]({{ site.baseurl }}/examples/func-replace/) shows how to dynamically replace or update header values from a function. + - [Replace Sequences]({{ site.baseurl }}/examples/func-sequence-replace/) similar, but replacement of nested sequences. 
- [Header Expanders]({{ site.baseurl }}/examples/header-expanders/) can be used to select one or more header fields to apply an action to - ## Cleaning Pixels See an example of just "inspection" (flagging images based on criteria) or "clean" diff --git a/examples/dicom/header-manipulation/func-sequence-replace/MR.dcm b/examples/dicom/header-manipulation/func-sequence-replace/MR.dcm new file mode 100644 index 00000000..d7b401e7 Binary files /dev/null and b/examples/dicom/header-manipulation/func-sequence-replace/MR.dcm differ diff --git a/examples/dicom/header-manipulation/func-sequence-replace/cleaned.dcm b/examples/dicom/header-manipulation/func-sequence-replace/cleaned.dcm new file mode 100644 index 00000000..ce3960ab Binary files /dev/null and b/examples/dicom/header-manipulation/func-sequence-replace/cleaned.dcm differ diff --git a/examples/dicom/header-manipulation/func-sequence-replace/deid.dicom b/examples/dicom/header-manipulation/func-sequence-replace/deid.dicom new file mode 100644 index 00000000..60c8461f --- /dev/null +++ b/examples/dicom/header-manipulation/func-sequence-replace/deid.dicom @@ -0,0 +1,5 @@ +FORMAT dicom + +%header + +REPLACE InstanceCreationDate func:generate_uid diff --git a/examples/dicom/header-manipulation/func-sequence-replace/example.py b/examples/dicom/header-manipulation/func-sequence-replace/example.py new file mode 100644 index 00000000..82ca4b52 --- /dev/null +++ b/examples/dicom/header-manipulation/func-sequence-replace/example.py @@ -0,0 +1,40 @@ +from deid.dicom import get_identifiers, replace_identifiers +from deid.config import DeidRecipe + +# This is supported for deid.dicom version 0.1.34 + +# This dicom has nested InstanceCreationDate fields + +dicom_files = ['MR.dcm'] + +# They are extracted, and flattened in items +# 'ReferencedPerformedProcedureStepSequence__InstanceCreationDate': '20091124', + +items = get_identifiers(dicom_files) + +# Load in the recipe, we want to REPLACE InstanceCreationDate with a function + 
def generate_uid(item, value, field):
    '''This function will generate a uuid! You can expect it to be passed
       the dictionary of items extracted from the dicom (and your function)
       and variables, the original value (func:generate_uid) and the field
       name you are applying it to.

       Parameters
       ==========
       item: the dictionary of items extracted from the dicom (not used here)
       value: the original value from the recipe, func:generate_uid (not used)
       field: the name of the field the result is generated for

       Returns
       =======
       a string, the lowercased field name, a dash, and a random uuid4, e.g.,
       instancecreationdate-2ad6c7f6-2264-4f9d-a3f2-ead2cf438fe1
    '''
    import uuid

    # The field name is only lowercased, any other character is left as is
    prefix = field.lower()
    return prefix + "-" + str(uuid.uuid4())