Merge pull request pydicom#106 from pydicom/add/sequence-replacement

Add/sequence replacement
sammaxwellxyz · Jun 24, 2019 · 2b69ddb · 2b69ddb
2 parents a2b402d + ebe0907
commit 2b69ddb
Show file tree

Hide file tree

Showing 14 changed files with 438 additions and 48 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
 Referenced versions in headers are tagged on Github, in parentheses are for pypi.
 
 ## [vxx](https://github.com/pydicom/deid/tree/master) (master)
+ - Adding option to recursively replace sequences (0.1.34)
  - adding pylint to clean up code (0.1.33)
  - removing dependency that isn't used (simplejson) (0.1.31)
  - updating cleaner to use pixel array (0.1.30)

diff --git a/deid/dicom/actions.py b/deid/dicom/actions.py
@@ -34,13 +34,16 @@
     parse_value
 )
 
+from pydicom.dataset import Dataset
+from pydicom.sequence import Sequence
 from .tags import (
     add_tag,
     update_tag,
     blank_tag,
     remove_tag
 )
 
+import re
 
 # Actions
 
@@ -61,20 +64,45 @@ def perform_action(dicom, action, item=None, fields=None, return_seen=False):
     value = action.get('value')   # "suid" or "var:field"
     action = action.get('action') # "REPLACE"
 
+    # Validate the action
+    if action not in valid_actions:
+        bot.warning('%s in not a valid choice. Defaulting to blanked.' % action)
+        action = "BLANK"
+
     # If there is an expander applied to field, we iterate over
     fields = expand_field_expression(field=field,
                                      dicom=dicom,
                                      contenders=fields)
 
     # Keep track of fields we have seen
     seen = []    
+
+    # An expanded field must END with that field
+    expanded_regexp = "__%s$" % field
+
     for field in fields:
         seen.append(field)
-        dicom = _perform_action(dicom=dicom,
-                                field=field,
-                                item=item,
-                                action=action,
-                                value=value)
+
+        # Handle top level field
+        _perform_action(dicom=dicom,
+                        field=field,
+                        item=item,
+                        action=action,
+                        value=value)
+
+
+    # Expand sequences
+    if item:
+        expanded_fields = [x for x in item if re.search(expanded_regexp, x)]
+
+        # FieldA__FieldB
+        for expanded_field in expanded_fields:
+            _perform_expanded_action(dicom=dicom,
+                                     expanded_field=expanded_field,
+                                     item=item,
+                                     action=action,
+                                     value=value)
+
     if return_seen:
         return dicom, seen
     return dicom
@@ -86,13 +114,7 @@ def _perform_action(dicom, field, action, value=None, item=None):
        and perform_addition is typically done via an addition in a config
        Both result in a call to this function. If an action fails or is not
        done, None is returned, and the calling function should handle this.
-    '''
-    if action not in valid_actions:
-        bot.warning('''%s in not a valid choice [%s]. 
-                       Defaulting to blanked.''' %(action,
-                                               ".".join(valid_actions)))
-        action = "BLANK"
-
+    '''    
     if field in dicom and action != "ADD":
 
         # Blank the value
@@ -109,7 +131,7 @@ def _perform_action(dicom, field, action, value=None, item=None):
                                    field=field,
                                    value=value)
             else:
-                bot.warning("REPLACE %s unsuccessful" %field)
+                bot.warning("REPLACE %s unsuccessful" % field)
 
         # Code the value with something in the response
         elif action == "JITTER":
@@ -134,8 +156,68 @@ def _perform_action(dicom, field, action, value=None, item=None):
         if value is not None:
             dicom = add_tag(dicom, field, value, quiet=True) 
 
-    return dicom
 
+def _perform_expanded_action(dicom, expanded_field, action, value=None, item=None):
+    '''akin to _perform_action, but we expect to be dealing with an expanded
+       sequence, and need to step into the Dicom data structure. 
+
+       Add, jitter, and delete are currently not supported.
+
+       The field is expected to have the format FieldA__FieldB where the
+       last one is where we want to do the replacement.
+    '''
+    field = expanded_field.split('__')[-1]
+
+    while field != expanded_field:
+        next_field, expanded_field = expanded_field.split('__', 1)
+
+        # Case 1: we have a Dataset
+        if isinstance(dicom, Dataset):
+            dicom = dicom.get(next_field)
+
+        elif isinstance(dicom, Sequence):
+            for sequence in dicom:
+                for subitem in sequence:
+                    if subitem.keyword == next_field:
+                        dicom = subitem
+                        break
+
+    # Field should be equal to expanded_field, and in dicom
+    if isinstance(dicom, Dataset):
+        return _perform_action(dicom=dicom,
+                               field=field,
+                               item=item,
+                               action=action,
+                               value=value)
+
+    elif isinstance(dicom, Sequence):
+        for sequence in dicom:
+            for subitem in sequence:
+                if subitem.keyword == field:
+                    dicom = subitem
+                    break
+
+    if not dicom:
+        return
+
+    # Not sure if this is possible
+    if dicom.keyword != field:
+        bot.warning('Early return, looking for %s, found %s' %(field, dicom.keyword))
+        return
+
+    # Blank the value
+    if action == "BLANK":
+        if dicom.VR not in ['US', 'SS']:
+            dicom.value = ''
+
+    # Code the value with something in the response
+    elif action == "REPLACE":
+
+        value = parse_value(item, value, field)
+        if value is not None:
+            dicom.value = value
+
+    # elif "KEEP" --> Do nothing. Keep the original
 
 # Timestamps
 

diff --git a/deid/dicom/fields.py b/deid/dicom/fields.py
@@ -22,30 +22,67 @@
 
 '''
 
+from deid.logger import bot
 from pydicom.sequence import Sequence
-from pydicom.dataset import RawDataElement
-
+from pydicom.dataset import (
+    RawDataElement, 
+    Dataset
+)
 import re
 
+def extract_item(item, prefix=None, entry=None):
+    '''a helper function for extract_sequence that extracts the value of
+       a single sequence item, recursing if that value is itself a sequence.
+
+       Parameters
+       ==========
+       item: an item from a sequence.
+    '''
+    # First call, we define entry to be a lookup dictionary
+    if entry is None:
+        entry = {}
+
+    # Skip raw data elements
+    if not isinstance(item, RawDataElement):
+        header = item.keyword
+
+        # If there is no header or field, we can't evaluate
+        if header in [None, '']:
+            return entry
+
+        if prefix is not None:
+            header = "%s__%s" %(prefix, header)  
+
+        value = item.value
+        if isinstance(value, bytes):
+            value = value.decode('utf-8')
+        if isinstance(value, Sequence):
+            return extract_sequence(value, prefix=header)
+
+        entry[header] = value
+    return entry
+
 def extract_sequence(sequence, prefix=None):
     '''return a pydicom.sequence.Sequence recursively
-       as a list of dictionary items
+       as a flattened dictionary of items. For example, a nested FieldA and FieldB
+       would return as:
+
+       {'FieldA__FieldB': '111111'}
+
+       Parameters
+       ==========
+       sequence: the sequence to extract, should be pydicom.sequence.Sequence
+       prefix: the parent name
     '''
-    items = []
+    items = {}
     for item in sequence:
-        for _, val in item.items():
-            if not isinstance(val, RawDataElement):
-                header = val.keyword
-                if prefix is not None:
-                    header = "%s__%s" %(prefix, header)  
-                value = val.value
-                if isinstance(value, bytes):
-                    value = value.decode('utf-8')
-                if isinstance(value, Sequence):
-                    items += extract_sequence(value, prefix=header)
-                    continue
-                entry = {"key": header, "value": value}
-                items.append(entry)
+
+        # If it's a Dataset, we need to further unwrap it
+        if isinstance(item, Dataset):
+            for subitem in item:
+                items.update(extract_item(subitem, prefix=prefix))          
+        else:
+            bot.warning("Unrecognized type %s in extract sequences, skipping." % type(item))
     return items
 
 
@@ -100,11 +137,18 @@ def expand_field_expression(field, dicom, contenders=None):
 def get_fields(dicom, skip=None, expand_sequences=True):
     '''get fields is a simple function to extract a dictionary of fields
        (non empty) from a dicom file.
+
+       Parameters
+       ==========
+       dicom: the dicom file to get fields for.
+       skip: an optional list of fields to skip
+       expand_sequences: if True, expand values that are sequences.
     '''    
     if skip is None:
         skip = []
     if not isinstance(skip, list):
         skip = [skip]
+
     fields = dict()
     contenders = dicom.dir()
     for contender in contenders:
@@ -116,9 +160,7 @@ def get_fields(dicom, skip=None, expand_sequences=True):
 
             # Adding expanded sequences
             if isinstance(value, Sequence) and expand_sequences is True:
-                sequence_fields = extract_sequence(value, prefix=contender)
-                for sf in sequence_fields:
-                    fields[sf['key']] = sf['value']
+                fields.update(extract_sequence(value, prefix=contender))
             else:
                 if value not in [None, ""]:
                     if isinstance(value, bytes):

diff --git a/deid/dicom/header.py b/deid/dicom/header.py
@@ -84,10 +84,14 @@ def get_shared_identifiers(dicom_files,
     # We will skip PixelData
     skip = config['skip']
     for dicom_file in dicom_files:
+
         dicom = read_file(dicom_file, force=True)
+
+        # Get dictionary of fields; expanded sequences are flattened
         fields = get_fields(dicom,
                             skip=skip,
                             expand_sequences=expand_sequences)
+
         for key, val in fields.items():
 
             # If it's there, only keep if the same
@@ -125,6 +129,7 @@ def get_identifiers(dicom_files,
                     config=None,
                     expand_sequences=True,
                     skip_fields=None):
+
     ''' extract all identifiers from a dicom image.
         This function returns a lookup by file name
 
@@ -133,14 +138,14 @@ def get_identifiers(dicom_files,
         dicom_files: the dicom file(s) to extract from
         force: force reading the file (default True)
         config: if None, uses default in provided module folder
-        expand_sequences: if True, expand sequences. otherwise, skips
+        expand_sequences: if True, expand sequences. Otherwise, skips
         skip_fields: if not None, added fields to skip
 
     '''
     bot.debug('Extracting identifiers for %s dicom' %(len(dicom_files)))
 
     if config is None:
-        config = "%s/config.json" %(here)
+        config = "%s/config.json" % here
 
     if not os.path.exists(config):
         bot.error("Cannot find config %s, exiting" %(config))
@@ -165,8 +170,8 @@ def get_identifiers(dicom_files,
             ids[dicom_file] = dict()
 
         ids[dicom_file] = get_fields(dicom,
-                                  skip=skip,
-                                  expand_sequences=expand_sequences)
+                                     skip=skip,
+                                     expand_sequences=expand_sequences)
     return ids
 
 
@@ -242,8 +247,9 @@ def replace_identifiers(dicom_files,
                         remove_private=True):
 
     '''replace identifiers using pydicom, can be slow when writing
-    and saving new files'''
-
+       and saving new files. If you want to replace sequences, they need
+       to be extracted with get_identifiers, with expand_sequences set to True.
+    '''
     dicom_files, recipe, config = _prepare_replace_config(dicom_files, 
                                                           deid=deid,
                                                           config=config)

diff --git a/deid/dicom/tags.py b/deid/dicom/tags.py
@@ -101,6 +101,12 @@ def _filter_tags(tags, idx, fields=None):
 
 
 def remove_sequences(dicom):
+    '''remove sequences from a dicom by removing the associated tag.
+ 
+       Parameters
+       ==========
+       dicom: the loaded dicom to remove sequences from
+    '''
     for field in dicom.dir():
         if isinstance(dicom.get(field), Sequence):
             dicom = remove_tag(dicom, field)

diff --git a/deid/main/__init__.py b/deid/main/__init__.py
@@ -45,6 +45,10 @@ def get_parser():
                         help="use verbose logging to debug.", 
                         default=False, action='store_true')
 
+    parser.add_argument('--version', dest="version", 
+                        help="print version and exit.", 
+                        default=False, action='store_true')
+
     parser.add_argument("--outfolder", "-o", dest='outfolder', 
                         help="full path to save output, will use temporary folder if not specified", 
                         type=str, default=None)
@@ -118,7 +122,7 @@ def main():
     except:
         sys.exit(0)
 
-    if args.command == "version":
+    if args.command == "version" or args.version:
         print(__version__)
         sys.exit(0)
 

diff --git a/deid/version.py b/deid/version.py
@@ -22,7 +22,7 @@
 
 '''
 
-__version__ = "0.1.33"
+__version__ = "0.1.34"
 AUTHOR = 'Vanessa Sochat'
 AUTHOR_EMAIL = '[email protected]'
 NAME = 'deid'