Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filter germline events and all synonymous events #1003

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 168 additions & 26 deletions import-scripts/filter_non_somatic_events_py3.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,43 @@
from enum import Enum
from generate_az_study_changelog_py3 import DataHandler

# Columns whose combined values form the unique key of a mutation event.
UNIQUE_MUTATION_EVENT_KEY_FIELDS = {
    'Hugo_Symbol',
    'Chromosome',
    'Start_Position',
    'End_Position',
    'Reference_Allele',
    'Tumor_Seq_Allele2',
    'Tumor_Sample_Barcode',
}

# Columns whose combined values form the unique key of a structural variant event.
UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS = {
    'Sample_ID',
    'Site1_Hugo_Symbol',
    'Site2_Hugo_Symbol',
    'Site1_Entrez_Gene_Id',
    'Site2_Entrez_Gene_Id',
    'Site1_Region_Number',
    'Site2_Region_Number',
    'Site1_Region',
    'Site2_Region',
    'Site1_Chromosome',
    'Site2_Chromosome',
    'Site1_Contig',
    'Site2_Contig',
    'Site1_Position',
    'Site2_Position',
    'Event_Info',
    'Breakpoint_Type',
    'Connection_Type',
}

# Required columns per event type: the key columns plus the status column.
REQUIRED_MUTATION_EVENT_FIELDS = UNIQUE_MUTATION_EVENT_KEY_FIELDS | {'Mutation_Status'}

REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS = UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS | {'SV_Status'}

# Every column name referenced for either event type.
ALL_REFERENCED_EVENT_FIELDS = REQUIRED_MUTATION_EVENT_FIELDS | REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS


class EventType(Enum):
"""An Enum class to represent mutation or structural variant event types."""
Expand All @@ -25,13 +62,32 @@ class EventType(Enum):


class LineProcessor:
    """Functionality common to all line processors"""

    def __init__(self, event_type, col_indices):
        """Stores the event type and column lookup, then validates the columns.

        Args:
            event_type (EventType): The kind of event contained in the file
            col_indices (dict): Lookup from column name to the column's index
                in a data line (indexed by name in compute_key_for_line)

        Raises:
            IndexError: If any of the required fields are not found
        """
        self.event_type = event_type
        self.col_indices = col_indices
        self.raise_exception_if_missing_required_fields()

    def raise_exception_if_missing_required_fields(self):
        """Checks that all required fields were found in the file for the given event type.

        Raises:
            IndexError: If any of the required fields are not found
        """
        required_field_set = set()
        if self.event_type == EventType.MUTATION:
            required_field_set = REQUIRED_MUTATION_EVENT_FIELDS
        if self.event_type == EventType.STRUCTURAL_VARIANT:
            required_field_set = REQUIRED_STRUCTURAL_VARIANT_EVENT_FIELDS

        # Sorted so the error message is reproducible from run to run
        missing_field_names = sorted(
            field_name for field_name in required_field_set if field_name not in self.col_indices
        )
        if missing_field_names:
            missing_fields = ','.join(missing_field_names)
            raise IndexError(f'Unable to find required column(s) {missing_fields} in event file')

    def line_is_commented(self, line):
        """Determines if the given line in the file is a comment.

        Args:
            line (string): A line from the input file

        Returns:
            bool: True if the line starts with '#', False otherwise
                (including for an empty string)
        """
        # startswith() is safe on an empty string, unlike indexing line[0]
        return line.startswith('#')

    def convert_line_to_fields(self, line):
        """Converts a tab-delimited data line to a list of values.

        Args:
            line (string): A line from the input file

        Returns:
            list: List of values from the line
        """
        return line.rstrip('\n').split('\t')

    def convert_line_to_field(self, field_index, line):
        """Returns a value of interest in a tab-delimited data line.

        Args:
            field_index (int): Index of the field of interest
            line (string): A line from the input file

        Returns:
            string: The value found at field_index

        Raises:
            IndexError: If the line has no field at field_index
        """
        return self.convert_line_to_fields(line)[field_index]

    def compute_key_for_line(self, line):
        """Computes a unique key for the given line of data, using the unique
        key fields defined for the event type.

        Args:
            line (string): A line from the input file

        Returns:
            string: Tab-joined "field name, field value" pairs, ordered by
                field name
        """
        unique_key_field_set = set()
        if self.event_type == EventType.MUTATION:
            unique_key_field_set = UNIQUE_MUTATION_EVENT_KEY_FIELDS
        if self.event_type == EventType.STRUCTURAL_VARIANT:
            unique_key_field_set = UNIQUE_STRUCTURAL_VARIANT_EVENT_KEY_FIELDS

        # Key will be string representation of the object; field names are
        # sorted so that equal events always produce an identical key
        fields = self.convert_line_to_fields(line)
        key_value_terms = [
            key + '\t' + fields[self.col_indices[key]] for key in sorted(unique_key_field_set)
        ]
        return '\t'.join(key_value_terms)


class LineGermlineEventScanner(LineProcessor):
    """Registers the unique event keys for each event with germline status"""

    def __init__(self, event_type, col_indices, germline_event_key_set):
        """Prepares a scanner that adds germline event keys to the given set.

        Args:
            event_type (EventType): The kind of event contained in the file
            col_indices (dict): Lookup from column name to column index
            germline_event_key_set (set): Set that receives the key of each
                germline event found; it is modified in place
        """
        super().__init__(event_type, col_indices)
        self.header_was_seen = False
        self.germline_event_key_set = germline_event_key_set

    def scan(self, line):
        """Scan data lines for all events which have status 'GERMLINE' and register them

        Commented lines are ignored, and the first uncommented line is treated
        as the header and skipped.

        Args:
            line (string): A line from the input file

        Raises:
            IndexError: If the line has no field at the status column index
        """
        if self.line_is_commented(line):
            return
        if not self.header_was_seen:
            self.header_was_seen = True
            return

        # Use the instance's own event type rather than a module-level global,
        # so the scanner also works when used outside the script's main section
        status_col_index = None
        if self.event_type == EventType.MUTATION:
            status_col_index = self.col_indices['Mutation_Status']
        elif self.event_type == EventType.STRUCTURAL_VARIANT:
            status_col_index = self.col_indices['SV_Status']

        value = self.convert_line_to_field(status_col_index, line)
        # Status comparison is case-insensitive
        if value.casefold() == 'GERMLINE'.casefold():
            self.germline_event_key_set.add(self.compute_key_for_line(line))


class LineFileWriter(LineProcessor):
"""Handles the processing of each line - filtering for somatic events only"""

def __init__(self, event_type, col_indices, germline_event_key_set, output_file_handle):
    """Prepares a writer that sends its output to the given file handle.

    Args:
        event_type (EventType): The kind of event contained in the file
        col_indices (dict): Lookup from column name to column index
        germline_event_key_set (set): Keys of the events registered as germline
        output_file_handle (file): Open handle that lines are written to
    """
    super().__init__(event_type, col_indices)
    self.germline_event_key_set = germline_event_key_set
    # The header line has not been written to the output yet
    self.header_was_written = False
    self.output_file_handle = output_file_handle

def process(self, line):
"""Process each line of the given file to remove all events that are not 'SOMATIC' or 'UNKNOWN'.

Expand All @@ -63,19 +206,8 @@ def process(self, line):
self.header_was_written = True
return

col_index = None
if event_type == EventType.MUTATION:
if 'Mutation_Status' not in self.col_indices:
raise IndexError('Unable to find Mutation_status column in event file')
col_index = self.col_indices['Mutation_Status']
elif event_type == EventType.STRUCTURAL_VARIANT:
if 'SV_Status' not in self.col_indices:
raise IndexError('Unable to find SV_Status column in event file')
col_index = self.col_indices['SV_Status']

cols = line.split('\t')
value = cols[col_index].rstrip('\n')
if value.casefold() == 'SOMATIC'.casefold() or value.casefold() == 'UNKNOWN'.casefold():
line_key = self.compute_key_for_line(line)
if not line_key in self.germline_event_key_set:
self.output_file_handle.write(line)


Expand All @@ -87,13 +219,25 @@ def __init__(self, input_file_path, output_file_path, event_type):
self.output_file_path = output_file_path
self.event_type = event_type
self.data_handler = DataHandler(input_file_path)
self.col_indices = self.data_handler.get_col_indices({"Mutation_Status", "SV_Status"})
self.col_indices = self.data_handler.get_col_indices(ALL_REFERENCED_EVENT_FIELDS)
self.germline_event_keys = set()

def write(self):
    """Processes the input file and writes out a filtered version which
    excludes every event whose unique key matches a germline event.

    The input file is read twice: the first pass records the key of each
    germline event, and the second pass writes the output file.
    """
    # NOTE(review): assumes __init__ stores input_file_path on self, as it
    # does for output_file_path — confirm. Instance attributes are used
    # instead of the script's module-level globals so that the class works
    # when constructed from other code.

    # Scan for all germline events and record keys
    with open(self.input_file_path, 'r') as input_file_handle:
        line_germline_event_scanner = LineGermlineEventScanner(
            self.event_type, self.col_indices, self.germline_event_keys
        )
        for line in input_file_handle:
            line_germline_event_scanner.scan(line)

    # Read/write events, filtering those which match a germline event key
    with open(self.input_file_path, 'r') as input_file_handle:
        with open(self.output_file_path, 'w') as output_file_handle:
            line_processor = LineFileWriter(
                self.event_type, self.col_indices, self.germline_event_keys, output_file_handle
            )
            for line in input_file_handle:
                line_processor.process(line)

Expand All @@ -118,14 +262,12 @@ def write(self):

# Ensure that a recognizable event type code is input.
event_type = None
if not args.event_type:
raise ValueError('Event type argument is missing')
if args.event_type.casefold() == "mutation".casefold():
if args.event_type.casefold() == 'mutation'.casefold():
event_type = EventType.MUTATION
elif args.event_type.casefold() == "structural_variant".casefold():
elif args.event_type.casefold() == 'structural_variant'.casefold():
event_type = EventType.STRUCTURAL_VARIANT
if event_type is None:
raise ValueError(f'event type argument {args.event_type} not recognized or missing')
raise ValueError(f'Event type argument {args.event_type} not recognized')

# Filter the file
filtered_file_writer = FilteredFileWriter(input_file_path, output_file_path, event_type)
Expand Down