TM2_PepSeq_MHC_I_binding_predictions.Rmd

---
title: "TM2 PepSeq MHC I binding predictions"
author: "E. Kelley"
date: "9/2/2020"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, message = FALSE)
library(MHCbindR)
library(tidyverse)
library(here)
```


```{r, redefine `import_hla_calls`}
# The packaged `import_hla_calls` has a hard-coded file path for the HLA data,
# so it is redefined here until the package can be rebuilt.

#' Read every matching HLA call file in a directory into one data frame.
#'
#' @param hla_data_path Directory that holds the HLA call files.
#' @param file_pattern Regular expression handed to `dir()` to select files.
#' @return A single data frame with the rows of all matching files stacked,
#'   plus a `file` column naming the file each row was read from.
import_hla_calls <- function(hla_data_path, file_pattern) {
  files <- dir(path = hla_data_path, pattern = file_pattern)
  # `map_dfr()` already row-binds the per-file tables, so the extra
  # `bind_rows()` the previous version piped into was a no-op.
  map_dfr(files, function(x) {
    # The first 6 lines of each file are skipped before parsing.
    read_table2(file.path(hla_data_path, x), skip = 6) %>%
      mutate(file = x)
  })
}
```


```{r, import hla data, message=FALSE, warning=FALSE}
# `dir()` interprets `pattern` as a regular expression, not a shell glob, so
# match the file-name suffix explicitly: escaped dot, anchored at the end.
hla_I_calls <- import_hla_calls(
  here("data_from_kevin", "TILPepseq2_Updated", "TILPepseq2_HLA_Types"),
  "_hla_I_calls\\.txt$"
)
```


```{r, get top hla calls}
# Reduce the imported HLA I calls to the top calls via `select_top_hla_calls()`.
hla_I_filtered <- hla_I_calls %>%
  select_top_hla_calls()
```


Write a file of HLA top alleles to run predictions against using the IEDB tool.
```{r, write top hla alleles for iedb}
# Keep the two best allele calls per row. `patient` is selected explicitly
# because later chunks filter `hla_I_best_calls` on it; before, it was only
# retained if `hla_I_filtered` happened to carry it as a grouping column.
hla_I_best_calls <- hla_I_filtered %>%
  select(patient, best_hla_1, best_hla_2)

# One-column tibble of the unique alleles across both call columns.
# NOTE(review): the column is not named, so tibble derives its name from the
# expression text; consider naming it before the file is written for IEDB.
hla_I_for_iedbtools <- tibble(
  unique(c(hla_I_best_calls$best_hla_1, hla_I_best_calls$best_hla_2))
)

# write_delim(hla_I_for_iedbtools, "analysis/R/hla_I_for_iedbtools.txt", delim = "\t")
```


```{r, write fasta of library for iedb input}
# Read the library peptides (CSV without a header row) and label the columns.
TM2_named_peptides <- setNames(
  read_csv("design_outs/named_peptides.csv", col_names = FALSE),
  c("name", "peptide")
)

# Per the original author's note, `kmer_length` must be passed as the wanted
# k-mer length minus one (8 here, presumably for 9-mers; confirm in MHCbindR).
TM2_peptides_kmers <- MHCbindR::convert_named_peptides_fasta(
  TM2_named_peptides,
  TM2_named_peptides$peptide,
  kmer_length = 8,
  fasta_out = FALSE
)
```


```{r, get unique hla calls for small patient set}
# The small patient set, and the unique best HLA calls found across it.
select_patients <- c(
  "C038_0022", "C038_0036", "C038_0031", "C038_0044", "C038_0038"
)
short_hla <- filter(hla_I_best_calls, patient %in% select_patients)
# Pool both call columns before de-duplicating.
shla <- c(short_hla$best_hla_1, short_hla$best_hla_2)
short_hla_unique_alleles <- unique(shla)
```


Import the IEDB MHC-I binding predictions
```{r, import binding preds}
# Load the IEDB prediction output found under analysis/bash/TM2_PepSeq_IEDB_preds
# (path resolved with `here()`).
MHC_df <- here("analysis", "bash", "TM2_PepSeq_IEDB_preds") %>%
  MHCbindR::import_preds()
```


Pull out the binders with the lowest percentile rank (the 5000 lowest-ranked rows per library member `name`).
```{r, name and filter preds}
# group by library_member, ie 'name', take min
# find_okayest_binders <- function(df){
#   df %>% group_by(name) %>%
#     filter(percentile_rank == min(percentile_rank, na.rm = TRUE)) %>% ungroup() %>%
#     slice_min(percentile_rank, n=5000)
# }

#' Keep the lowest-`percentile_rank` rows for each `name`.
#'
#' @param df Data frame with `name` and `percentile_rank` columns.
#' @return `df` grouped by `name`, holding the 5000 rows with the smallest
#'   `percentile_rank` in each group (ties are kept). The result is still
#'   grouped, so verbs applied afterwards operate per `name`.
find_okayest_binders <- function(df) {
  by_name <- group_by(df, name)
  # Earlier variant, kept for reference:
  # filter(percentile_rank == min(percentile_rank, na.rm = TRUE)) %>% ungroup() %>%
  slice_min(by_name, percentile_rank, n = 5000)
}


#' Name each prediction table's peptides and keep one row per peptide.
#'
#' @param df_mhc List of prediction data frames, each with a `peptide` column.
#' @param peptides_kmers Data frame of k-mers; its `sequence` column is the
#'   join key matched against `peptide`.
#' @return A list the same length as `df_mhc`. Each element is the input table
#'   left-joined to `peptides_kmers`, passed through `find_okayest_binders()`,
#'   and reduced to distinct `peptide` values (all columns kept).
name_filter_kmer_preds_okayest_binders <- function(df_mhc, peptides_kmers) {
  map(df_mhc, function(preds) {
    preds %>%
      left_join(peptides_kmers, by = c("peptide" = "sequence")) %>%
      find_okayest_binders() %>%
      distinct(peptide, .keep_all = TRUE)
  })
}

named_preds <- MHC_df %>%
  name_filter_kmer_preds_okayest_binders(TM2_peptides_kmers)
```


Run the library design markdown first; the next chunk expects the `annotated_peptides` object to already exist.
```{r, annotate preds, get short list of alleles}
#' Join peptide annotations onto each named prediction table.
#'
#' @param named_mhc_preds List of prediction data frames that contain
#'   `peptide.y` and `peptide.x` columns.
#' @param annotated_peptides Annotation data frame; its `sequence` column is
#'   matched against `peptide.y`.
#' @param df_out `NULL` (default) to get a list back; any non-`NULL` value to
#'   get the list row-bound into a single data frame.
#' @return Each table right-joined to `annotated_peptides` (so every annotation
#'   row is kept) and reduced to distinct `peptide.x` values, either as a list
#'   or as one stacked data frame depending on `df_out`.
annotate_mhc_preds <- function(named_mhc_preds, annotated_peptides, df_out = NULL) {
  # The join/distinct steps are the same for both output shapes, so they are
  # computed once instead of being repeated in each branch.
  annotated <- named_mhc_preds %>%
    map(right_join, annotated_peptides, by = c("peptide.y" = "sequence")) %>%
    map(distinct, peptide.x, .keep_all = TRUE)

  if (is.null(df_out)) {
    return(annotated)
  }
  bind_rows(annotated)
}
# Drop wild-type entries (any `peptide` containing "wt") and keep only the rows
# that came from the five selected patients' files.
annotated_peptides_select_patients <- annotated_peptides %>%
  filter(
    !str_detect(peptide, "wt"),
    file %in% c(
      "C038_0022_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv",
      "C038_0031_merged_Ashion_Research_Peptide.varCode.csv",
      "C038_0036_merged_Ashion_Research_Peptide_100PercentIdentityRemoved.varCode.csv",
      "C038_0038_merged_Ashion_Research_Peptide.varCode.csv",
      "C038_0044_merged_Ashion_Research_Peptide.varCode.csv"
    )
  )

TM2_annotated_mhc_preds <- named_preds %>%
  annotate_mhc_preds(annotated_peptides_select_patients)

# Rewrite the allele names for matching against the prediction names: the
# first "*" and the first ":" in each allele are replaced with "-".
alleles <- short_hla_unique_alleles %>%
  str_replace("\\*", "\\-") %>%
  str_replace("\\:", "\\-")

# Keep just the predictions for the alleles in `short_hla_unique_alleles`:
# the allele is read from each prediction name as the substring running from
# character 30 to the fifth character from the end.
TM2_annotated_mhc_preds_short_alleles <- names(TM2_annotated_mhc_preds) %>%
  .[str_sub(., 30, -5) %in% alleles]

TM2_annotated_mhc_preds_short <-
  TM2_annotated_mhc_preds[TM2_annotated_mhc_preds_short_alleles]
```


```{r}
# named_preds <- MHCbindR::add_names_preds(MHC_df, here("analysis", "bash", "TM2_PepSeq_IEDB_preds", "named_peptides_9.fasta"))
```


```{r, import MHC II peptides, warning=FALSE}
# Anna's selected MHC II binders; all eight columns are read as character.
TM2_PepSeq_peptides_AE_selected <- read_csv(
  "20201125_TM2_PepSeq_peptides_AE_selected.csv",
  col_types = "cccccccc"
)
```


Need to fill in the list of peptides with MHC I binders.
Make a single data frame from `TM2_annotated_mhc_preds_short` and narrow in on the final set per patient.
```{r, group split}
# TM2_annotated_mhc_preds_short_df <- map_df(TM2_annotated_mhc_preds_short, ~bind_rows(.x)) %>%
#   mutate(ashion_id=str_sub(file, end = 9)) %>%
#   group_by(ashion_id) %>%
#   distinct(effectId, .keep_all = TRUE) %>% 
#   ungroup()

# Stack the per-allele prediction tables into one data frame and derive the
# patient id (`ashion_id`) from the first 9 characters of `file`.
# NOTE: the original author removed the `distinct(effectId, .keep_all = TRUE)`
# de-duplication because it was filtering for only MUT7. With it gone, the
# `group_by(ashion_id, allele)` that preceded it did nothing before the
# `ungroup()`, so only the `ungroup()` is kept (inputs may arrive grouped).
TM2_annotated_mhc_preds_short_df <- bind_rows(TM2_annotated_mhc_preds_short) %>%
  mutate(ashion_id = str_sub(file, end = 9)) %>%
  ungroup()

# One data frame per patient.
TM2_annotated_mhc_preds_short_df_split <- TM2_annotated_mhc_preds_short_df %>%
  group_by(ashion_id) %>%
  group_split()

# Number of rows for each patient, stored as character as before. This counts
# rows, not distinct effectIds. The explicit `as.character()` avoids relying
# on `map_chr()` coercing integers, which newer purrr versions deprecate.
mutations_per_patient <- map_chr(
  TM2_annotated_mhc_preds_short_df_split,
  ~ as.character(nrow(.x))
)

# Name every element after its patient id. Taking the id from each element
# works for any number of patients instead of assuming exactly five.
names(TM2_annotated_mhc_preds_short_df_split) <- map_chr(
  TM2_annotated_mhc_preds_short_df_split,
  ~ .x$ashion_id[[1]]
)
```


Function to select the peptide with best binding for each allele and append to list.
```{r, select best binders append list}

# myexample <- C038_0022_peptides_list
# 
# start <- list(
#   top = tibble(), # starting with an empty tibble
#   rest = myexample
# )
#slice_min can carry tie's for percentile rank through and end up with duplicate effectId's.
#I think I can fix this by using a combination of arrange and distinct.

#' One selection round: pick the best remaining peptide for each allele.
#'
#' Written as a reducer for `purrr::reduce()`: it receives the accumulator from
#' the previous round and returns the updated one.
#'
#' @param l Accumulator list with `top` (rows picked so far) and `rest`
#'   (candidate rows still available). `rest` needs `effectId`,
#'   `percentile_rank` and `allele` columns.
#' @param counter Current element of the sequence being reduced over. It is
#'   not used; the length of that sequence just sets the number of rounds.
#' @return A list with `top` (earlier picks plus this round's picks) and
#'   `rest` (candidates whose `effectId` was not picked in this round).
get_top_allele_per_peptide <- function(l, counter) {
  next_top <- l$rest %>%
    group_by(effectId) %>%
    # Exactly one row per effectId. With ties kept, an effectId tied across
    # two alleles survived twice and could be picked for both alleles in the
    # same round, putting a duplicate effectId into `top`.
    slice_min(percentile_rank, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    # Sort so that `distinct()` keeps the best-ranked row for each allele:
    # each allele gets one and only one peptide per round.
    arrange(percentile_rank) %>%
    distinct(allele, .keep_all = TRUE)

  # Append this round's picks to the accumulated ones.
  new_top <- bind_rows(l$top, next_top)
  # Remove every row of the picked effectIds from the pool.
  new_rest <- anti_join(l$rest, next_top, by = "effectId")

  # Pass on the new accumulator.
  list(top = new_top, rest = new_rest)
}


# Save original version of function.
# get_top_allele_per_peptide <- function(l, counter) {
#   # get best percentile_rank per peptide
#   # but per iteration each allele must get one and only one peptide,
#   # arrange by percentile_rank, so that distinct takes the best peptide per allele.
#   next_top <- l$rest %>% 
#     group_by(peptide.y) %>%
#     slice_min(percentile_rank, n = 1) %>% 
#     ungroup() %>%
#     arrange(percentile_rank) %>%
#     distinct(allele, .keep_all = TRUE)
#   # append the new top picks to the accumulated ones 
#   new_top <- bind_rows(l$top,
#                        next_top)
#   # remove the top peptides from the pool
#   new_rest <- anti_join(l$rest, next_top, by = "peptide.y")
#   # pass on the new  accumulator
#   list(top  = new_top,
#        rest = new_rest)
# }
# 
# 


# sanity check the peptide selection chunk of `get_top_allele_per_peptide`
# result2 <- reduce(1:3, get_top_allele_per_peptide, .init = start)$top
# 
# next_top <- start$rest %>%
#     group_by(peptide.y) %>%
#     slice_min(percentile_rank, n = 1)
# 
# next_top1 <- next_top %>%
#     ungroup() %>%
#     arrange(percentile_rank) %>%
#     distinct(allele, .keep_all = TRUE)

```


Make list for C038_0022
```{r, list for C038_0022}
# Best HLA I calls for patient C038_0022, de-duplicated across both columns.
C038_0022_hlas <- filter(hla_I_best_calls, patient %in% "C038_0022")
C038_0022_hlas_vec <- unique(
  c(C038_0022_hlas$best_hla_1, C038_0022_hlas$best_hla_2)
)

# Candidate predictions restricted to this patient's own alleles.
C038_0022_peptides_list <- filter(
  TM2_annotated_mhc_preds_short_df_split$C038_0022,
  allele %in% C038_0022_hlas_vec
)

# Starting accumulator: nothing picked yet, every candidate still in the pool.
C038_0022_start_list <- list(top = tibble(), rest = C038_0022_peptides_list)

# Three selection rounds, i.e. 3 peptides per allele.
C038_0022_peptides_list_min <- reduce(
  seq_len(3),
  get_top_allele_per_peptide,
  .init = C038_0022_start_list
)$top

# Anna's MHC II peptides for this patient, with their annotations joined on
# whatever columns the two tables share.
C038_0022_PepSeq_peptides <- TM2_PepSeq_peptides_AE_selected %>%
  filter(ashion_id == "C038_0022") %>%
  left_join(annotated_peptides)

# Quick check for overlap between PepSeq chosen peptides and prediction chosen
# (prints one TRUE/FALSE per PepSeq peptide).
C038_0022_PepSeq_peptides$sequence %in% C038_0022_peptides_list_min$peptide.y

# Rename to the pool's column names, then merge in the MHC II peptides.
C038_0022_pool <- C038_0022_peptides_list_min %>%
  rename(
    sequence = peptide.y,
    library_member = name,
    peptide = peptide.y.y
  ) %>%
  full_join(C038_0022_PepSeq_peptides)

#write csv for peptide order.
# C038_0022_pool %>%
#   select(library_member, sequence, allele, mhc_ii_allele, effectId, variantId, gene_name, peptide, ashion_id) %>%
#   write_csv(path = "analysis/R/C038_0022_peptides_pool_updated20201219.csv")

```


Make list for C038_0031
```{r, list for C038_0031}
# Best HLA I calls for patient C038_0031, de-duplicated across both columns.
C038_0031_hlas <- filter(hla_I_best_calls, patient %in% "C038_0031")
C038_0031_hlas_vec <- unique(
  c(C038_0031_hlas$best_hla_1, C038_0031_hlas$best_hla_2)
)

# Candidate predictions restricted to this patient's own alleles.
C038_0031_peptides_list <- filter(
  TM2_annotated_mhc_preds_short_df_split$C038_0031,
  allele %in% C038_0031_hlas_vec
)

# Starting accumulator: nothing picked yet, every candidate still in the pool.
C038_0031_start_list <- list(top = tibble(), rest = C038_0031_peptides_list)

# Show how many MHC II peptides were already selected per patient.
table(TM2_PepSeq_peptides_AE_selected$ashion_id)
# Need to fill in 15 peptides from 6 alleles, so will select 3 per allele for a total of 18.
C038_0031_peptides_list_min <- reduce(
  seq_len(3),
  get_top_allele_per_peptide,
  .init = C038_0031_start_list
)$top

# Anna's MHC II peptides for this patient, with their annotations joined on
# whatever columns the two tables share.
C038_0031_PepSeq_peptides <- TM2_PepSeq_peptides_AE_selected %>%
  filter(ashion_id == "C038_0031") %>%
  left_join(annotated_peptides)

# Quick check for overlap between PepSeq chosen peptides and prediction chosen.
# Author's note: looks like we have 3 overlapping between the two. Will have
# exactly 24 unique peptides.
C038_0031_PepSeq_peptides$sequence %in% C038_0031_peptides_list_min$peptide.y

# Rename to the pool's column names, then merge in the MHC II peptides.
C038_0031_pool <- C038_0031_peptides_list_min %>%
  rename(
    sequence = peptide.y,
    library_member = name,
    peptide = peptide.y.y
  ) %>%
  full_join(C038_0031_PepSeq_peptides)

#write csv for peptide order.
# C038_0031_pool %>%
#   select(library_member, sequence, allele, mhc_ii_allele, effectId, variantId, gene_name, peptide, ashion_id) %>%
#   write_csv(path = "analysis/R/C038_0031_peptides_pool_updated201219.csv")

```


Make list for C038_0036
```{r, list for C038_0036}
# Best HLA I calls for patient C038_0036, de-duplicated across both columns.
C038_0036_hlas <- filter(hla_I_best_calls, patient %in% "C038_0036")
C038_0036_hlas_vec <- unique(
  c(C038_0036_hlas$best_hla_1, C038_0036_hlas$best_hla_2)
)

# Candidate predictions restricted to this patient's own alleles.
C038_0036_peptides_list <- filter(
  TM2_annotated_mhc_preds_short_df_split$C038_0036,
  allele %in% C038_0036_hlas_vec
)

# Starting accumulator: nothing picked yet, every candidate still in the pool.
C038_0036_start_list <- list(top = tibble(), rest = C038_0036_peptides_list)

# Show how many MHC II peptides were already selected per patient.
table(TM2_PepSeq_peptides_AE_selected$ashion_id)
# Need to fill in 14 peptides from 6 alleles, so will select 3 per allele for a total of 18.
C038_0036_peptides_list_min <- reduce(
  seq_len(3),
  get_top_allele_per_peptide,
  .init = C038_0036_start_list
)$top

# Anna's MHC II peptides for this patient, with their annotations joined on
# whatever columns the two tables share.
C038_0036_PepSeq_peptides <- TM2_PepSeq_peptides_AE_selected %>%
  filter(ashion_id == "C038_0036") %>%
  left_join(annotated_peptides)

# Quick check for overlap between PepSeq chosen peptides and prediction chosen.
# Author's note: looks like we have 1 overlapping between the two. Will have
# exactly 24 unique peptides.
C038_0036_PepSeq_peptides$sequence %in% C038_0036_peptides_list_min$peptide.y

# Rename to the pool's column names, then merge in the MHC II peptides.
C038_0036_pool <- C038_0036_peptides_list_min %>%
  rename(
    sequence = peptide.y,
    library_member = name,
    peptide = peptide.y.y
  ) %>%
  full_join(C038_0036_PepSeq_peptides)

#write csv for peptide order.
# C038_0036_pool %>%
#   select(library_member, sequence, allele, mhc_ii_allele, effectId, variantId, gene_name, peptide, ashion_id) %>%
#   write_csv(path = "analysis/R/C038_0036_peptides_pool_updated20201219.csv")

```


Make list for C038_0038
Anna's analysis had a sample mix-up, so I started from the beginning to choose MHC II peptides from patients C038_0038 and C038_0044, which she had mixed up. 
```{r, list for C038_0038}
# Best HLA I calls for patient C038_0038, de-duplicated across both columns.
C038_0038_hlas <- filter(hla_I_best_calls, patient %in% "C038_0038")
C038_0038_hlas_vec <- unique(
  c(C038_0038_hlas$best_hla_1, C038_0038_hlas$best_hla_2)
)

# Candidate predictions restricted to this patient's own alleles.
C038_0038_peptides_list <- filter(
  TM2_annotated_mhc_preds_short_df_split$C038_0038,
  allele %in% C038_0038_hlas_vec
)

# Starting accumulator: nothing picked yet, every candidate still in the pool.
C038_0038_start_list <- list(top = tibble(), rest = C038_0038_peptides_list)

# Show how many MHC II peptides were already selected per patient.
table(TM2_PepSeq_peptides_AE_selected$ashion_id)
# Need to fill in 21 peptides from 6 alleles, so will select 4 per allele for a total of 24.
C038_0038_peptides_list_min <- reduce(
  seq_len(4),
  get_top_allele_per_peptide,
  .init = C038_0038_start_list
)$top

# Combine with Anna's MHC II peptides, but first get the annotations for those.
# C038_0038_PepSeq_peptides <- TM2_PepSeq_peptides_AE_selected %>%
#   filter(ashion_id == "C038_0038") %>%
#   left_join(annotated_peptides)

# quick check for overlap between PepSeq chosen peptides and prediction chosen.
# no overlaps
# C038_0038_PepSeq_peptides$sequence %in% C038_0038_peptides_list_min$peptide.y
# This chunk has the addition of Anna's peptides commented out. I will add my MHC II peptides in the MHC II analysis markdown.

# Rename to the pool's column names (MHC I picks only for this patient).
C038_0038_pool <- C038_0038_peptides_list_min %>%
  rename(
    sequence = peptide.y,
    library_member = name,
    peptide = peptide.y.y
  )

# %>%
#   full_join(C038_0038_PepSeq_peptides)

#write csv for peptide order.
# C038_0038_pool %>%
#   select(library_member, sequence, allele, effectId, variantId, gene_name, peptide, ashion_id) %>%
# write_csv(path = "analysis/R/C038_0038_peptides_pool_MHC_I.csv")

```


Make list for C038_0044  
Anna's analysis had a sample mix-up, so I started from the beginning to choose MHC II peptides from patients C038_0038 and C038_0044, which she had mixed up. 
```{r, list for C038_0044}
# Best HLA I calls for patient C038_0044, de-duplicated across both columns.
C038_0044_hlas <- filter(hla_I_best_calls, patient %in% "C038_0044")
C038_0044_hlas_vec <- unique(
  c(C038_0044_hlas$best_hla_1, C038_0044_hlas$best_hla_2)
)

# Candidate predictions restricted to this patient's own alleles.
C038_0044_peptides_list <- filter(
  TM2_annotated_mhc_preds_short_df_split$C038_0044,
  allele %in% C038_0044_hlas_vec
)

# Starting accumulator: nothing picked yet, every candidate still in the pool.
C038_0044_start_list <- list(top = tibble(), rest = C038_0044_peptides_list)

# Show how many MHC II peptides were already selected per patient.
table(TM2_PepSeq_peptides_AE_selected$ashion_id)
# Need to fill in 17 peptides from 5 alleles, so will select 4 per allele.
C038_0044_peptides_list_min <- reduce(
  seq_len(4),
  get_top_allele_per_peptide,
  .init = C038_0044_start_list
)$top

# Combine with Anna's MHC II peptides, but first get the annotations for those.
# C038_0044_PepSeq_peptides <- TM2_PepSeq_peptides_AE_selected %>%
#   filter(ashion_id == "C038_0044") %>%
#   left_join(annotated_peptides)

# quick check for overlap between PepSeq chosen peptides and prediction chosen.
# Looks like we have 3 overlapping between the two. Will have exactly 24 unique peptides.
# C038_0044_PepSeq_peptides$sequence %in% C038_0044_peptides_list_min$peptide.y

# This chunk has the addition of Anna's peptides commented out. I will add my MHC II peptides in the MHC II analysis markdown.

# Rename to the pool's column names (MHC I picks only for this patient).
C038_0044_pool <- C038_0044_peptides_list_min %>%
  rename(
    sequence = peptide.y,
    library_member = name,
    peptide = peptide.y.y
  )

  #   full_join(C038_0044_PepSeq_peptides)

#write csv for peptide order.
# C038_0044_pool %>%
#   select(library_member, sequence, allele, effectId, variantId, gene_name, peptide, ashion_id) %>%
#    write_csv(path = "analysis/R/C038_0044_peptides_pool_MHC_I.csv")

```