BFY_Reliability_Analyses_v2.Rmd

---
title: "BFY_Reliability_Analyses"
author: "Santi"
date: "8/31/2020"
output:
  html_document:
    toc: true
    fig_height: 8.5
    fig_width: 12
    css: custom 2.css
editor_options: 
  chunk_output_type: console
---

```{r, echo = FALSE}
# Widen the console output and make HTML the default knitr table format.
options(
  width = 160,
  knitr.table.format = "html"
)
```

The code below performs reliability analyses for resting-state EEG data. 

# Setup
```{r eval=TRUE, echo=T, message=FALSE, warning=FALSE, results='hide', fig.show='hide'}
# # Set working directory
# setwd("/Users/santiagomorales/Downloads/") 

# Packages used by this document.
list.of.packages <- c(
  "psych", "reshape2", "car", "ggplot2", "R.matlab",
  "tidyr", "dplyr", "foreach", "doParallel"
)

# Install whichever of them are not installed yet.
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
if (length(new.packages) > 0) install.packages(new.packages)

# Loading packages.
# library() is used instead of require(): require() only returns FALSE when a
# package cannot be attached, so a failed install would go unnoticed here and
# surface later as a "could not find function" error. library() stops at once.
invisible(lapply(list.of.packages, library, character.only = TRUE))
############################################################

# Shared ggplot2 styling: a classic theme with enlarged / bold text elements.
# Stored as a list so it can be added to a plot with `+ my_opts`.
my_opts <- list(
  theme_classic() +
    theme(
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 13, face = "bold"),
      legend.title = element_text(size = 12, face = "bold"),
      legend.text = element_text(size = 12),
      strip.text.x = element_text(size = 12, face = "bold"),
      title = element_text(size = 14, face = "bold")
    )
)
```


# Reliability Analyses
### Set up
```{r}
## Setup
# Folder that holds the trial-by-trial MATLAB (.mat) files.
tbt_data_path <- "~/BFY_Test/" # Change this path!! 

# Creating a list of the matlab files.
# `pattern` is a regular expression, so it is escaped and anchored: the bare
# ".mat" would also match names such as "format.txt".
matlab_list <- list.files(path = tbt_data_path, pattern = "\\.mat$", ignore.case = TRUE)
matlab_list <- matlab_list[grepl("relativePower", matlab_list)] # Keep only relative power
# matlab_list <- matlab_list[grep("absolutePower", matlab_list)] # Keep only absolute power

# Fail with a clear message instead of trying to read a file named "NA".
if (length(matlab_list) == 0) {
  stop("No relativePower .mat files found in ", tbt_data_path, call. = FALSE)
}

# One data frame per file is collected in a pre-allocated list and bound once
# at the end (binding inside the loop copies the growing data frame each time).
df_list <- vector("list", length(matlab_list))
for (i in seq_along(matlab_list)) {
  mat_file_name <- matlab_list[i]
  ID <- strsplit(mat_file_name, split = "_")[[1]][1] # Text before the first "_" in the file name
  print(paste0("Now in ", ID))

  matlabFile <- readMat(file.path(tbt_data_path, mat_file_name)) # Reading in matlab file
  df_temp <- matlabFile$tf.data # Only getting the data
  # For each combination of the 2nd and 3rd dimensions, average over the 1st
  # (per the original author's note: averaging across electrodes).
  df_temp <- apply(df_temp, c(2, 3), mean, na.rm = TRUE)
  # Transpose so that the 3rd dimension becomes rows and the 2nd becomes columns.
  df_temp <- as.data.frame(t(df_temp))
  names(df_temp) <- unlist(matlabFile$frequency) # Renaming for the freq bands
  df_temp <- df_temp[complete.cases(df_temp), ] # Dropping rows with any missing value

  # A file without a single complete row contributes nothing; skip it rather
  # than fail on the column assignments below.
  if (nrow(df_temp) == 0) {
    warning("No complete rows in ", mat_file_name, "; file skipped.", call. = FALSE)
    next
  }

  # df_temp$cond <- cond
  df_temp$id <- ID
  df_temp$trial <- seq_len(nrow(df_temp)) # Running index over the rows that were kept
  # df_temp$chan <- chan
  df_list[[i]] <- df_temp
}
df <- bind_rows(df_list) # Skipped files are NULL entries, which bind_rows ignores


# df <- Filter(function(x)!all(is.na(x)), df) # Removing colums that are all NAs

vars <- c("id", "trial")

# Long format: one row per id x trial x frequency column.
dfm <- melt(df, id.vars = vars)
dfm$value <- as.numeric(dfm$value)

# dfm.ag <- dfm[dfm$variable=="alpha",] # Selecting only alpha for now
dfm.ag <- dfm
# The frequency column name doubles as the "condition" in the reliability functions.
dfm.ag$cond <- as.factor(dfm$variable)

# Number of rows (trials) per participant and condition.
n_trials <- dfm.ag %>%
  group_by(id, cond) %>%
  tally() 
```
#### Loading the Split-half Reliability functions

```{r}
# Internal helper: compute the pieces that are identical for every draw.
#
# data: long data frame with columns `id`, `trial`, `cond` and `value`.
# Returns a list with
#   counts - one row per cond/id cell with its number of rows (`n`)
#   keyed  - `data` plus a `cond_id` key column ("<cond>_<id>")
.splithalf_prepare <- function(data) {
  list(
    counts = data %>%
      group_by(cond, id) %>%
      tally() %>%
      ungroup(),
    keyed = data %>%
      unite(cond_id, cond, id, remove = FALSE)
  )
}

# Internal helper: one random subsample of `s` trials per cond/id cell, one
# random split of that subsample into two halves, and the Spearman-Brown
# corrected correlation between the half means for every condition.
#
# keyed, counts: the two elements returned by .splithalf_prepare().
# s:             number of trials to subsample per cond/id cell.
# Returns a data frame with one row per condition and the columns
#   variable (condition), rcoeff (corrected correlation), n (number of
#   observations reported by corr.test), seed (seed used for this draw) and
#   n_trials (= s); or an empty data frame when no cell has enough trials.
.splithalf_one_draw <- function(keyed, counts, s) {
  # Draw a seed and record it so this exact partition can be reproduced.
  seed <- sample(1:10000000, 1)
  set.seed(seed)

  # Before subsampling, drop the cond/id cells that do not have enough trials.
  # NOTE(review): `<=` also drops cells with exactly `s` trials, although
  # sample_n(s) could use them. Kept as in the original; confirm it is intended.
  too_few <- counts[counts$n <= s, ] %>%
    dplyr::select(cond, id) %>%
    unite(cond_id, cond, id)

  eligible <- keyed %>%
    dplyr::filter(!cond_id %in% too_few$cond_id)
  if (nrow(eligible) == 0) {
    # Nothing left to correlate; corr.test() would fail on an empty table.
    return(data.frame())
  }

  # Subsample `s` trials within every cond/id cell.
  subsample <- eligible %>%
    group_by(cond, id) %>%
    sample_n(s)

  # Split-half on the subsampled data: a random half is tagged bin 1, the
  # remaining trials become bin 2, then each half is averaged per cond/id.
  halves <- subsample %>%
    sample_frac(.50) %>%                                    # random half within each cell
    mutate(bin = 1) %>%
    dplyr::select(id, trial, cond, bin) %>%
    right_join(subsample, by = c("id", "trial", "cond")) %>% # bring in the other half
    mutate(bin = if_else(is.na(bin), 2, 1)) %>%
    group_by(cond, id, bin) %>%
    dplyr::summarise(Mean_amp = mean(value, na.rm = TRUE)) %>%
    ungroup() %>%
    unite(cond_bin, cond, bin) %>%                          # column names "<cond>_<bin>"
    spread(cond_bin, Mean_amp)                              # wide: one row per id

  # Correlate all half means once (r and n come from the same call).
  scores <- halves[, setdiff(names(halves), "id"), drop = FALSE]
  ct <- corr.test(scores)
  r_sb <- (2 * ct$r) / (1 + abs(ct$r)) # SB formula
  n_obs <- ct$n

  # Condition = everything before the last "_" of the column name.
  labels <- rownames(r_sb)
  conditions <- unique(gsub("(.*)_.*", "\\1", labels))

  # Pair the two halves of each condition by NAME rather than by position, so
  # a missing half cannot silently shift the pairing of the other conditions.
  rows <- lapply(conditions, function(cond_name) {
    j1 <- match(paste0(cond_name, "_1"), labels)
    j2 <- match(paste0(cond_name, "_2"), labels)
    if (is.na(j1) || is.na(j2)) {
      return(NULL)
    }
    data.frame(
      variable = cond_name,
      rcoeff = r_sb[j2, j1],
      # `n` is either a single value or a matrix with one entry per pair
      n = if (length(n_obs) == 1) n_obs else n_obs[j2, j1],
      seed = seed,
      n_trials = s,
      stringsAsFactors = FALSE
    )
  })
  rows <- Filter(Negate(is.null), rows)
  if (length(rows) == 0) {
    return(data.frame())
  }
  bind_rows(rows)
}

# Split-half reliability as a function of the number of trials (sequential).
#
# For every trial count in seq(n_from, n_to, n_by), draws `n_subsamples`
# random subsamples and computes the Spearman-Brown corrected split-half
# correlation per condition for each of them.
#
# data:         long data frame with columns `id`, `trial`, `cond`, `value`.
# n_from, n_to, n_by: define the trial counts to evaluate.
# n_subsamples: number of random draws per trial count.
# Returns a data frame with the columns variable, rcoeff, n, seed, n_trials
# (one row per condition and draw).
splithalf_trials <- function(data, n_from, n_to, n_by, n_subsamples) { 
  # Eg., df_r <- splithalf_trials(dfm.ag, 2, 100, 5, 10)
  prep <- .splithalf_prepare(data)
  sizes <- seq(n_from, n_to, n_by)

  # Pre-allocated list of per-draw results, bound once at the end.
  draws <- vector("list", length(sizes) * n_subsamples)
  k <- 0L
  for (s in sizes) { # Indicating how many epochs to include
    print(paste0(s))
    for (i in seq_len(n_subsamples)) { # Indicating how many subsamples
      k <- k + 1L
      draws[[k]] <- .splithalf_one_draw(prep$keyed, prep$counts, s)
    }
  }
  as.data.frame(bind_rows(draws))
}

# Using parallel computing
# Same computation as splithalf_trials(), with the draws distributed over
# workers through foreach / doParallel.
#
# n_cores: number of workers to register (defaults to the previously
#          hard-coded 4); set it to the number of available cores.
# Returns the per-draw results combined with rbind (NULL if there is no draw).
splithalf_trials_parallel <- function(data, n_from, n_to, n_by, n_subsamples, n_cores = 4) { 
  # Eg., df_r <- splithalf_trials_parallel(dfm.ag, 2, 100, 5, 10)

  registerDoParallel(n_cores)  # use multicore, set to the number of our cores
  # Release the implicitly created cluster (if any) when the function exits.
  on.exit(stopImplicitCluster(), add = TRUE)

  prep <- .splithalf_prepare(data)
  # One task per (trial count, draw), in the same order as the nested loops
  # of the sequential version.
  sizes <- rep(seq(n_from, n_to, n_by), each = n_subsamples)

  results_df <- foreach(
    s = sizes,
    .combine = "rbind",
    .packages = c("psych", "tidyr", "dplyr"), # what the helper needs on a worker
    .export = ".splithalf_one_draw"           # defined outside this function
  ) %dopar% {
    .splithalf_one_draw(prep$keyed, prep$counts, s)
  }
  results_df
}

```

# Running the functions & Plotting
```{r}
## For these functions, you can specify (data, n_from, n_to, n_by, n_subsamples)
# df_r <- splithalf_trials(data = dfm.ag, n_from = 2, n_to = 100, n_by = 5, n_subsamples = 10)
df_r <- splithalf_trials_parallel(data = dfm.ag, n_from = 2, n_to = 100, n_by = 5, n_subsamples = 10) # For real analyses change n_subsamples to at least 1000

# describeBy(df_r, df_r$n_trials)
# df_r_old <- df_r
cutoff <- 6 # Indicating how many participants should be there to believe the correlation
# Making sure that they at least have 6 people before looking at their correlation
df_r$rcoeff[df_r$n < cutoff] <- NA

dfm_r <- df_r # no need for melting anymore
# dfm_r <- dfm_r %>% separate(variable, c("variable", "Age"), sep = "_") 


# Plotting
# Mean reliability per number of trials and condition, with reference lines
# at .9 (dashed), .8 (solid) and .7 (dotted). Built once and reused below.
reliability_plot <- ggplot(dfm_r, aes(n_trials, rcoeff, color = variable, fill = variable, group = variable)) +
  stat_summary(fun = mean, geom = "point", position = position_dodge(width = 0.7)) +
  stat_summary(fun = mean, geom = "line", aes(group = variable)) +
  my_opts +
  labs(x = "# of Trials", y = "Split-Half Reliability") +
  geom_hline(yintercept = .9, color = "black", linetype = 2) +
  geom_hline(yintercept = .8, color = "black", linetype = 1) +
  geom_hline(yintercept = .7, color = "black", linetype = 3) +
  scale_x_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 10)) +
  theme(legend.position = "bottom", legend.title = element_blank())

print(reliability_plot)

# The faceted version needs an `Age` column, which only exists when the
# separate() call above is enabled; without this check the chunk fails.
if ("Age" %in% names(dfm_r)) {
  print(reliability_plot + facet_wrap(~Age))
}
```