From 4cfab07105e3afcf51a847e0d7835fb70a9b3fe2 Mon Sep 17 00:00:00 2001
From: l-acs <lucas.sahar@mail.mcgill.ca>
Date: Thu, 12 Sep 2024 11:45:52 -0400
Subject: [PATCH] Fix more word-level models

---
 code/analysisWordLevelReadAloudBeta.R | 174 ++++++++++++++++----------
 1 file changed, 106 insertions(+), 68 deletions(-)

diff --git a/code/analysisWordLevelReadAloudBeta.R b/code/analysisWordLevelReadAloudBeta.R
index dfe46f4..5955bef 100644
--- a/code/analysisWordLevelReadAloudBeta.R
+++ b/code/analysisWordLevelReadAloudBeta.R
@@ -471,6 +471,13 @@ if (DEBUG) {
 
 ### SECTION 3.2: mean-center continuous predictors
 
+# now prevent ourselves from using data that isn't explicitly set up to be
+# predictor or outcome: use the version that omitted the ambiguous columns
+errorDatBackup <- errorDat
+errorDat <- errorDatPredictorsOutcomes
+
+
+
 #center continuous predictors
 errorDat$age_gmc <- errorDat$age - mean(errorDat$age)
 errorDat$bfne_gmc <- errorDat$bfne - mean(errorDat$bfne)
@@ -492,8 +499,12 @@ add_gmc <- function(df, col) { # makes a new column: centered version of variabl
     mutate("{{col}}_gmc" := gmc(vec)) # ex. challengeAvgSub->challengeAvgSub_gmc
 }
 
-sandboxDat %>% add_gmc(log10frequency) %>% slice_sample(n = 50) %>%  as.data.frame()
-
+if (DEBUG) {
+  sandboxDat %>%
+  add_gmc(log10frequency) %>%
+  slice_sample(n = 50) %>%
+  as.data.frame()
+}
 
 errorDat <-
   errorDat %>%
@@ -501,14 +512,16 @@ errorDat <-
 
 
 # sanity check
-for (col_index in which(stringr::str_detect(colnames(errorDat), ".*_gmc"))) {
-  colname <- names(errorDat[col_index])
-  col <- errorDat[[col_index]]
-  print(colname)
-
-  avg <- mean(col)
-  print(paste('  mean:', avg,
-              '  rounded mean:', round(avg, digits = 10)))
+if (DEBUG) {
+  for (col_index in which(stringr::str_detect(colnames(errorDat), ".*_gmc"))) {
+    colname <- names(errorDat[col_index])
+    col <- errorDat[[col_index]]
+    print(colname)
+
+    avg <- mean(col)
+    print(paste('  mean:', avg,
+                '  rounded mean:', round(avg, digits = 10)))
+  }
 }
 
 ### SECTION 3.3: preparing for misprod-hes sequential analyses
@@ -550,6 +563,10 @@ errorDatLongHesWithRelMisprod$misprod_position <- as.factor(errorDatLongHesWithR
 
 
 ### SECTION 4: MODEL RESULTS
+# again, now prevent ourselves from using data that isn't explicitly set up to
+# be predictor or outcome: use the version that omitted the ambiguous columns
+
+
 #misprod x bfne
 # model1 <- lmerTest::lmer(misprod ~ bfne_gmc + (1|id) + (1|passage),
 #                          data=errorDat, REML=TRUE)
@@ -561,9 +578,25 @@ model2 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage),
 summary(model2)
 
 #misprod x scaaredSoc control for word
-model2.5 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
-                         data=errorDat, REML=TRUE)
+# this version fails - as intended: object 'misprod' not found
+if (DEBUG) {
+  old_model2.5 <- lmerTest::lmer(misprod ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
+                                 data=errorDat, REML=TRUE)
+}
+
+model2.5 <- lmerTest::lmer(misprod_outcome ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
+                           data=errorDat, REML=TRUE)
 summary(model2.5)
+
+if(DEBUG) { # compare
+  wrong_model2.5 <- lmerTest::lmer(misprod_predictor ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
+                                   data=errorDat, REML=TRUE)
+  # Error in mkRespMod(fr, REML = REMLpass) : response must be numeric
+}
+
+summary(wrong_model2.5)
+
+
 # tldr 2/28/24 SA indv. do not misproduce more/less
 
 
@@ -578,12 +611,12 @@ summary(model2.5)
 # summary(model4)
 
 #hesitation x scaaredSoc
-model5 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|passage),
+model5 <- lmerTest::lmer(hesitation_outcome ~ scaaredSoc_gmc + (1|id) + (1|passage),
                          data=errorDat, REML=TRUE)
 summary(model5)
 
 # hesitation x scaaredSoc, control for word
-model5.5 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
+model5.5 <- lmerTest::lmer(hesitation_outcome ~ scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
                          data=errorDat, REML=TRUE)
 summary(model5.5)
 
@@ -604,7 +637,7 @@ summary(model5.5)
 # Now, misproduction-hesitation relationships
 
 # Errors as explained by disfluency: rate of misproduced syllables from rate of hesitated syllables
-f_model20 <- lmerTest::lmer(misprod ~ hesitation + (1|id) + (1|passage),
+f_model20 <- lmerTest::lmer(misprod_outcome ~ hesitation_predictor + (1|id) + (1|passage),
                             data=errorDat, REML=TRUE)
 summary(f_model20) # ***
 # should we (1|word) here?
@@ -630,7 +663,7 @@ summary(f_model20) # ***
 # Now, misproduction-hesitation interactions with social anxiety
 
 # Errors as explained by disfluency and SA: rate of misproduced syllables from rate of hesitated syllables and scaared
-f_model23 <- lmerTest::lmer(misprod ~ hesitation * scaaredSoc_gmc + (1|id) + (1|passage),
+f_model23 <- lmerTest::lmer(misprod_outcome ~ hesitation_predictor * scaaredSoc_gmc + (1|id) + (1|passage),
                             data=errorDat, REML=TRUE)
 summary(f_model23)
 
@@ -654,7 +687,7 @@ summary(f_model23)
 
 # What happens when we control for age?
 #hesitation x scaaredSoc
-age_model1 <- lmerTest::lmer(hesitation ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage),
+age_model1 <- lmerTest::lmer(hesitation_outcome ~ scaaredSoc_gmc + age_gmc + (1|id) + (1|passage),
                          data=errorDat, REML=TRUE)
 summary(age_model1)
 
@@ -675,78 +708,80 @@ summary(age_model1)
 # does misproduction location relative to a hesitation predict how many
 # instances we get in a particular reading?
 
-hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage),
-                                               data=errorDatLongHesWithRelMisprod, REML=TRUE)
-summary(hes_with_rel_misprod_model_1) # n.s., 0.271
-
-misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage),
-                                               data=errorDatLongMisprodWithRelHes, REML=TRUE)
-summary(misprod_with_rel_hes_model_1) # n.s., 0.108
-
-## does it interact with SA?
-hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage),
-                                               data=errorDatLongHesWithRelMisprod, REML=TRUE)
-summary(hes_with_rel_misprod_model_3) # n.s.
-
-misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage),
-                                               data=errorDatLongMisprodWithRelHes, REML=TRUE)
-summary(misprod_with_rel_hes_model_4) # n.s.
-
-# what if we control for word?
-hes_with_rel_misprod_model_1.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage) + (1|word),
-                                               data=errorDatLongHesWithRelMisprod, REML=TRUE)
-summary(hes_with_rel_misprod_model_1.5) # n.s., sameish
+# TODO unfixed per earlier predictor/outcome differentiation
+if (FALSE) {
+  hes_with_rel_misprod_model_1 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage),
+                                                 data=errorDatLongHesWithRelMisprod, REML=TRUE)
+  summary(hes_with_rel_misprod_model_1) # n.s., 0.271
 
-misprod_with_rel_hes_model_1.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage) + (1|word),
-                                               data=errorDatLongMisprodWithRelHes, REML=TRUE)
-summary(misprod_with_rel_hes_model_1.5) # ., 0.0974
+  misprod_with_rel_hes_model_1 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage),
+                                                 data=errorDatLongMisprodWithRelHes, REML=TRUE)
+  summary(misprod_with_rel_hes_model_1) # n.s., 0.108
 
-## does it interact with SA?
-hes_with_rel_misprod_model_3.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
-                                               data=errorDatLongHesWithRelMisprod, REML=TRUE)
-summary(hes_with_rel_misprod_model_3.5) # n.s.
+  ## does it interact with SA?
+  hes_with_rel_misprod_model_3 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage),
+                                                 data=errorDatLongHesWithRelMisprod, REML=TRUE)
+  summary(hes_with_rel_misprod_model_3) # n.s.
 
-misprod_with_rel_hes_model_4.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
-                                               data=errorDatLongMisprodWithRelHes, REML=TRUE)
-summary(misprod_with_rel_hes_model_4.5) # n.s.
+  misprod_with_rel_hes_model_4 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage),
+                                                 data=errorDatLongMisprodWithRelHes, REML=TRUE)
+  summary(misprod_with_rel_hes_model_4) # n.s.
 
-# and if we ignore passage?
-hes_with_rel_misprod_model_1.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|word),
+  # what if we control for word?
+  hes_with_rel_misprod_model_1.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|passage) + (1|word),
                                                  data=errorDatLongHesWithRelMisprod, REML=TRUE)
-summary(hes_with_rel_misprod_model_1.6) # n.s., sameish
+  summary(hes_with_rel_misprod_model_1.5) # n.s., sameish
 
-misprod_with_rel_hes_model_1.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|word),
+  misprod_with_rel_hes_model_1.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|passage) + (1|word),
                                                  data=errorDatLongMisprodWithRelHes, REML=TRUE)
-summary(misprod_with_rel_hes_model_1.6) # made no difference, as you might expect
+  summary(misprod_with_rel_hes_model_1.5) # ., 0.0974
 
-## does it interact with SA?
-hes_with_rel_misprod_model_3.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|word),
+  ## does it interact with SA?
+  hes_with_rel_misprod_model_3.5 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
                                                  data=errorDatLongHesWithRelMisprod, REML=TRUE)
-summary(hes_with_rel_misprod_model_3.6) # ""
+  summary(hes_with_rel_misprod_model_3.5) # n.s.
 
-misprod_with_rel_hes_model_4.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|word),
+  misprod_with_rel_hes_model_4.5 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
                                                  data=errorDatLongMisprodWithRelHes, REML=TRUE)
-summary(misprod_with_rel_hes_model_4.6) # ""
+  summary(misprod_with_rel_hes_model_4.5) # n.s.
+
+  # and if we ignore passage?
+  hes_with_rel_misprod_model_1.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position + (1|id) + (1|word),
+                                                   data=errorDatLongHesWithRelMisprod, REML=TRUE)
+  summary(hes_with_rel_misprod_model_1.6) # n.s., sameish
+
+  misprod_with_rel_hes_model_1.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position + (1|id) + (1|word),
+                                                   data=errorDatLongMisprodWithRelHes, REML=TRUE)
+  summary(misprod_with_rel_hes_model_1.6) # made no difference, as you might expect
 
+  ## does it interact with SA?
+  hes_with_rel_misprod_model_3.6 <- lmerTest::lmer(hes_in_adjacent_window ~ misprod_position * scaaredSoc_gmc + (1|id) + (1|word),
+                                                   data=errorDatLongHesWithRelMisprod, REML=TRUE)
+  summary(hes_with_rel_misprod_model_3.6) # ""
+
+  misprod_with_rel_hes_model_4.6 <- lmerTest::lmer(misprod_in_adjacent_window ~ hes_position * scaaredSoc_gmc + (1|id) + (1|word),
+                                                   data=errorDatLongMisprodWithRelHes, REML=TRUE)
+  summary(misprod_with_rel_hes_model_4.6) # ""
+}
 
 
 # Word frequency analysis with words absent from corpus dropped
 # Does a word's frequency predict hesitation on that word?
 errorDatAttestedFreqs <- filter(errorDat, log10frequency > 0)
-wordfreq_model_1 <- lmerTest::lmer(hesitation ~ log10frequency + (1|id) + (1|passage) + (1|word),
+wordfreq_model_1 <- lmerTest::lmer(hesitation_outcome ~ log10frequency_gmc + (1|id) + (1|passage) + (1|word),
                                    data=errorDatAttestedFreqs, REML=TRUE)
 summary(wordfreq_model_1)
-wordfreq_model_2 <- lmerTest::lmer(misprod ~ log10frequency + (1|id) + (1|passage) + (1|word),
+wordfreq_model_2 <- lmerTest::lmer(misprod_outcome ~ log10frequency_gmc + (1|id) + (1|passage) + (1|word),
                                    data=errorDatAttestedFreqs, REML=TRUE)
 summary(wordfreq_model_2)
 
 
 # Do social anxiety and frequency interact to predict hesitation rate or misproduction rate?
-wordfreq_model_3 <- lmerTest::lmer(hesitation ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
+wordfreq_model_3 <- lmerTest::lmer(hesitation_outcome ~ log10frequency_gmc * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
                                    data=errorDatAttestedFreqs, REML=TRUE)
 summary(wordfreq_model_3) # looks good!
 
-wordfreq_model_4 <- lmerTest::lmer(misprod ~ log10frequency * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
+wordfreq_model_4 <- lmerTest::lmer(misprod_outcome ~ log10frequency_gmc * scaaredSoc_gmc + (1|id) + (1|passage) + (1|word),
                                    data=errorDatAttestedFreqs, REML=TRUE)
 summary(wordfreq_model_4) # tldr no?
 
@@ -763,16 +798,19 @@ errorDat$log10frequency_with_absents_as_median <- case_match(
   0 ~ subtlexus_median,
   .default = errorDat$log10frequency)
 
-compare_freq <- data.frame(cbind(old = errorDat$log10frequency,
-                                 new = errorDat$log10frequency_with_absents_as_median))
+if (DEBUG) {
+  compare_freq <- data.frame(cbind(old = errorDat$log10frequency,
+                                   new = errorDat$log10frequency_with_absents_as_median))
 
-filter(compare_freq, old != new) %>% # confirm it worked as expected
-  filter(old != 0 | new != subtlexus_median) %>%
-  nrow == 0 # TRUE
+  filter(compare_freq, old != new) %>% # confirm it worked as expected
+    filter(old != 0 | new != subtlexus_median) %>%
+    nrow == 0 # TRUE
+}
 
+# TODO ADD GMC
 
 # Does a word's frequency predict hesitation on that word?
-wordfreq_model_with_absents_as_median_1 <- lmerTest::lmer(hesitation ~ log10frequency_with_absents_as_median + (1|id) + (1|passage) + (1|word),
+wordfreq_model_with_absents_as_median_1 <- lmerTest::lmer(hesitation_outcome ~ log10frequency_with_absents_as_median + (1|id) + (1|passage) + (1|word),
                                    data=errorDat, REML=TRUE)
 summary(wordfreq_model_with_absents_as_median_1)