change_script_2025

sib-swiss · Jan 31, 2025 · c117a9e · c117a9e
1 parent 1e3b132
commit c117a9e
Show file tree

Hide file tree

Showing 5 changed files with 292 additions and 20 deletions.
diff --git a/docs/assets/Scripts/Day1.R b/docs/assets/Scripts/Day1.R
@@ -1,12 +1,15 @@
 #-----------------
 #-----------------
 # Introduction to statistics with R
-# 2024
+# 2025
 # In Lausanne 
 #-----------------
 #-----------------
 
-
+weight <- c(65,72,55,91,95,72) #this is the weight of our classroom people
+height <- c(1.73, 1.80, 1.62, 1.90, 1.78, 1.93)
+bmi <- weight / height^2
+bmi # Type this in R to see the computed values
 
 #-----------------
 #-----------------
@@ -60,23 +63,24 @@ plot(diameter ~ log(conc), data=hellung)
 
 # a fancy plot with ggplot :)
 library(ggplot2)
-ggplot(hellung, aes(x=log(conc), y=diameter)) + geom_point()
-
+ggplot(hellung, aes(x=log(conc), y=diameter,col=glucose)) + geom_point() 
 
+hellung$glucose <- factor(hellung$glucose)
 
 #-----------------
 #-----------------
 # 2nd exercise
 #-----------------
+data <- read.csv("~/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/data.csv", header=FALSE)
 
-setwd("/Users/Rachel/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/")
+setwd("/Users/rachelmarcone/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/")
 data <- read.csv("data.csv") 
 
 
 data
 summary(data)
 sd(data[,1]); sd(data[,2]); sd(data[,3])
-
+attach(data)
 datatoplot <- data[,1]
 #pdf("datanumber1.pdf")
 ## Plot 4 rows of graphs on one plot
@@ -145,6 +149,23 @@ boxplot(datatoplot, horizontal=TRUE, ylim=range(datatoplot))
 #dev.off()
 
 
+x <- rnorm(10000,mean=0, sd=1)
+hist(x)
+t.test(x,mu=0)$p.value
+
+
+s <- rep(0,100) # this is an empty vector with 10 entries
+for(i in 1:100){ # this is a loop, called a "for" loop, it will repeat 
+  # everything in parenthesis 10 times changing the variable
+  # i from 1 to 10 at each iteration
+  x <- rnorm(10,mean=10, sd=1)
+  s[i] <- t.test(x,mu=10)$p.value # does a t.test then takes the p.value obtained and
+  # puts it into the i-th entry of s
+
+}
+
+s_adj <- p.adjust(s)
+
 ## Last exercise
 
 
@@ -163,9 +184,10 @@ student$leftrighthanded <- as.factor(as.character(student$leftrighthanded))
 summary(student)
 
 student[student[,"height"] ==1.77,"height"] <-177
+student[,"siblings"]<- as.factor(student[,"siblings"])
 
 plot(student$height,student$weight,col=student$gender)
 boxplot(height~gender,data=student)
 boxplot(weight~gender,data=student)
 hist(student$weight)
-
+pairs(student) ## two by two plots of data
diff --git a/docs/assets/Scripts/Day2.R b/docs/assets/Scripts/Day2.R
@@ -1,7 +1,7 @@
 #-----------------
 #-----------------
 # Introduction to statistics with R
-# January 2024
+# January 2025
 # In Lausanne 
 #-----------------
 #-----------------
@@ -37,16 +37,18 @@ ggboxplot(intake$pre, width = 0.5, add = c("mean","jitter"),
 identify_outliers(as.data.frame(intake$pre))
 
 # Assumption: normality
-qqplot
-qqline
+
+qqnorm(intake$pre)
+qqline(intake$pre)
+
 ggqqplot(intake,"pre")
 
 shapiro_test(intake$pre)
 
 # t test
 
-t.test(pre, mu=7725)
-t.test(pre, mu=7725, alternative="less")
+t.test(pre, mu=7725) #one sample two sided t test
+t.test(pre, mu=7725, alternative="less") #one sample one sided t test
 
 
 
@@ -61,12 +63,15 @@ energy
 
 # assumption 1: data in each group are normally distributed.
 
-ind.obese <- which(energy$stature == "obese")
+ind.obese <- which(energy[,"stature"] == "obese")
 ind.lean <- which(energy$stature == "lean")
 
 shapiro_test(energy$expend[ind.obese]) 
+qqnorm(energy$expend[ind.obese])
+qqline(energy$expend[ind.obese])
 shapiro_test(energy$expend[ind.lean])
-
+qqnorm(energy$expend[ind.lean])
+qqline(energy$expend[ind.lean])
 # assumption 2: the variances for the two independent groups are equal.
 
 levene_test(energy, expend~stature)
@@ -127,6 +132,8 @@ names(WT) <- "weight"
 WT$genotype <- "WT"
 
 KO_WT <- rbind(KO,WT)
+shapiro.test(WT$weight)
+shapiro.test(KO$weight)
 
 boxplot(KO_WT$weight ~ KO_WT$genotype, main="Mice weight at 18 weeks", xlab="", ylab="")
 
@@ -144,13 +151,13 @@ sim.p.t.test <- NULL
 sim.p.wilcox.test <- NULL
 
 for (i in 1:1000) {
-  KO <- runif(3, min=27, max=29)
-  WT <- runif(3, min=29, max=34)
+  KO <- runif(3, min=30, max=34)
+  WT <- runif(3, min=27, max=29)
   KO <- as.data.frame(KO)
-  names(KO)[1] <- "weight"
+  names(KO) <- "weight"
   KO$genotype <- "KO"
   WT <- as.data.frame(WT)
-  names(WT)[1] <- "weight"
+  names(WT) <- "weight"
   WT$genotype <- "WT"
 
   KO_WT <- rbind(KO,WT)
@@ -167,7 +174,9 @@ str(sim.p.welch.test)
 str(sim.p.t.test)
 sum(sim.p.welch.test < 0.05)
 sum(sim.p.t.test < 0.05)
+sum(sim.p.wilcox.test<0.05)
 plot(sim.p.t.test,sim.p.welch.test)
+
 adj.bonf <- p.adjust(sim.p.welch.test, method="bonf")
 sum(adj.bonf < 0.05)
 adj.BH <- p.adjust(sim.p.welch.test, method="BH")
@@ -192,7 +201,7 @@ coagulation %>% group_by(diet) %>% get_summary_stats(coag, type = "mean_sd")
 
 boxplot(coagulation$coag ~ coagulation$diet)
 
-ggboxplot(coagulation, x="diet",y="coag")
+ggboxplot(coagulation, x="diet",y="coag",add="jitter")
 
 # check normality
 ind.A <- which(coagulation$diet=="A")

diff --git a/docs/assets/Scripts/Day3_afternoon_multiple_regression.R b/docs/assets/Scripts/Day3_afternoon_multiple_regression.R
@@ -0,0 +1,144 @@
+#####################################
+# Introduction to Statistics with R #
+# Multiple Regression               #
+# Joao Lourenco                     #
+# 29.01.2025                        #
+#####################################
+
+# clear the environment
+rm(list = ls())
+
+# clear the command line
+cat("\014")
+
+
+###########################################
+# multiple regression with two variables  #
+###########################################
+
+# load the class dataset
+class <- read.csv("/Users/rachelmarcone/Desktop/Introduction-to-statistics-with-R/docs/assets/exercises/class.csv")
+
+# fit model with two independent variables
+model <- lm(Height ~ Age + Weight, data = class)
+summary(model)
+
+# The order doesn't matter
+summary(lm(Height ~ Weight + Age  , data = class))
+
+# compare to the models where each variable is considered separately
+summary(lm(Height ~Age  , data = class))
+summary(lm(Height ~ Weight, data = class))
+
+
+# example of a case in which the adjusted R2 decreseases when the number of variables increases
+
+y <- rnorm(10) # create a random vector
+X <- matrix(rnorm(100),ncol = 10, nrow = 10) #create a random matrix
+
+# r-squared
+plot(sapply( 1:10, function(i) summary(lm(y ~ X[,1:i]))$r.squared),type = "l")
+
+# adjusted r-squared
+plot(sapply( 1:10, function(i) summary(lm(y ~ X[,1:i]))$adj.r.squared),type = "l")
+
+# last model
+summary(lm(y ~ X))
+
+#########################################################
+# Categorical variables, dummy variables and contrasts  #
+#########################################################
+
+## categorical variable
+
+# convert Gender to a factor
+class$Gender <- as.factor(class$Gender)
+class$Gender
+as.numeric(class$Gender)
+
+summary(lm(Height ~  Age + Gender, data = class ))
+
+
+# difference in means between males and females
+#?tapply
+#?diff
+means <- tapply(class$Height, class$Gender, mean)
+diff(means)
+
+# where does this difference come from
+summary(lm(Height ~  Gender, data = class ))
+summary(lm(Height ~  Age + Gender, data = class ))
+
+
+## interaction between Age and Gender
+summary(lm(Height ~ Age + Gender + Age:Gender, data = class))
+#summary(lm(Height ~ Age*Gender, data = class))
+
+
+## What if males were the baseline
+
+# create a new categorical variable
+class$Gender1 <- relevel(class$Gender, ref="M") 
+
+# fit the model
+summary(lm(Height ~ Age + Gender1, data = class))
+
+
+
+###########################
+# Diagnostic tools        #
+###########################
+
+attach(class)
+
+
+## examination of residuals
+model <- lm(Height ~ Age , data = class)
+plot(class$Age, residuals(model)) # works only for simple regression
+plot(fitted(model), residuals(model)) # works only for simple regression
+
+# hat values
+?lm.influence
+hat <- lm.influence(model)
+plot(hat$hat, ylim = range(0,hat$hat, 3*2/19))
+abline(h=c(2,3)*2/19, lty=2, col=c("blue","red"))
+
+
+# Predictions with confidence intervals
+?predict.lm
+preds <- predict(model, interval = "prediction")
+attach(class)
+# Plot the data
+plot(Age, Height)
+
+# Add regression line
+abline(model, col = "red", lwd = 2)
+
+
+
+
+# Generate new predictor values for smooth plotting
+new_x = seq(min(Age),max(Age),length.out = 100)
+
+#  get the prediction interval
+prediction_interval <- predict(model, 
+                                newdata = data.frame(Age = new_x),
+                                interval = "prediction")
+
+# get the confidence intervals for a given level of confidence
+confidence_interval <- predict(model,
+                               newdata = data.frame(Age = new_x),
+                               interval = "confidence")
+
+# Create the scatterplot
+plot(Age, Height, ylim = range(prediction_interval[, "lwr"],prediction_interval[, "upr"]))
+
+# Add regression line
+abline(model, col = "red", lwd = 2)
+
+
+# Add bands
+lines(new_x, confidence_interval[, "lwr"], lty = "dashed")
+lines(new_x, confidence_interval[, "upr"], lty = "dashed")
+lines(new_x, prediction_interval[, "lwr"], lty = "dotted")
+lines(new_x, prediction_interval[, "upr"], lty = "dotted")