##############################
# The value of using multilevel models
# Example: Repeated Measures and Clustering in sections
# Author: Elli Theobald: ellij@uw.edu
# Full citation: Students are rarely independent: When, why, and how to use random effects in discipline-based education research
##############################

# load libraries
  # lme4 contains the function (lmer) to fit multilevel models
library(lme4)

# set working directory
setwd("")

# read in the data
mydata <- read.csv("RepeatedStudents.csv",head=T)
head(mydata)

# data description:
  # 2 sections (A and B), 2 treatments (treatment=1 and control=0)
  # each section got 3 doses: either 2 treatments and 1 control (B) or 2 controls and 1 treatment (A)
table(mydata$Section,mydata$Treatment,mydata$Topic)



##############
### ICC ###
##############
# Student Random Effect
mod0_Stu <- lmer(PostScore ~ 1 + (1|StudentID), 
                 data=mydata)
summary(mod0_Stu)

# makes random effect portion of model output a dataframe
out_mod0_Stu <- as.data.frame(VarCorr(mod0_Stu),head=TRUE) 

# calculates Intraclass Correlation (ICC)
btwn_stu <- out_mod0_Stu$vcov[out_mod0_Stu$grp=='StudentID'] # between cluster variation
win_stu <- out_mod0_Stu$vcov[out_mod0_Stu$grp=='Residual'] # within cluster variance
(ICC_stu <- btwn_stu/(win_stu + btwn_stu)) # calculates and prints ICC 


# Section Random Effect
mod0_Sect <- lmer(PostScore ~ 1 + (1|Section), 
                  data=mydata)
summary(mod0_Sect)

# makes random effect portion of model output a dataframe
out_mod0_Sect <- as.data.frame(VarCorr(mod0_Sect),head=TRUE) 

# calculates Intraclass Correlation for Section (ICC)
btwn_sect <- out_mod0_Sect$vcov[out_mod0_Sect$grp=='Section'] # between cluster variation
win_sect <- out_mod0_Sect$vcov[out_mod0_Sect$grp=='Residual'] # within cluster variance
(ICC_sect <- btwn_sect/(win_sect + btwn_sect)) # calculates and prints ICC

# Interpretation: 
  # Based on ICC, it is most important to test StudentID as a random effect
  # and less important to test Section as a random effect
  # Prediction: model selection will favor including StudentID and excluding Section


############################################################
#### Model selection as recommended by Zuur et al. 2009 ####
############################################################
  # Determine the best random effects structure

##############
### Step 1 ###
##############
   # Fit the most complex fixed-effects only model that explicitly tests the hypothesis
# Model 1: tests the hypothesis that treatment impacts postscore
  # note that this is fit in R's base package with the function lm (not lmer in the lme4 package)
mod1 <- lm(PostScore ~ PreScore + Treatment + Topic, 
             data=mydata)
summary(mod1)
AIC(mod1)

##############
### Step 2 ###
##############
   # Test all combinations of random effect structures
   # set REML=T (this is the default) 
# Model 2: student random effect (repeated measures) and section random effect (clustering)
mod2 <- lmer(PostScore ~ PreScore + Treatment + Topic + (1|StudentID) + (1|Section), 
             data=mydata, REML=T)
summary(mod2)
AIC(mod2)

# Model 3: student random effect only (repeated measures)
mod3 <- lmer(PostScore ~ PreScore + Treatment + Topic + (1|StudentID), 
             data=mydata, REML=T)
summary(mod3)
AIC(mod3)

# Model 4: section random effect only (clustering)
mod4 <- lmer(PostScore ~ PreScore + Treatment + Topic + (1|Section), 
             data=mydata, REML=T)
summary(mod4)
AIC(mod4)

##############
### Step 3 ###
##############
   # compare all models from steps 1 and 2 using AIC 
AIC(mod1, mod2, mod3, mod4)
   # mod3 has the lowest AIC



###############################################################
#### Backwards Model selection to Select the Fixed Effects ####
###############################################################
  # Determine the best fixed effects that explain the PostScore


# Start with the best model from above (mod3)
  # specify REML=F
mod3F <- lmer(PostScore ~ PreScore + Treatment + Topic + (1|StudentID), 
             data=mydata, REML=F)
summary(mod3F)
AIC(mod3F)

# Remove fixed effects singularly
  # If AIC increases, put the fixed effect back in

# Remove Treatment because treatment has the smallest effect
mod4F <- lmer(PostScore ~ PreScore + Topic + (1|StudentID), 
             data=mydata, REML=F)
summary(mod4F)
AIC(mod4F)
  # AIC increases so add Treatment back in

# Remove Topic because topic has the next smallest effect
mod5F <- lmer(PostScore ~ PreScore + Treatment + (1|StudentID), 
             data=mydata, REML=F)
summary(mod5F)
AIC(mod5F)
  # AIC increases so add Topic back in

# Remove PreScore because PreScore has the next smallest effect
mod6F <- lmer(PostScore ~ Treatment + Topic + (1|StudentID), 
              data=mydata, REML=F)
summary(mod6F)
AIC(mod6F)
  # AIC increases so add Topic back in

# Refit the final model for interpretation, with REML=T
modFinal <- lmer(PostScore ~ PreScore + Treatment + Topic + (1|StudentID), 
              data=mydata, REML=T)
summary(modFinal)
AIC(modFinal)


######################
### Interpretation ###
######################
# The best fitting model (with the lowest AIC) includes a student random effect (to account for the repeated measures)
  # and PreScore, Topic, and Treatment as fixed effects.