Analysis of Covariance

# R from zero to hero
# Marco Plebani 29 May 2018

rm(list=ls())

Often we are dealing with experiments or studies in which more than one treatment is applied simultaneously and we want to know what’s their relative effect on the response variable.
We will look at cases in which we are manipulating two explanatory variables.

If one explanatory variable is categorical and the other is continuous, the analysis traditionally used is: Analysis of Covariance (ANCOVA).

# data from: https://rcompanion.org/rcompanion/e_04.html
Input = ("
Species Temp Pulse
ex 20.8 67.9
ex 20.8 65.1
ex 24 77.3
ex 24 78.7
ex 24 79.4
ex 24 80.4
ex 26.2 85.8
ex 26.2 86.6
ex 26.2 87.5
ex 26.2 89.1
ex 28.4 98.6
ex 29 100.8
ex 30.4 99.3
ex 30.4 101.7
niv 17.2 44.3
niv 18.3 47.2
niv 18.3 47.6
niv 18.3 49.6
niv 18.9 50.3
niv 18.9 51.8
niv 20.4 60
niv 21 58.5
niv 21 58.9
niv 22.1 60.7
niv 23.5 69.8
niv 24.2 70.9
niv 25.9 76.2
niv 26.5 76.1
niv 26.5 77
niv 26.5 77.7
niv 28.6 84.7
")


dd <- read.table(textConnection(Input),header=TRUE)
str(dd)
names(dd) <- c("Species","Temp","Pulse") # make column names more readable
library(lattice)
xyplot(Pulse ~ Temp, groups=Species, data=dd, type=c("p", "r"), auto.key=T)
lm1 <- lm(Pulse ~ Temp*Species, data=dd)
lm2 <- lm(Pulse ~ Temp+Species, data=dd)
lm3 <- lm(Pulse ~ Temp, data=dd)
lm4 <- lm(Pulse ~ Species, data=dd)
lm0 <- lm(Pulse ~ 1, data=dd)
AIC(lm1,lm2,lm3,lm4,lm0)
# lm2 is the most "economical" model
summary(lm1)
summary(lm2)

# let's plot a nicer graph with plot()
# first subset by species:
unique(dd$Species)
ex <- subset(dd, dd$Species=="ex")
niv <- dd[dd$Species=="niv",]
# this normally work... I think the species names have some spaces before or after the visible characters.
# this way will work:
ex <- subset(dd, dd$Species==unique(dd$Species)[1])
niv <- dd[dd$Species==unique(dd$Species)[2],]
# run a regression for each:
lm_ex <- lm(Pulse~Temp, data=ex)
lm_niv <- lm(Pulse~Temp, data=niv)
# plot the first subset
plot(Pulse~Temp, data=ex,
xlim=c(16,33), ylim=c(40,105),
pch=19, # or 16
col="red"
)

# add the second subset:
points(Pulse~Temp, data=niv)
# plot regression lines:
abline(lm_ex,
lty="solid", col="red"
)
abline(lm_niv,
lty="dashed", col="black"
)
legend("topleft", title="Species:",
legend=unique(dd$Species),
pch=c(19,1),
lty=c("solid", "dashed"),
col=c("red", "black")
)

Plot the lines so that they only predict values within the range of observed values:

# Set a sequence of hypothetic values for the explanatory variable:
temps_ex <- data.frame(Temp=seq(min(ex$Temp), max(ex$Temp), 0.01))
# This just helps keeping the following lines of code short and clean:
texT<- temps_ex$Temp
# predict values with standard error
yy_ex<- predict(lm_ex, data.frame(x=temps_ex), type="response", se=T)


# do the same for niv:
temps_niv <- data.frame(Temp=seq(min(niv$Temp), max(niv$Temp), 0.01))
tnivT<- temps_niv$Temp
yy_niv<- predict(lm_niv, data.frame(x=temps_niv), type="response", se=T)


plot(Pulse~Temp, data=ex,
xlim=c(16,33), ylim=c(40,105),
pch=NA
) # this plots a blank graph
# plot points:
points(Pulse~Temp, data=ex,
pch=19, # or 16
col="red"
)
points(Pulse~Temp, data=niv)
# plot fitted models:
# I'm using points() again but with the argument type="l", which plots lines
points(x=texT,y=yy_ex$fit, type="l",
lty="solid", col="red"
)
points(x=tnivT,y=yy_niv$fit, type="l",
lty="dashed", col="black"
)
legend("topleft", title="Species:",
legend=unique(dd$Species),
pch=c(19,1),
lty=c("solid", "dashed"),
col=c("red", "black")
)

Add CI – this bit of code is a bit messy-looking but it works:

# this just helps keeping the following lines of code short and clean:
yysep_ex <- yy_ex$fit+1.96 * yy_ex$se
yysem_ex <- yy_ex$fit-1.96 * yy_ex$se

# Again, this just helps keeping the following lines of code short and clean:
yysep_niv <- yy_niv$fit+1.96 * yy_niv$se
yysem_niv <- yy_niv$fit-1.96 * yy_niv$se

Function for making transparent colors (found here):

makeTransparent<-function(someColor, alpha=100)
{
newColor<-col2rgb(someColor)
apply(newColor, 2, function(curcoldata){rgb(red=curcoldata[1], green=curcoldata[2],
blue=curcoldata[3],alpha=alpha, maxColorValue=255)})
}

Confidence interval as a shaded area:

polygon(c(texT,rev(texT)),c(yysep_ex,rev(yysem_ex)), col= makeTransparent("red"), border=NA)
polygon(c(tnivT,rev(tnivT)),c(yysep_niv,rev(yysem_niv)),col=makeTransparent("black"), border=NA)

Here I explain how to interpret the results of a slightly more complicated example, with three explanatory variables, two categorical and one continuous.

Advertisements