CompBioMethods: Confidence Intervals in linear regression

Tamas Schauer

Setup


set.seed(6)

n1 <- 10
n2 <- 50

x1 <- rnorm(n1, sd=2)
y1 <- 5 + 2*x1 + rnorm(n1, sd=1)

x2 <- rnorm(n2, sd=2)
y2 <- 10 + 1*x2 + rnorm(n2, sd=1)

x <- c(x1,x2)
y <- c(y1,y2)

df <- data.frame(x = x, 
                 y = y,
                 group = factor(c(rep("g1", n1), 
                                  rep("g2", n2))))

head(df)


            x         y group
1  0.53921196  7.806275    g1
2 -1.25997083  1.301459    g1
3  1.73731966  9.127846    g1
4  3.45439103 11.540216    g1
5  0.04837528  4.497196    g1
6  0.73605035  6.526706    g1

Linear regression

Fit Summary


fit1 <- lm(y1 ~ x1)
summary(fit1)


Call:
lm(formula = y1 ~ x1)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8987 -0.8493  0.1223  0.7652  1.5437 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.3448     0.3715  14.389 5.32e-07 ***
x1            1.7021     0.2116   8.044 4.20e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.166 on 8 degrees of freedom
Multiple R-squared:   0.89, Adjusted R-squared:  0.8762 
F-statistic: 64.71 on 1 and 8 DF,  p-value: 4.197e-05


fit2 <- lm(y2 ~ x2)
summary(fit2)


Call:
lm(formula = y2 ~ x2)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8842 -0.6105 -0.1154  0.6376  2.6806 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.91357    0.13962   71.01   <2e-16 ***
x2           0.98965    0.07134   13.87   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9819 on 48 degrees of freedom
Multiple R-squared:  0.8003,    Adjusted R-squared:  0.7962 
F-statistic: 192.4 on 1 and 48 DF,  p-value: < 2.2e-16


fit <- lm(y ~ group + x + group:x, data = df)
summary(fit)


Call:
lm(formula = y ~ group + x + group:x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8987 -0.6329 -0.1012  0.6637  2.6806 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.3448     0.3218  16.608  < 2e-16 ***
groupg2       4.5688     0.3524  12.964  < 2e-16 ***
x             1.7021     0.1833   9.285 6.27e-13 ***
groupg2:x    -0.7125     0.1975  -3.608 0.000659 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.01 on 56 degrees of freedom
Multiple R-squared:  0.877, Adjusted R-squared:  0.8704 
F-statistic: 133.1 on 3 and 56 DF,  p-value: < 2.2e-16

Confidence intervals


pred1 <- data.frame(x1 = seq(min(x1),max(x1),0.1))
pred1 <- cbind(pred1, predict(object = fit1, newdata = pred1, interval = "confidence"))

head(pred1)


         x1       fit        lwr      upr
1 -2.618409 0.8879563 -0.7334178 2.509330
2 -2.518409 1.0581684 -0.5218678 2.638205
3 -2.418409 1.2283806 -0.3107546 2.767516
4 -2.318409 1.3985927 -0.1001139 2.897299
5 -2.218409 1.5688049  0.1100149 3.027595
6 -2.118409 1.7390170  0.3195888 3.158445


pred2 <- data.frame(x2 = seq(min(x2), max(x2),0.1))
pred2 <- cbind(pred2, predict(object = fit2, newdata = pred2, interval = "confidence"))

head(pred2)


         x2      fit      lwr      upr
1 -3.220099 6.726804 6.211664 7.241944
2 -3.120099 6.825769 6.322623 7.328914
3 -3.020099 6.924734 6.433457 7.416010
4 -2.920099 7.023698 6.544156 7.503241
5 -2.820099 7.122663 6.654708 7.590618
6 -2.720099 7.221628 6.765105 7.678151


pred <- data.frame(x = rep(seq(min(x),max(x),0.1), each=2), group = c("g1","g2"))
pred <- cbind(pred, predict(object = fit, newdata = pred, interval = "confidence"))

head(pred)


          x group         fit       lwr      upr
1 -3.220099    g1 -0.13619325 -1.549365 1.276979
2 -3.220099    g2  6.72680387  6.198727 7.254880
3 -3.120099    g1  0.03401889 -1.346512 1.414549
4 -3.120099    g2  6.82576869  6.309988 7.341549
5 -3.020099    g1  0.20423104 -1.143868 1.552330
6 -3.020099    g2  6.92473351  6.421120 7.428347

Plot

Base graphics


par(mfrow=c(1,2), mar = c(4,4,4,4), cex=1.25)

plot(y1 ~ x1, pch = 19, col = 1, 
     xlab = "x", ylab = "y",
     xlim = c(-5,6), ylim = c(-5,20))

points(y2 ~ x2, pch = 19, col = 2)

polygon(x = c(pred1$x1, rev(pred1$x1)), y = c(pred1$lwr, rev(pred1$upr)), col = "#DDDDDD88", border = NA)
polygon(x = c(pred2$x2, rev(pred2$x2)), y = c(pred2$lwr, rev(pred2$upr)), col = "#DDDDDD88", border = NA)

lines(fit ~ x1, data = pred1, col="#666666", lwd=2)
lines(fit ~ x2, data = pred2, col="#666666", lwd=2)


plot(y ~ x, data = df,
     pch = 19, col = df$group, 
     xlim = c(-5,6), ylim = c(-5,20))

pred_g1 <- pred[pred$group == "g1",]
pred_g2 <- pred[pred$group == "g2",]

polygon(x = c(pred_g1$x, rev(pred_g1$x)), y = c(pred_g1$lwr, rev(pred_g1$upr)), col = "#DDDDDD88", border = NA)
polygon(x = c(pred_g2$x, rev(pred_g2$x)), y = c(pred_g2$lwr, rev(pred_g2$upr)), col = "#DDDDDD88", border = NA)

lines(fit ~ x, data = pred_g1, col="#666666", lwd=2)
lines(fit ~ x, data = pred_g2, col="#666666", lwd=2)

compare left and right: slopes are the same
confidence intervals different
it is beneficial to fit combined model (assumption should be checked though)

ggplot


library(ggplot2)
library(ggpubr)

gg1 <- ggplot(df, aes(x=x, y=y, group = group)) + 
      theme_bw() +
      theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
            text = element_text(size=22)) +
      coord_cartesian(xlim = c(-5,6), ylim = c(-5,20)) + 
      geom_point(aes(colour = group), size = 2) +
      geom_smooth(method=lm, formula = y ~ x, se = TRUE,  aes(group = group), 
                  colour='#666666' , fill='#DDDDDD', alpha = 0.5) +
      scale_color_manual(values=c("black", "red"))


gg2 <- ggplot(df, aes(x=x, y=y, fill = group)) + 
      theme_bw() +
      theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
            text = element_text(size=22)) +
      coord_cartesian(xlim = c(-5,6), ylim = c(-5,20)) + 
      geom_point(aes(colour = group), size = 2) +
      geom_ribbon(data = pred, aes(x=x, y=fit,ymin = lwr, ymax = upr, group = group), 
                  fill='#DDDDDD', alpha = 0.5) +
      geom_line(data = pred, aes(y = fit), size = 1, colour="#666666") +
      scale_color_manual(values=c("black", "red"))

ggarrange(gg1, gg2,
          ncol = 2, nrow = 1)

it looks the same as base graphics
geom_smooth() is not the best choice because it fits 2 independent lines
add confidence intervals calculated by predict() using geom_ribbon()

Confidence Intervals in linear regression

Table of Contents

Setup

Linear regression

Fit Summary

Confidence intervals

Plot

Base graphics

ggplot

Corrections

Citation