Confidence Intervals in linear regression

In this post I compare base graphics and ggplot for drawing confidence intervals in linear regression.

Tamas Schauer https://github.com/tschauer (LMU - Biomedical Center)https://www.compbio.bmc.med.uni-muenchen.de/about_us/people/index.html
2020-09-17

Table of Contents



Setup


set.seed(6)

n1 <- 10
n2 <- 50

x1 <- rnorm(n1, sd=2)
y1 <- 5 + 2*x1 + rnorm(n1, sd=1)

x2 <- rnorm(n2, sd=2)
y2 <- 10 + 1*x2 + rnorm(n2, sd=1)

x <- c(x1,x2)
y <- c(y1,y2)

df <- data.frame(x = x, 
                 y = y,
                 group = factor(c(rep("g1", n1), 
                                  rep("g2", n2))))

head(df)

            x         y group
1  0.53921196  7.806275    g1
2 -1.25997083  1.301459    g1
3  1.73731966  9.127846    g1
4  3.45439103 11.540216    g1
5  0.04837528  4.497196    g1
6  0.73605035  6.526706    g1

Linear regression

Fit Summary


fit1 <- lm(y1 ~ x1)
summary(fit1)

Call:
lm(formula = y1 ~ x1)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8987 -0.8493  0.1223  0.7652  1.5437 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.3448     0.3715  14.389 5.32e-07 ***
x1            1.7021     0.2116   8.044 4.20e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.166 on 8 degrees of freedom
Multiple R-squared:   0.89, Adjusted R-squared:  0.8762 
F-statistic: 64.71 on 1 and 8 DF,  p-value: 4.197e-05

fit2 <- lm(y2 ~ x2)
summary(fit2)

Call:
lm(formula = y2 ~ x2)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8842 -0.6105 -0.1154  0.6376  2.6806 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.91357    0.13962   71.01   <2e-16 ***
x2           0.98965    0.07134   13.87   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9819 on 48 degrees of freedom
Multiple R-squared:  0.8003,    Adjusted R-squared:  0.7962 
F-statistic: 192.4 on 1 and 48 DF,  p-value: < 2.2e-16

fit <- lm(y ~ group + x + group:x, data = df)
summary(fit)

Call:
lm(formula = y ~ group + x + group:x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8987 -0.6329 -0.1012  0.6637  2.6806 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.3448     0.3218  16.608  < 2e-16 ***
groupg2       4.5688     0.3524  12.964  < 2e-16 ***
x             1.7021     0.1833   9.285 6.27e-13 ***
groupg2:x    -0.7125     0.1975  -3.608 0.000659 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.01 on 56 degrees of freedom
Multiple R-squared:  0.877, Adjusted R-squared:  0.8704 
F-statistic: 133.1 on 3 and 56 DF,  p-value: < 2.2e-16

Confidence intervals


pred1 <- data.frame(x1 = seq(min(x1),max(x1),0.1))
pred1 <- cbind(pred1, predict(object = fit1, newdata = pred1, interval = "confidence"))

head(pred1)

         x1       fit        lwr      upr
1 -2.618409 0.8879563 -0.7334178 2.509330
2 -2.518409 1.0581684 -0.5218678 2.638205
3 -2.418409 1.2283806 -0.3107546 2.767516
4 -2.318409 1.3985927 -0.1001139 2.897299
5 -2.218409 1.5688049  0.1100149 3.027595
6 -2.118409 1.7390170  0.3195888 3.158445

pred2 <- data.frame(x2 = seq(min(x2), max(x2),0.1))
pred2 <- cbind(pred2, predict(object = fit2, newdata = pred2, interval = "confidence"))

head(pred2)

         x2      fit      lwr      upr
1 -3.220099 6.726804 6.211664 7.241944
2 -3.120099 6.825769 6.322623 7.328914
3 -3.020099 6.924734 6.433457 7.416010
4 -2.920099 7.023698 6.544156 7.503241
5 -2.820099 7.122663 6.654708 7.590618
6 -2.720099 7.221628 6.765105 7.678151

pred <- data.frame(x = rep(seq(min(x),max(x),0.1), each=2), group = c("g1","g2"))
pred <- cbind(pred, predict(object = fit, newdata = pred, interval = "confidence"))

head(pred)

          x group         fit       lwr      upr
1 -3.220099    g1 -0.13619325 -1.549365 1.276979
2 -3.220099    g2  6.72680387  6.198727 7.254880
3 -3.120099    g1  0.03401889 -1.346512 1.414549
4 -3.120099    g2  6.82576869  6.309988 7.341549
5 -3.020099    g1  0.20423104 -1.143868 1.552330
6 -3.020099    g2  6.92473351  6.421120 7.428347

Plot

Base graphics


par(mfrow=c(1,2), mar = c(4,4,4,4), cex=1.25)

plot(y1 ~ x1, pch = 19, col = 1, 
     xlab = "x", ylab = "y",
     xlim = c(-5,6), ylim = c(-5,20))

points(y2 ~ x2, pch = 19, col = 2)

polygon(x = c(pred1$x1, rev(pred1$x1)), y = c(pred1$lwr, rev(pred1$upr)), col = "#DDDDDD88", border = NA)
polygon(x = c(pred2$x2, rev(pred2$x2)), y = c(pred2$lwr, rev(pred2$upr)), col = "#DDDDDD88", border = NA)

lines(fit ~ x1, data = pred1, col="#666666", lwd=2)
lines(fit ~ x2, data = pred2, col="#666666", lwd=2)


plot(y ~ x, data = df,
     pch = 19, col = df$group, 
     xlim = c(-5,6), ylim = c(-5,20))

pred_g1 <- pred[pred$group == "g1",]
pred_g2 <- pred[pred$group == "g2",]

polygon(x = c(pred_g1$x, rev(pred_g1$x)), y = c(pred_g1$lwr, rev(pred_g1$upr)), col = "#DDDDDD88", border = NA)
polygon(x = c(pred_g2$x, rev(pred_g2$x)), y = c(pred_g2$lwr, rev(pred_g2$upr)), col = "#DDDDDD88", border = NA)

lines(fit ~ x, data = pred_g1, col="#666666", lwd=2)
lines(fit ~ x, data = pred_g2, col="#666666", lwd=2)

ggplot


library(ggplot2)
library(ggpubr)

gg1 <- ggplot(df, aes(x=x, y=y, group = group)) + 
      theme_bw() +
      theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
            text = element_text(size=22)) +
      coord_cartesian(xlim = c(-5,6), ylim = c(-5,20)) + 
      geom_point(aes(colour = group), size = 2) +
      geom_smooth(method=lm, formula = y ~ x, se = TRUE,  aes(group = group), 
                  colour='#666666' , fill='#DDDDDD', alpha = 0.5) +
      scale_color_manual(values=c("black", "red"))


gg2 <- ggplot(df, aes(x=x, y=y, fill = group)) + 
      theme_bw() +
      theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank(),
            text = element_text(size=22)) +
      coord_cartesian(xlim = c(-5,6), ylim = c(-5,20)) + 
      geom_point(aes(colour = group), size = 2) +
      geom_ribbon(data = pred, aes(x=x, y=fit,ymin = lwr, ymax = upr, group = group), 
                  fill='#DDDDDD', alpha = 0.5) +
      geom_line(data = pred, aes(y = fit), size = 1, colour="#666666") +
      scale_color_manual(values=c("black", "red"))

ggarrange(gg1, gg2,
          ncol = 2, nrow = 1)

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.

Citation

For attribution, please cite this work as

Schauer (2020, Sept. 17). CompBioMethods: Confidence Intervals in linear regression. Retrieved from https://tschauer.github.io/blog/posts/2020-09-17-confidence-intervals-in-linear-regression/

BibTeX citation

@misc{schauer2020confidence,
  author = {Schauer, Tamas},
  title = {CompBioMethods: Confidence Intervals in linear regression},
  url = {https://tschauer.github.io/blog/posts/2020-09-17-confidence-intervals-in-linear-regression/},
  year = {2020}
}