A comparison of statistical methods for analyzing mass spectrometry data

Analysis of mass spectrometry data with t.test, lm or limma.

Tamas Schauer https://github.com/tschauer (LMU - Biomedical Center)https://www.compbio.bmc.med.uni-muenchen.de/about_us/people/index.html
2020-09-24

Table of Contents



Data Description


Setup


# load data
ms_data <- read.table("data_massspec/elife-56325-supp1-v1.txt", header = T, row.names = 1, stringsAsFactors = FALSE)

ms_data[1:5,1:4]

           DomA_c_1_mean DomA_c_2_mean DomA_c_3_mean DomB_c_1_mean
A0A0B4JCZ0      30.07340      28.10453      30.76833      29.19630
A0A0B4JD97      26.56143      29.04703      27.66783      28.51773
A0A0B4JDG4      26.90077      27.31623      26.88430      27.35533
A0A0B4K5Z8      27.83300      26.35150      27.51597      25.59287
A0A0B4K651      26.55867      26.90197      26.92103      26.23967

library(org.Dm.eg.db)
# convert protein ids to gene names
gene_names <- mapIds(x = org.Dm.eg.db, keys = gsub("\\;.*","", rownames(ms_data)), 
                     keytype = "UNIPROT", column = "SYMBOL", multiVals = "first")

# setup conditions
my_conditions <- factor(gsub("_.*","",colnames(ms_data)))
my_conditions

[1] DomA DomA DomA DomB DomB DomB CTRL CTRL CTRL
Levels: CTRL DomA DomB

# setup comparisons
my_comparisons <- list(c(levels(my_conditions)[2], levels(my_conditions)[1]),
                       c(levels(my_conditions)[3], levels(my_conditions)[1]),
                       c(levels(my_conditions)[2], levels(my_conditions)[3]))
my_comparisons

[[1]]
[1] "DomA" "CTRL"

[[2]]
[1] "DomB" "CTRL"

[[3]]
[1] "DomA" "DomB"

Students t-test


for(j in seq_along(my_comparisons)){
      
      res_ttest <- data.frame(matrix(NA, nrow = nrow(ms_data), ncol = 2),
                              row.names = rownames(ms_data))
      
      for(i in 1:nrow(ms_data)){
            
            fit <-  t.test(x = ms_data[i, my_conditions == my_comparisons[[j]][1]], 
                           y = ms_data[i, my_conditions == my_comparisons[[j]][2]], 
                           var.equal = TRUE)
            
            res_ttest[i,1] <- fit$estimate[1] - fit$estimate[2]
            res_ttest[i,2] <- fit$p.value
            
            rm(list = "fit")
      }
      
      my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
      
      colnames(res_ttest) <- c(paste0("coef_",my_comparison_name),
                               paste0("pval_",my_comparison_name))
      
      res_ttest$gene_name <- gene_names
      
      assign(paste0("res_ttest_", my_comparison_name), res_ttest)
      
      rm(list = "res_ttest")
}

head(res_ttest_DomAvsCTRL)

           coef_DomAvsCTRL pval_DomAvsCTRL       gene_name
A0A0B4JCZ0      -0.1522889      0.86138949         CG42668
A0A0B4JD97      -1.1075667      0.20122473            tacc
A0A0B4JDG4       0.4864667      0.21138652 pre-mod(mdg4)-V
A0A0B4K5Z8       0.1355333      0.89152875           Sec23
A0A0B4K651       1.0487889      0.14619876           Best1
A0A0B4K6E6       1.1618556      0.06399887         CG17734

Linear model with lm


for(j in seq_along(my_comparisons)){
      
      res_lm <- data.frame(matrix(NA, nrow = nrow(ms_data), ncol = 2),
                           row.names = rownames(ms_data))
      
      for(i in 1:nrow(ms_data)){
            
            y <- as.numeric(ms_data[i,])
            my_conditions <- relevel(relevel(my_conditions, ref = my_comparisons[[j]][1]), ref = my_comparisons[[j]][2])
            
            fit <- lm(y ~ my_conditions)
            
            res_lm[i,1] <- coef(summary(fit))[2,1]
            res_lm[i,2] <- coef(summary(fit))[2,4]
      }
      
      my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
      
      colnames(res_lm) <- c(paste0("coef_",my_comparison_name),
                            paste0("pval_",my_comparison_name))
      
      res_lm$gene_name <- gene_names
      
      assign(paste0("res_lm_", my_comparison_name), res_lm)
      
      rm(list = "res_lm")
}

head(res_lm_DomAvsCTRL)

           coef_DomAvsCTRL pval_DomAvsCTRL       gene_name
A0A0B4JCZ0      -0.1522889       0.8385450         CG42668
A0A0B4JD97      -1.1075667       0.1268511            tacc
A0A0B4JDG4       0.4864667       0.1528289 pre-mod(mdg4)-V
A0A0B4K5Z8       0.1355333       0.8953270           Sec23
A0A0B4K651       1.0487889       0.2617120           Best1
A0A0B4K6E6       1.1618556       0.0280833         CG17734

Linear model with limma


library(limma)

for(j in seq_along(my_comparisons)){
      
      res_limma <- data.frame(matrix(NA, nrow = nrow(ms_data), ncol = 2),
                              row.names = rownames(ms_data))
      
      design <- model.matrix( ~ 0 + my_conditions) 
      colnames(design) <- levels(my_conditions)
      
      my_comparison <- paste(my_comparisons[[j]], collapse = "-")
      my_contrast <- makeContrasts(my_comparison, levels = design)
      
      ###
      
      fit <- eBayes(contrasts.fit(lmFit(ms_data, design), my_contrast))
      
      res_limma[,1] <- as.numeric(fit$coefficients)
      res_limma[,2] <- as.numeric(fit$p.value)
      
      rm(list = "fit")
      
      ###
      
      my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
      
      colnames(res_limma) <- c(paste0("coef_",my_comparison_name),
                               paste0("pval_",my_comparison_name))
      
      res_limma$gene_name <- gene_names
      
      assign(paste0("res_limma_", my_comparison_name), res_limma)
      
      rm(list = "res_limma")
}

head(res_limma_DomAvsCTRL)

           coef_DomAvsCTRL pval_DomAvsCTRL       gene_name
A0A0B4JCZ0      -0.1522889      0.81184589         CG42668
A0A0B4JD97      -1.1075667      0.07543584            tacc
A0A0B4JDG4       0.4864667      0.14075893 pre-mod(mdg4)-V
A0A0B4K5Z8       0.1355333      0.87566598           Sec23
A0A0B4K651       1.0487889      0.18399592           Best1
A0A0B4K6E6       1.1618556      0.01369977         CG17734

Volcano Plots


par(mfrow=c(3,3), mar = c(4,4,0,0), oma = c(1,3,1,1), mgp = c(2,1,0) ,cex=1.25)

fav_genes <- c("dom", "Tip60", "Arp6", "ocm")
my_methods <- c("ttest","lm","limma")

for(i in seq_along(my_methods)){
      
      my_method <- my_methods[i]
      
      for(j in seq_along(my_comparisons)){
            
            my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
            
            res_plot <- get(paste0("res_",my_method,"_", my_comparison_name))
            
            plot(res_plot[,1], -log10(res_plot[,2]),
                 xlim = c(-10,10), ylim = c(0,12),
                 ylab = "-log10 p-value",
                 xlab = gsub("vs", " vs ", my_comparison_name), 
                 pch = 19, col = "grey", cex = 0.25)
            
            my_sign <- p.adjust(res_plot[,2], method = "BH") < 0.05
            
            points(res_plot[,1][my_sign], 
                   -log10(res_plot[,2][my_sign]),
                   pch = 19, col = "darkred", cex = 0.75)
            
            my_labeled <- res_plot$gene_name %in% fav_genes
            
            points(res_plot[,1][my_labeled], 
                   -log10(res_plot[,2][my_labeled]),
                   pch = 19, col = "black", cex = 0.25)
            
            text(res_plot[,1][my_labeled], 
                 -log10(res_plot[,2][my_labeled]),
                 res_plot$gene_name[my_labeled],
                 col = "black", adj = c(0.5,-0.25), cex = 0.8)
            
            legend("topleft", legend = c("all","fdr < 0.05", "labeled"),
                   pch = 19, col = c("grey", "darkred", "black"), pt.cex = c(0.5,1,0.5))
            
            if(j == 1){
                  mtext(my_method, side = 2, line = 3.5, cex =1.5, font = 2)
            }
      }
}

Conclusions

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.

Citation

For attribution, please cite this work as

Schauer (2020, Sept. 24). CompBioMethods: A comparison of statistical methods for analyzing mass spectrometry data. Retrieved from https://tschauer.github.io/blog/posts/2020-09-24-mass-spec-analysis-with-ttest-lm-or-limma/

BibTeX citation

@misc{schauer2020a,
  author = {Schauer, Tamas},
  title = {CompBioMethods: A comparison of statistical methods for analyzing mass spectrometry data},
  url = {https://tschauer.github.io/blog/posts/2020-09-24-mass-spec-analysis-with-ttest-lm-or-limma/},
  year = {2020}
}