CompBioMethods: A comparison of statistical methods for analyzing mass spectrometry data

Tamas Schauer

Data Description

AP-MS data of DomA, DomB and control IP
3 biological with 3 technical replicates each
biological replicates are from different clones (c_1, c_2 etc.)
label-free quantification (LFQ) and imputation was performed previously
technical replicates were averaged
data were log-transformed
data source: Scacchetti et al. 2020, https://cdn.elifesciences.org/articles/56325/elife-56325-supp1-v1.xlsx

Setup


# load data
ms_data <- read.table("data_massspec/elife-56325-supp1-v1.txt", header = T, row.names = 1, stringsAsFactors = FALSE)

ms_data[1:5,1:4]


           DomA_c_1_mean DomA_c_2_mean DomA_c_3_mean DomB_c_1_mean
A0A0B4JCZ0      30.07340      28.10453      30.76833      29.19630
A0A0B4JD97      26.56143      29.04703      27.66783      28.51773
A0A0B4JDG4      26.90077      27.31623      26.88430      27.35533
A0A0B4K5Z8      27.83300      26.35150      27.51597      25.59287
A0A0B4K651      26.55867      26.90197      26.92103      26.23967


library(org.Dm.eg.db)
# convert protein ids to gene names
gene_names <- mapIds(x = org.Dm.eg.db, keys = gsub("\\;.*","", rownames(ms_data)), 
                     keytype = "UNIPROT", column = "SYMBOL", multiVals = "first")


# setup conditions
my_conditions <- factor(gsub("_.*","",colnames(ms_data)))
my_conditions


[1] DomA DomA DomA DomB DomB DomB CTRL CTRL CTRL
Levels: CTRL DomA DomB


# setup comparisons
my_comparisons <- list(c(levels(my_conditions)[2], levels(my_conditions)[1]),
                       c(levels(my_conditions)[3], levels(my_conditions)[1]),
                       c(levels(my_conditions)[2], levels(my_conditions)[3]))
my_comparisons


[[1]]
[1] "DomA" "CTRL"

[[2]]
[1] "DomB" "CTRL"

[[3]]
[1] "DomA" "DomB"

Students t-test

fit a t-test for each pair-wise comparison for each protein
the model includes only 6 data points in each test
3 tests (comparisons) are performed separately (multiple testing)


for(j in seq_along(my_comparisons)){
      
      res_ttest <- data.frame(matrix(NA, nrow = nrow(ms_data), ncol = 2),
                              row.names = rownames(ms_data))
      
      for(i in 1:nrow(ms_data)){
            
            fit <-  t.test(x = ms_data[i, my_conditions == my_comparisons[[j]][1]], 
                           y = ms_data[i, my_conditions == my_comparisons[[j]][2]], 
                           var.equal = TRUE)
            
            res_ttest[i,1] <- fit$estimate[1] - fit$estimate[2]
            res_ttest[i,2] <- fit$p.value
            
            rm(list = "fit")
      }
      
      my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
      
      colnames(res_ttest) <- c(paste0("coef_",my_comparison_name),
                               paste0("pval_",my_comparison_name))
      
      res_ttest$gene_name <- gene_names
      
      assign(paste0("res_ttest_", my_comparison_name), res_ttest)
      
      rm(list = "res_ttest")
}


head(res_ttest_DomAvsCTRL)


           coef_DomAvsCTRL pval_DomAvsCTRL       gene_name
A0A0B4JCZ0      -0.1522889      0.86138949         CG42668
A0A0B4JD97      -1.1075667      0.20122473            tacc
A0A0B4JDG4       0.4864667      0.21138652 pre-mod(mdg4)-V
A0A0B4K5Z8       0.1355333      0.89152875           Sec23
A0A0B4K651       1.0487889      0.14619876           Best1
A0A0B4K6E6       1.1618556      0.06399887         CG17734

Linear model with lm

fit a linear model for each protein
the model includes all 9 data points in each model
3 contrasts (comparisons) are performed on the same model


for(j in seq_along(my_comparisons)){
      
      res_lm <- data.frame(matrix(NA, nrow = nrow(ms_data), ncol = 2),
                           row.names = rownames(ms_data))
      
      for(i in 1:nrow(ms_data)){
            
            y <- as.numeric(ms_data[i,])
            my_conditions <- relevel(relevel(my_conditions, ref = my_comparisons[[j]][1]), ref = my_comparisons[[j]][2])
            
            fit <- lm(y ~ my_conditions)
            
            res_lm[i,1] <- coef(summary(fit))[2,1]
            res_lm[i,2] <- coef(summary(fit))[2,4]
      }
      
      my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
      
      colnames(res_lm) <- c(paste0("coef_",my_comparison_name),
                            paste0("pval_",my_comparison_name))
      
      res_lm$gene_name <- gene_names
      
      assign(paste0("res_lm_", my_comparison_name), res_lm)
      
      rm(list = "res_lm")
}


head(res_lm_DomAvsCTRL)


           coef_DomAvsCTRL pval_DomAvsCTRL       gene_name
A0A0B4JCZ0      -0.1522889       0.8385450         CG42668
A0A0B4JD97      -1.1075667       0.1268511            tacc
A0A0B4JDG4       0.4864667       0.1528289 pre-mod(mdg4)-V
A0A0B4K5Z8       0.1355333       0.8953270           Sec23
A0A0B4K651       1.0487889       0.2617120           Best1
A0A0B4K6E6       1.1618556       0.0280833         CG17734

Linear model with limma

fit a linear model including all data points
information is “borrowed” across proteins
3 contrasts (comparisons) are performed on the same model


library(limma)

for(j in seq_along(my_comparisons)){
      
      res_limma <- data.frame(matrix(NA, nrow = nrow(ms_data), ncol = 2),
                              row.names = rownames(ms_data))
      
      design <- model.matrix( ~ 0 + my_conditions) 
      colnames(design) <- levels(my_conditions)
      
      my_comparison <- paste(my_comparisons[[j]], collapse = "-")
      my_contrast <- makeContrasts(my_comparison, levels = design)
      
      ###
      
      fit <- eBayes(contrasts.fit(lmFit(ms_data, design), my_contrast))
      
      res_limma[,1] <- as.numeric(fit$coefficients)
      res_limma[,2] <- as.numeric(fit$p.value)
      
      rm(list = "fit")
      
      ###
      
      my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
      
      colnames(res_limma) <- c(paste0("coef_",my_comparison_name),
                               paste0("pval_",my_comparison_name))
      
      res_limma$gene_name <- gene_names
      
      assign(paste0("res_limma_", my_comparison_name), res_limma)
      
      rm(list = "res_limma")
}


head(res_limma_DomAvsCTRL)


           coef_DomAvsCTRL pval_DomAvsCTRL       gene_name
A0A0B4JCZ0      -0.1522889      0.81184589         CG42668
A0A0B4JD97      -1.1075667      0.07543584            tacc
A0A0B4JDG4       0.4864667      0.14075893 pre-mod(mdg4)-V
A0A0B4K5Z8       0.1355333      0.87566598           Sec23
A0A0B4K651       1.0487889      0.18399592           Best1
A0A0B4K6E6       1.1618556      0.01369977         CG17734

Volcano Plots

loop through methods and comparisons
plot difference against -log10 p-value
color proteins with fdr < 0.05
label some interesting proteins


par(mfrow=c(3,3), mar = c(4,4,0,0), oma = c(1,3,1,1), mgp = c(2,1,0) ,cex=1.25)

fav_genes <- c("dom", "Tip60", "Arp6", "ocm")
my_methods <- c("ttest","lm","limma")

for(i in seq_along(my_methods)){
      
      my_method <- my_methods[i]
      
      for(j in seq_along(my_comparisons)){
            
            my_comparison_name <- paste(my_comparisons[[j]], collapse = "vs")
            
            res_plot <- get(paste0("res_",my_method,"_", my_comparison_name))
            
            plot(res_plot[,1], -log10(res_plot[,2]),
                 xlim = c(-10,10), ylim = c(0,12),
                 ylab = "-log10 p-value",
                 xlab = gsub("vs", " vs ", my_comparison_name), 
                 pch = 19, col = "grey", cex = 0.25)
            
            my_sign <- p.adjust(res_plot[,2], method = "BH") < 0.05
            
            points(res_plot[,1][my_sign], 
                   -log10(res_plot[,2][my_sign]),
                   pch = 19, col = "darkred", cex = 0.75)
            
            my_labeled <- res_plot$gene_name %in% fav_genes
            
            points(res_plot[,1][my_labeled], 
                   -log10(res_plot[,2][my_labeled]),
                   pch = 19, col = "black", cex = 0.25)
            
            text(res_plot[,1][my_labeled], 
                 -log10(res_plot[,2][my_labeled]),
                 res_plot$gene_name[my_labeled],
                 col = "black", adj = c(0.5,-0.25), cex = 0.8)
            
            legend("topleft", legend = c("all","fdr < 0.05", "labeled"),
                   pch = 19, col = c("grey", "darkred", "black"), pt.cex = c(0.5,1,0.5))
            
            if(j == 1){
                  mtext(my_method, side = 2, line = 3.5, cex =1.5, font = 2)
            }
      }
}

Conclusions

p-values are less significant using t.test compared to lm or limma
it gives more power to fit a model with all data points (e.g. Tip60 in DomA vs DomB)
some proteins with a small difference are significant using t.test or lm (e.g. ocm, false positive?)
limma gives a “better separated” volcano plot.

A comparison of statistical methods for analyzing mass spectrometry data

Table of Contents

Data Description

Setup

Students t-test

Linear model with lm

Linear model with limma

Volcano Plots

Conclusions

Corrections

Citation