Averaging data in equal-sized bins falsely turns weak associations into strong ones.
# Simulation: averaging within bins inflates an (absent) correlation.
# x and y are drawn independently, so any correlation between them is chance.

# Mean of `values` within `n_bins` consecutive, near-equal-sized bins.
#
# values: numeric vector, already in the order that defines neighbourhood
#         (here: rows sorted by x).
# n_bins: number of bins, between 1 and length(values).
#
# Returns a numeric vector of length `n_bins`. Every element of `values` is
# assigned to exactly one bin, also when length(values) is not a multiple of
# `n_bins` (fractional `from:to` index ranges would otherwise skip rows).
bin_means <- function(values, n_bins) {
  n_obs <- length(values)
  stopifnot(n_bins >= 1, n_bins <= n_obs)
  # as.numeric() avoids integer overflow in the product for large inputs
  bin_id <- ceiling(as.numeric(seq_len(n_obs)) * n_bins / n_obs)
  as.vector(tapply(values, bin_id, mean))
}

# Scatter plot of y against x with the Pearson r (top left), the p-value of
# cor.test() (top right) and the least-squares line (red).
#
# title:      plot title.
# point_col:  colour of the points (hex with alpha to show overplotting).
# xlab, ylab: axis labels.
# ...:        further arguments passed on to plot(), e.g. xlim / ylim.
plot_with_fit <- function(x, y, title, point_col, xlab = "x", ylab = "y", ...) {
  plot(x, y, main = title, pch = 19, col = point_col,
       xlab = xlab, ylab = ylab, ...)
  legend("topleft", legend = paste("r =", round(cor(x, y), 2)))
  p_value <- cor.test(x, y)$p.value
  legend("topright",
         legend = paste("pval =",
                        format(p_value, digits = 2, scientific = TRUE)))
  abline(lm(y ~ x), col = "red3", lwd = 2)
  invisible(NULL)
}

# set n
n <- 5000

# generate data
df <- data.frame(
  x = rnorm(n),
  y = rnorm(n)
)

# order by x
df <- df[order(df$x), ]

# plot data
plot_with_fit(df$x, df$y, title = "", point_col = "#00000022",
              xlim = c(-4, 4), ylim = c(-4, 4))

# ---- 2 x 2 panel, free axis limits ----
par(mfrow = c(2, 2), oma = c(2, 2, 2, 2))

# plot without binning
plot_with_fit(df$x, df$y, title = "no binning", point_col = "#00000022")

# iterate through different number of bins
for (n_bins in c(150, 50, 20)) {
  x_binned <- bin_means(df$x, n_bins)
  y_binned <- bin_means(df$y, n_bins)
  plot_with_fit(x_binned, y_binned,
                title = paste("n bins =", n_bins), point_col = "#00000055",
                xlab = "x_binned", ylab = "y_binned")
}

# ---- 2 x 2 panel, axis limits fixed to those of the raw data plot ----
par(mfrow = c(2, 2), oma = c(2, 2, 2, 2))

# plot without binning
plot_with_fit(df$x, df$y, title = "no binning", point_col = "#00000022",
              xlim = c(-4, 4), ylim = c(-4, 4))

# iterate through different number of bins
for (n_bins in c(150, 50, 20)) {
  x_binned <- bin_means(df$x, n_bins)
  y_binned <- bin_means(df$y, n_bins)
  plot_with_fit(x_binned, y_binned,
                title = paste("n bins =", n_bins), point_col = "#00000055",
                xlab = "x_binned", ylab = "y_binned",
                xlim = c(-4, 4), ylim = c(-4, 4))
}
If you see mistakes or want to suggest changes, please create an issue on the source repository.
For attribution, please cite this work as
Schauer (2019, Nov. 26). CompBioMethods: Data Binning and Correlation. Retrieved from https://tschauer.github.io/blog/posts/2019-11-29-data-binning-and-correlation/
BibTeX citation
@misc{schauer2019data, author = {Schauer, Tamas}, title = {CompBioMethods: Data Binning and Correlation}, url = {https://tschauer.github.io/blog/posts/2019-11-29-data-binning-and-correlation/}, year = {2019} }