Here I use a cross-correlation method to determine the width of nucleosome-free regions in MNase-seq data.
### helper functions ###
# generate Gaussian distribution
gaussmf <- function (x, sigma, mean) {
height <- 1
mfVals = (height * exp(-(((x - mean)^2)/(2 * (sigma^2)))))
mfVals
}
# create NFR-like pattern
makeNFR <- function(d, beforeRef, afterRef){
apply(sapply(c(-0.5,0.5), function(x) {gaussmf(seq(-beforeRef, afterRef), 40, x * d)}), 1, sum)
}
# scale between minimum and maximum
scaler <- function(x, probs = c(0.01,0.99)){
(x - quantile(x, probs[1])) / (quantile(x, probs[2])-quantile(x, probs[1]))
}
### main function ###
# iterate through each row of the matrix
# use ccf function to find maximum correlation
# pick d which corresponds to the maximum correlation
calcNFRwidth <- function(my_mat,
beforeRef = 200,
afterRef = 200,
mc.cores = 16){
# here before and after is not necessarily symmetric (e.g. if matrix is not centered at NFR)
my_mat_sub <- my_mat[, (ceiling(ncol(my_mat)/2)-beforeRef):(ceiling(ncol(my_mat)/2)+afterRef-1)]
res <- parallel::mclapply(1:nrow(my_mat_sub), mc.cores = mc.cores, function(ridx){
rx <- scaler(my_mat_sub[ridx, ])
bestR <- 0
spacingV <- NA
shiftV <- NA
total_width <- beforeRef+afterRef
if(round(sd(rx), 3) != 0 & sum(is.na(rx)) == 0){
for (d in 1:(total_width+100)) {
# here before and after should be symmetric (left and right to center)
y <- makeNFR(d = d, beforeRef = total_width/2, afterRef = total_width/2)
my_ccf <- ccf(rx,y, lag.max = 50, plot=FALSE)
r <- max(my_ccf$acf)
shift <- my_ccf$lag[my_ccf$acf == r]
if (r > bestR) {
bestR <- r
shiftV <- shift
spacingV <- d
}
}
}
c(r = round(bestR, 2), width = spacingV, shift = shiftV)
})
df <- t(as.data.frame(res))
rownames(df) <- rownames(my_mat_sub)
return(df)
}
# load site centered matrix
load("data_NFRwidth/smooth.mat.TF.yeast.invivo.MNase.rda")
# use +/- 200 bp from the center
beforeRef <- 200
afterRef <- 200
file_path <- paste0("data_NFRwidth/NFRwidth.TF.yeast.invivo.MNase.set",beforeRef,"_",afterRef,".txt")
if(file.exists(file_path)){
#read NFR data (pre-calculated)
df <- read.table(file_path, row.names = 1, header = T)
} else {
# run analysis
df <- calcNFRwidth(my_mat = my_mat,
beforeRef = beforeRef,
afterRef = afterRef)
write.table(df, file = file_path, sep = "\t", row.names = T, col.names = NA)
}
head(df)
r width shift
1 0.71 279 -10
2 0.51 112 24
3 0.67 421 40
4 0.87 1 -50
5 0.68 353 50
6 0.97 320 6
# heatmap ordered by NFR width
library(RColorBrewer)
par(mfrow=c(1,1), oma=c(2,0,0,1))
plotHeatmap(mat = my_mat, df = df, plot_title = "Nucleosomes centered at TF")
# check distributions
par(mfrow=c(1,3), oma=c(2,0,0,0))
hist(df[,1], breaks = 500, main = "NFR correlation", xlab="")
hist(df[,2], breaks = 250, main = "NFR width", xlab="")
hist(df[,3], breaks = 500, main = "NFR shift", xlab="")
# choose examples by interesting values
examples <- c(which(df[,2] > 0 & df[,2] <= 1 & df[,1] > 0.9)[1],
which(df[,2] > 165 & df[,2] <= 170 & df[,1] > 0.9)[1],
which(df[,2] > 295 & df[,2] <= 305 & df[,1] > 0.9)[1],
which(df[,2] > 499 & df[,2] <= 500 & df[,1] > 0.6)[1],
which(df[,2] > 0 & df[,2] <= 1 & df[,1] < 0.7)[1],
which(df[,2] > 165 & df[,2] <= 170 & df[,1] < 0.7)[1],
which(df[,2] > 295 & df[,2] <= 305 & df[,1] < 0.7)[1],
which(df[,2] > 499 & df[,2] <= 500 & df[,1] < 0.6)[1])
par(mfrow=c(2,4), oma=c(0,0,2,0))
plotExamples(examples = examples)
# load site centered matrix
load("data_NFRwidth/smooth.mat.plusOne.yeast.invivo.MNase.rda")
# settings are different window sizes
# windows are asymmetric as the signal is centered at the +1 nucleosome
settings <- list(c(450,50),
c(400,50),
c(350,50),
c(300,50))
for(i in seq_along(settings)){
beforeRef <- settings[[i]][1]
afterRef <- settings[[i]][2]
file_path <- paste0("data_NFRwidth/NFRwidth.plusOne.yeast.invivo.PE.MNase.set",beforeRef,"_",afterRef,".txt")
if(file.exists(file_path)){
#read NFR data (pre-calculated)
df <- read.table(file_path, row.names = 1, header = T)
assign(paste0("df",i), df)
} else {
# run analysis
df <- calcNFRwidth(my_mat = my_mat,
beforeRef = beforeRef,
afterRef = afterRef)
write.table(df, file = file_path, sep = "\t", row.names = T, col.names = NA)
assign(paste0("df",i), df)
}
}
# heatmap ordered by NFR width
library(RColorBrewer)
par(mfrow=c(1,4), oma=c(1,0,1,1))
# iterate through settings
for(i in seq_along(settings)){
plotHeatmap(mat = my_mat, df = get(paste0("df",i)),
plot_title = paste0("beforeRef = ",settings[[i]][1],"; afterRef =",settings[[i]][2]))
}
title(main = "Nucleosomes centered at TSS+1 nucleosome", outer = TRUE)
# check distributions
par(mfrow=c(1,3), oma=c(2,0,0,0))
df <- df3
beforeRef = 350; afterRef = 50
hist(df[,1], breaks = 500, main = "NFR correlation", xlab="")
hist(df[,2], breaks = 250, main = "NFR width", xlab="")
hist(df[,3], breaks = 500, main = "NFR shift", xlab="")
# choose examples by interesting values
examples <- c(which(df[,2] > 0 & df[,2] <= 1 & df[,1] > 0.8)[1],
which(df[,2] > 150 & df[,2] <= 170 & df[,1] > 0.8)[1],
which(df[,2] > 295 & df[,2] <= 305 & df[,1] > 0.9)[1],
which(df[,2] > 499 & df[,2] <= 500 & df[,1] > 0.6)[2],
which(df[,2] > 0 & df[,2] <= 1 & df[,1] < 0.7)[1],
which(df[,2] > 165 & df[,2] <= 170 & df[,1] < 0.7)[1],
which(df[,2] > 295 & df[,2] <= 305 & df[,1] < 0.7)[1],
which(df[,2] > 499 & df[,2] <= 500 & df[,1] < 0.6)[1])
par(mfrow=c(2,4), oma=c(0,0,2,0))
plotExamples(examples = examples)
# read published data
NFR <- read.table("data_NFRwidth/13059_2018_1398_MOESM2_ESM.txt", sep = "\t", header = T, stringsAsFactors = F)
library(LSD)
par(mfrow=c(1,1), oma=c(2,2,0,2))
# merge datasets
df_NFR <- merge(df, NFR, by.x = "row.names", by.y = "ORF")
# add 147 for comparison
heatscatter(df_NFR$width, df_NFR$NDR.Width+147,
xlab = "cross-correlation method", ylab = "Chereji method",
main = "Comparison", pch=19, cex=0.6,
xlim = c(100,600), ylim = c(100,600))
# heatmap ordered by NFR width
par(mfrow=c(1,2), oma=c(1,0,1,1))
my_mat <- my_mat[rownames(my_mat) %in% df_NFR$Row.names,]
my_mat <- my_mat[order(rownames(my_mat)),]
identical(as.character(df_NFR$Row.names), rownames(my_mat))
[1] TRUE
df <- df_NFR[,c(2,3,4)]
rownames(df) <- df_NFR$Row.names
plotHeatmap(mat = my_mat, df = df, plot_title = "cross-correlation method")
df <- df_NFR[,c(2,14,4)]
rownames(df) <- df_NFR$Row.names
plotHeatmap(mat = my_mat, df = df, plot_title = "Chereji method")
title(main = "Nucleosomes centered at TSS+1 nucleosome", line = -0.5, outer = TRUE)
Conclusions:
using larger window sizes is not an option (small nucleosome appears in the middle)
in this case the nucleosome might be exactly at the TF center (which results in a width=1)
The first dataset was generated by Elisa Oberbeckmann (LMU, BMC, Korber group)
The second dataset was generated by Ashish Singh (LMU, BMC, Müller-Planitz group)
The ISW1 and CHD1 ATP-dependent chromatin remodelers compete to set nucleosome spacing in vivo. Ocampo J, Chereji RV, Eriksson PR, Clark DJ. Nucleic Acids Res. 2016 Jun 2;44(10):4625-35. doi: 10.1093/nar/gkw068. Epub 2016 Feb 9. PMID: 26861626
Precise genome-wide mapping of single nucleosomes and linkers in vivo. Chereji RV, Ramachandran S, Bryson TD, Henikoff S. Genome Biol. 2018 Feb 9;19(1):19. doi: 10.1186/s13059-018-1398-0. PMID: 29426353
If you see mistakes or want to suggest changes, please create an issue on the source repository.
For attribution, please cite this work as
Schauer (2020, Jan. 8). CompBioMethods: Determining NFR width by cross-correlation. Retrieved from https://tschauer.github.io/blog/posts/2020-02-13-determining-nfr-width-by-cross-correlation/
BibTeX citation
@misc{schauer2020determining, author = {Schauer, Tamas}, title = {CompBioMethods: Determining NFR width by cross-correlation}, url = {https://tschauer.github.io/blog/posts/2020-02-13-determining-nfr-width-by-cross-correlation/}, year = {2020} }