Determining NFR width by cross-correlation

Here I use a cross-correlation method to determine the width of nucleosome-free regions in MNase-seq data.

Tamas Schauer true
2020-01-08

Table of Contents



Nucleosome Data by MNase-seq


Method Description



### helper functions ###

# generate Gaussian distribution
gaussmf <- function (x, sigma, mean) {
      height <- 1
      mfVals = (height * exp(-(((x - mean)^2)/(2 * (sigma^2)))))
      mfVals
}

# create NFR-like pattern
makeNFR <- function(d, beforeRef, afterRef){
      
      apply(sapply(c(-0.5,0.5), function(x) {gaussmf(seq(-beforeRef, afterRef), 40, x * d)}), 1, sum)
      
}

# scale between minimum and maximum
scaler <- function(x, probs = c(0.01,0.99)){
      (x - quantile(x, probs[1])) / (quantile(x, probs[2])-quantile(x, probs[1]))
}


### main function ###

# iterate through each row of the matrix
# use ccf function to find maximum correlation
# pick d which corresponds to the maximum correlation

calcNFRwidth <- function(my_mat,    
                          beforeRef = 200,
                          afterRef =  200,
                          mc.cores = 16){

    
    # here before and after is not necessarily symmetric (e.g. if matrix is not centered at NFR)
    my_mat_sub <- my_mat[, (ceiling(ncol(my_mat)/2)-beforeRef):(ceiling(ncol(my_mat)/2)+afterRef-1)]
    
    
    res <- parallel::mclapply(1:nrow(my_mat_sub), mc.cores = mc.cores, function(ridx){
        
        rx <- scaler(my_mat_sub[ridx, ])
        
        bestR <- 0
        spacingV <- NA
        shiftV <- NA
        
        total_width <- beforeRef+afterRef
        
        if(round(sd(rx), 3) != 0 & sum(is.na(rx)) == 0){
            
            for (d in 1:(total_width+100)) {
                
                # here before and after should be symmetric (left and right to center)
                y <-  makeNFR(d = d, beforeRef = total_width/2, afterRef = total_width/2)
                
                my_ccf <- ccf(rx,y, lag.max = 50, plot=FALSE)
                
                r <- max(my_ccf$acf)
                shift <- my_ccf$lag[my_ccf$acf == r]
                
                if (r > bestR) {
                    bestR <- r
                    shiftV <- shift
                    spacingV <- d
                }
            }
            
        }
        
        
        c(r = round(bestR, 2), width = spacingV, shift = shiftV)
        
    })                         
    
    df <- t(as.data.frame(res))
    rownames(df) <- rownames(my_mat_sub)
    return(df)

}

Budding Yeast Data #1


# load site centered matrix
load("data_NFRwidth/smooth.mat.TF.yeast.invivo.MNase.rda")

# use +/- 200 bp from the center
beforeRef <- 200 
afterRef  <- 200

file_path <- paste0("data_NFRwidth/NFRwidth.TF.yeast.invivo.MNase.set",beforeRef,"_",afterRef,".txt")

if(file.exists(file_path)){
    
    #read NFR data (pre-calculated)
    df <- read.table(file_path, row.names = 1, header = T)
    
} else {
    
    # run analysis
    df <- calcNFRwidth(my_mat = my_mat,
                    beforeRef = beforeRef,
                    afterRef = afterRef)
    
    write.table(df, file = file_path, sep = "\t", row.names = T, col.names = NA)
}

head(df)

     r width shift
1 0.71   279   -10
2 0.51   112    24
3 0.67   421    40
4 0.87     1   -50
5 0.68   353    50
6 0.97   320     6

Heatmap Visualization


# heatmap  ordered by NFR width
library(RColorBrewer)

par(mfrow=c(1,1), oma=c(2,0,0,1))

plotHeatmap(mat = my_mat, df = df, plot_title = "Nucleosomes centered at TF")


Exploring the Values


# check distributions
par(mfrow=c(1,3), oma=c(2,0,0,0))

hist(df[,1], breaks = 500, main = "NFR correlation", xlab="")
hist(df[,2], breaks = 250, main = "NFR width", xlab="")
hist(df[,3], breaks = 500, main = "NFR shift", xlab="")


Example Sites


# choose examples by interesting values

examples <- c(which(df[,2] > 0   & df[,2] <= 1 & df[,1] > 0.9)[1],
              which(df[,2] > 165 & df[,2] <= 170 & df[,1] > 0.9)[1],
              which(df[,2] > 295 & df[,2] <= 305 & df[,1] > 0.9)[1],
              which(df[,2] > 499 & df[,2] <= 500 & df[,1] > 0.6)[1],
              which(df[,2] > 0   & df[,2] <= 1 & df[,1] < 0.7)[1],
              which(df[,2] > 165 & df[,2] <= 170 & df[,1] < 0.7)[1],
              which(df[,2] > 295 & df[,2] <= 305 & df[,1] < 0.7)[1],
              which(df[,2] > 499 & df[,2] <= 500 & df[,1] < 0.6)[1])

par(mfrow=c(2,4), oma=c(0,0,2,0))

plotExamples(examples = examples)


Budding Yeast Data #2


# load site centered matrix
load("data_NFRwidth/smooth.mat.plusOne.yeast.invivo.MNase.rda")

# settings are different window sizes
# windows are asymmetric as the signal is centered at the +1 nucleosome
settings <- list(c(450,50),
                 c(400,50),
                 c(350,50),
                 c(300,50))

for(i in seq_along(settings)){
    
    beforeRef <- settings[[i]][1] 
    afterRef  <- settings[[i]][2] 
        
    file_path <- paste0("data_NFRwidth/NFRwidth.plusOne.yeast.invivo.PE.MNase.set",beforeRef,"_",afterRef,".txt")

    if(file.exists(file_path)){
        
        #read NFR data (pre-calculated)
        df <- read.table(file_path, row.names = 1, header = T)
        
        assign(paste0("df",i), df)
        
    } else {
        # run analysis
        df <- calcNFRwidth(my_mat = my_mat,
                            beforeRef = beforeRef,
                            afterRef = afterRef)
        write.table(df, file = file_path, sep = "\t", row.names = T, col.names = NA)

        assign(paste0("df",i), df)
    }
}

Heatmap Visualization


# heatmap  ordered by NFR width
library(RColorBrewer)

par(mfrow=c(1,4), oma=c(1,0,1,1))

# iterate through settings
for(i in seq_along(settings)){
    
    plotHeatmap(mat = my_mat, df = get(paste0("df",i)), 
            plot_title = paste0("beforeRef = ",settings[[i]][1],"; afterRef =",settings[[i]][2]))
}

title(main = "Nucleosomes centered at TSS+1 nucleosome", outer = TRUE)


Exploring the Values


# check distributions
par(mfrow=c(1,3), oma=c(2,0,0,0))

df <- df3
beforeRef = 350; afterRef = 50

hist(df[,1], breaks = 500, main = "NFR correlation", xlab="")
hist(df[,2], breaks = 250, main = "NFR width", xlab="")
hist(df[,3], breaks = 500, main = "NFR shift", xlab="")


Example Sites


# choose examples by interesting values

examples <- c(which(df[,2] > 0   & df[,2] <= 1 & df[,1] > 0.8)[1],
              which(df[,2] > 150 & df[,2] <= 170 & df[,1] > 0.8)[1],
              which(df[,2] > 295 & df[,2] <= 305 & df[,1] > 0.9)[1],
              which(df[,2] > 499 & df[,2] <= 500 & df[,1] > 0.6)[2],
              which(df[,2] > 0   & df[,2] <= 1 & df[,1] < 0.7)[1],
              which(df[,2] > 165 & df[,2] <= 170 & df[,1] < 0.7)[1],
              which(df[,2] > 295 & df[,2] <= 305 & df[,1] < 0.7)[1],
              which(df[,2] > 499 & df[,2] <= 500 & df[,1] < 0.6)[1])

par(mfrow=c(2,4), oma=c(0,0,2,0))

plotExamples(examples = examples)


Compare to Chereji etal. data


# read published data
NFR <- read.table("data_NFRwidth/13059_2018_1398_MOESM2_ESM.txt", sep = "\t", header = T, stringsAsFactors = F)

library(LSD)
par(mfrow=c(1,1), oma=c(2,2,0,2))


# merge datasets
df_NFR <- merge(df, NFR, by.x = "row.names", by.y = "ORF")

# add 147 for comparison
heatscatter(df_NFR$width, df_NFR$NDR.Width+147,
            xlab = "cross-correlation method", ylab = "Chereji method",
            main = "Comparison", pch=19, cex=0.6,
            xlim = c(100,600), ylim = c(100,600))


Heatmap Visualization


# heatmap  ordered by NFR width
par(mfrow=c(1,2), oma=c(1,0,1,1))

my_mat <- my_mat[rownames(my_mat) %in% df_NFR$Row.names,]
my_mat <- my_mat[order(rownames(my_mat)),]

identical(as.character(df_NFR$Row.names), rownames(my_mat))

[1] TRUE

df <- df_NFR[,c(2,3,4)]
rownames(df) <- df_NFR$Row.names

plotHeatmap(mat = my_mat, df = df, plot_title = "cross-correlation method")


df <- df_NFR[,c(2,14,4)]
rownames(df) <- df_NFR$Row.names

plotHeatmap(mat = my_mat, df = df, plot_title = "Chereji method")

title(main = "Nucleosomes centered at TSS+1 nucleosome", line = -0.5, outer = TRUE)

Conclusions:


Data Source

The first dataset was generated by Elisa Oberbeckmann (LMU, BMC, Korber group)

The second dataset was generated by Ashish Singh (LMU, BMC, Müller-Planitz group)


References

The ISW1 and CHD1 ATP-dependent chromatin remodelers compete to set nucleosome spacing in vivo. Ocampo J, Chereji RV, Eriksson PR, Clark DJ. Nucleic Acids Res. 2016 Jun 2;44(10):4625-35. doi: 10.1093/nar/gkw068. Epub 2016 Feb 9. PMID: 26861626

Precise genome-wide mapping of single nucleosomes and linkers in vivo. Chereji RV, Ramachandran S, Bryson TD, Henikoff S. Genome Biol. 2018 Feb 9;19(1):19. doi: 10.1186/s13059-018-1398-0. PMID: 29426353

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.

Citation

For attribution, please cite this work as

Schauer (2020, Jan. 8). CompBioMethods: Determining NFR width by cross-correlation. Retrieved from https://tschauer.github.io/blog/posts/2020-02-13-determining-nfr-width-by-cross-correlation/

BibTeX citation

@misc{schauer2020determining,
  author = {Schauer, Tamas},
  title = {CompBioMethods: Determining NFR width by cross-correlation},
  url = {https://tschauer.github.io/blog/posts/2020-02-13-determining-nfr-width-by-cross-correlation/},
  year = {2020}
}