K-Means Clustering - Female/Male Embryos

Drosophila female and male embryos can be distinguished using k-means clustering on marker genes from RNA-seq data.

Tamas Schauer
2019-11-20

Table of Contents



Embryo RNA-seq Data


Pre-processing


# do not run here
module load ngs/STAR/2.5.3a
module load ngs/samtools

### STAR index directory ###
STAR_INDEX="../genome/STAR_dmel-all-chromosome-r6.17"

### GTF file ###
GTF_FILE="../genome/dmel-all-r6.17.gtf"

# STAR run
STAR \
--runThreadN 8 \
--readFilesCommand gunzip -c \
--quantMode GeneCounts \
--genomeDir ${STAR_INDEX} \
--sjdbGTFfile ${GTF_FILE} \
--readFilesIn ${FILEBASE}_1.txt.gz ${FILEBASE}_2.txt.gz \
--outFileNamePrefix ${FILEBASE}. \
--outSAMtype BAM SortedByCoordinate \
--limitBAMsortRAM 5000000000 \
--outFilterMultimapNmax 1

Annotation


library(AnnotationDbi)
library(GenomicFeatures)

txdb = loadDb("data_Kmeans/txdb")

my_exons <- exons(txdb)

exons.list.per.gene <- exonsBy(txdb, by="gene")
exonic.gene.sizes <- sum(width(reduce(exons.list.per.gene))) / 1000

head(exonic.gene.sizes)

FBgn0000003 FBgn0000008 FBgn0000014 FBgn0000015 FBgn0000017 
      0.299       5.414       5.477      11.791      12.819 
FBgn0000018 
      1.794 

Count Table


# read data from text file
my_counts <- read.table("data_Kmeans/count_table.txt", header = T, row.names = 1)

my_counts[1:6,1:8]

             E1B  E1C  E1D  E1G  E1H  E2B  E2C  E2D
FBgn0000003   76   15   23   23   12   68   22   14
FBgn0000008 1395  676  875  699  439   34   23   19
FBgn0000014    1    0    0    0    1   88   34   47
FBgn0000015    0    0    0    0    2   19   19   11
FBgn0000017 4222 2186 2614 2784 1523 2850 1718 2083
FBgn0000018  908  364  515  278  253   36   15   19

dim(my_counts)

[1] 17485    54

Filtering and Normalization


# function to calculate TPM (see reference below)
countToTpm <- function(counts, effLen, scaler=1e6)
{
      rate <- log(counts) - log(effLen)
      denom <- log(sum(exp(rate)))
      exp(rate - denom + log(scaler))
}

# select genes
my_counts_Genes <- my_counts[grep("FBgn", rownames(my_counts)),]

# filter low counts
my_filter <- apply(my_counts_Genes, 1, function(x) length(x[x>20]) >= ncol(my_counts_Genes)/12)

my_counts_Filtered <- my_counts_Genes[my_filter,]

# match filtered genes and exonic gene sizes
my_genes_Filtered <- intersect(rownames(my_counts_Filtered), names(exonic.gene.sizes))

exonic.gene.sizes <- exonic.gene.sizes[names(exonic.gene.sizes) %in% my_genes_Filtered]
exonic.gene.sizes <- exonic.gene.sizes[order(names(exonic.gene.sizes))]

my_counts_Filtered <- my_counts_Filtered[rownames(my_counts_Filtered) %in% my_genes_Filtered,]
my_counts_Filtered <- my_counts_Filtered[order(rownames(my_counts_Filtered)),]

# check whether names are identical
stopifnot(identical(rownames(my_counts_Filtered), names(exonic.gene.sizes)))

# calculate TPM
my_TPM <- apply(my_counts_Filtered, 2, FUN = function(x){countToTpm(x, exonic.gene.sizes)})

# take log
log2_TPM <- log2(my_TPM+0.5)

log2_TPM[1:6,1:6]

                   E1B       E1C       E1D       E1G        E1H
FBgn0000003  5.4913503  4.296149  4.656480  4.753028  4.3757731
FBgn0000008  5.5107778  5.589526  5.712513  5.489284  5.3727542
FBgn0000014 -0.9106273 -1.000000 -1.000000 -1.000000 -0.7559121
FBgn0000015 -1.0000000 -1.000000 -1.000000 -1.000000 -0.7719412
FBgn0000017  5.8614745  6.035190  6.045044  6.233035  5.9183222
FBgn0000018  6.4770042  6.284183  6.535293  5.749910  6.1637671
                   E2B
FBgn0000003 6.23310042
FBgn0000008 1.35804085
FBgn0000014 2.53083317
FBgn0000015 0.04186372
FBgn0000017 6.20060925
FBgn0000018 2.82643640

dim(log2_TPM)

[1] 8983   54

K-means Clustering

Clustering with k=3


library(org.Dm.eg.db)

# select well known sex-specific marker genes
my_favorite_genes <- c("Sxl", "msl-2" ,"lncRNA:roX1", "lncRNA:roX2")
names(my_favorite_genes) <- mapIds(org.Dm.eg.db, my_favorite_genes, "FLYBASE", keytype="SYMBOL", multiVals="first")

set.seed(99)

# k-means with k=3
kmeans_clusters <- kmeans(apply(log2_TPM[ rownames(log2_TPM) %in% names(my_favorite_genes), ], 1, 
                         function(x){ (x-mean(x))/sd(x) }),3, nstart=25, iter.max=1000)

my_sexes <- factor(as.integer(kmeans_clusters$cluster))

head(my_sexes)

[1] 1 1 1 1 1 2
Levels: 1 2 3

Plot Marker Genes


# conditions
my_conditions <- factor(substr(colnames(log2_TPM),1,2))

# setup color
my_color_palette <- c("#999999", "#d6604d", "#4393c3")

par(mfrow=c(2,2), oma=c(2,2,2,2), cex=1.25)

# iterate through marker genes
for(i in seq_along(my_favorite_genes)){
    
    plotDots(log2_TPM[rownames(log2_TPM) == names(my_favorite_genes)[i],], 
                     my_title = my_favorite_genes[i], 
                     color_palette = my_color_palette, 
                     color_groups = my_sexes, 
                     conditions = my_conditions, 
                     point_size = 0.8)
}

mtext(text = "Stages", side = 1, line = -2, outer = TRUE, cex=1.5)
mtext(text = "log2 TPM", side = 2, outer = TRUE, cex=1.5)

par(fig = c(0,1,0,1), cex=1.25, new = TRUE)
plot(0,0, xlab="", ylab="", xaxt="n", yaxt="n", bty="n", type="n" )
legend("center", legend = c("Early", "Female", "Male"), 
       pch = 19, col = my_color_palette, horiz = T, )


Conclusions


Data Source

The dataset was generated by Tamas Schauer (LMU, BMC, Becker group)


References

Progressive dosage compensation during Drosophila embryogenesis is reflected by gene arrangement. Prayitno K, Schauer T, Regnard C, Becker PB. EMBO Rep. 2019 Aug;20(8):e48138. doi: 10.15252/embr.201948138. Epub 2019 Jul 9. PMID: 31286660

TPM calculation: https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.

Citation

For attribution, please cite this work as

Schauer (2019, Nov. 20). CompBioMethods: K-Means Clustering - Female/Male Embryos. Retrieved from https://tschauer.github.io/blog/posts/2019-11-28-k-means-clustering-femalemale-embryos/

BibTeX citation

@misc{schauer2019k-means,
  author = {Schauer, Tamas},
  title = {CompBioMethods: K-Means Clustering - Female/Male Embryos},
  url = {https://tschauer.github.io/blog/posts/2019-11-28-k-means-clustering-femalemale-embryos/},
  year = {2019}
}