Drosophila female and male embryos can be distinguished using k-means clustering on marker genes from RNA-seq data.
# do not run here
module load ngs/STAR/2.5.3a
module load ngs/samtools
### STAR index directory ###
STAR_INDEX="../genome/STAR_dmel-all-chromosome-r6.17"
### GTF file ###
GTF_FILE="../genome/dmel-all-r6.17.gtf"
# STAR run
STAR \
--runThreadN 8 \
--readFilesCommand gunzip -c \
--quantMode GeneCounts \
--genomeDir ${STAR_INDEX} \
--sjdbGTFfile ${GTF_FILE} \
--readFilesIn ${FILEBASE}_1.txt.gz ${FILEBASE}_2.txt.gz \
--outFileNamePrefix ${FILEBASE}. \
--outSAMtype BAM SortedByCoordinate \
--limitBAMsortRAM 5000000000 \
--outFilterMultimapNmax 1
library(AnnotationDbi)
library(GenomicFeatures)
txdb = loadDb("data_Kmeans/txdb")
my_exons <- exons(txdb)
exons.list.per.gene <- exonsBy(txdb, by="gene")
exonic.gene.sizes <- sum(width(reduce(exons.list.per.gene))) / 1000
head(exonic.gene.sizes)
FBgn0000003 FBgn0000008 FBgn0000014 FBgn0000015 FBgn0000017
0.299 5.414 5.477 11.791 12.819
FBgn0000018
1.794
# read data from text file
my_counts <- read.table("data_Kmeans/count_table.txt", header = T, row.names = 1)
my_counts[1:6,1:8]
E1B E1C E1D E1G E1H E2B E2C E2D
FBgn0000003 76 15 23 23 12 68 22 14
FBgn0000008 1395 676 875 699 439 34 23 19
FBgn0000014 1 0 0 0 1 88 34 47
FBgn0000015 0 0 0 0 2 19 19 11
FBgn0000017 4222 2186 2614 2784 1523 2850 1718 2083
FBgn0000018 908 364 515 278 253 36 15 19
dim(my_counts)
[1] 17485 54
# function to calculate TPM (see reference below)
countToTpm <- function(counts, effLen, scaler=1e6)
{
rate <- log(counts) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(scaler))
}
# select genes
my_counts_Genes <- my_counts[grep("FBgn", rownames(my_counts)),]
# filter low counts
my_filter <- apply(my_counts_Genes, 1, function(x) length(x[x>20]) >= ncol(my_counts_Genes)/12)
my_counts_Filtered <- my_counts_Genes[my_filter,]
# match filtered genes and exonic gene sizes
my_genes_Filtered <- intersect(rownames(my_counts_Filtered), names(exonic.gene.sizes))
exonic.gene.sizes <- exonic.gene.sizes[names(exonic.gene.sizes) %in% my_genes_Filtered]
exonic.gene.sizes <- exonic.gene.sizes[order(names(exonic.gene.sizes))]
my_counts_Filtered <- my_counts_Filtered[rownames(my_counts_Filtered) %in% my_genes_Filtered,]
my_counts_Filtered <- my_counts_Filtered[order(rownames(my_counts_Filtered)),]
# check whether names are identical
stopifnot(identical(rownames(my_counts_Filtered), names(exonic.gene.sizes)))
# calculate TPM
my_TPM <- apply(my_counts_Filtered, 2, FUN = function(x){countToTpm(x, exonic.gene.sizes)})
# take log
log2_TPM <- log2(my_TPM+0.5)
log2_TPM[1:6,1:6]
E1B E1C E1D E1G E1H
FBgn0000003 5.4913503 4.296149 4.656480 4.753028 4.3757731
FBgn0000008 5.5107778 5.589526 5.712513 5.489284 5.3727542
FBgn0000014 -0.9106273 -1.000000 -1.000000 -1.000000 -0.7559121
FBgn0000015 -1.0000000 -1.000000 -1.000000 -1.000000 -0.7719412
FBgn0000017 5.8614745 6.035190 6.045044 6.233035 5.9183222
FBgn0000018 6.4770042 6.284183 6.535293 5.749910 6.1637671
E2B
FBgn0000003 6.23310042
FBgn0000008 1.35804085
FBgn0000014 2.53083317
FBgn0000015 0.04186372
FBgn0000017 6.20060925
FBgn0000018 2.82643640
dim(log2_TPM)
[1] 8983 54
library(org.Dm.eg.db)
# select well known sex-specific marker genes
my_favorite_genes <- c("Sxl", "msl-2" ,"lncRNA:roX1", "lncRNA:roX2")
names(my_favorite_genes) <- mapIds(org.Dm.eg.db, my_favorite_genes, "FLYBASE", keytype="SYMBOL", multiVals="first")
set.seed(99)
# k-means with k=3
kmeans_clusters <- kmeans(apply(log2_TPM[ rownames(log2_TPM) %in% names(my_favorite_genes), ], 1,
function(x){ (x-mean(x))/sd(x) }),3, nstart=25, iter.max=1000)
my_sexes <- factor(as.integer(kmeans_clusters$cluster))
head(my_sexes)
[1] 1 1 1 1 1 2
Levels: 1 2 3
# conditions
my_conditions <- factor(substr(colnames(log2_TPM),1,2))
# setup color
my_color_palette <- c("#999999", "#d6604d", "#4393c3")
par(mfrow=c(2,2), oma=c(2,2,2,2), cex=1.25)
# iterate through marker genes
for(i in seq_along(my_favorite_genes)){
plotDots(log2_TPM[rownames(log2_TPM) == names(my_favorite_genes)[i],],
my_title = my_favorite_genes[i],
color_palette = my_color_palette,
color_groups = my_sexes,
conditions = my_conditions,
point_size = 0.8)
}
mtext(text = "Stages", side = 1, line = -2, outer = TRUE, cex=1.5)
mtext(text = "log2 TPM", side = 2, outer = TRUE, cex=1.5)
par(fig = c(0,1,0,1), cex=1.25, new = TRUE)
plot(0,0, xlab="", ylab="", xaxt="n", yaxt="n", bty="n", type="n" )
legend("center", legend = c("Early", "Female", "Male"),
pch = 19, col = my_color_palette, horiz = T, )
The dataset was generated by Tamas Schauer (LMU, BMC, Becker group)
Progressive dosage compensation during Drosophila embryogenesis is reflected by gene arrangement. Prayitno K, Schauer T, Regnard C, Becker PB. EMBO Rep. 2019 Aug;20(8):e48138. doi: 10.15252/embr.201948138. Epub 2019 Jul 9. PMID: 31286660
TPM calculation: https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/
If you see mistakes or want to suggest changes, please create an issue on the source repository.
For attribution, please cite this work as
Schauer (2019, Nov. 20). CompBioMethods: K-Means Clustering - Female/Male Embryos. Retrieved from https://tschauer.github.io/blog/posts/2019-11-28-k-means-clustering-femalemale-embryos/
BibTeX citation
@misc{schauer2019k-means, author = {Schauer, Tamas}, title = {CompBioMethods: K-Means Clustering - Female/Male Embryos}, url = {https://tschauer.github.io/blog/posts/2019-11-28-k-means-clustering-femalemale-embryos/}, year = {2019} }