dotplot from multiple hyperogeometric tests

2023/11/22

geneList %>% head
##     4312     8318    10874    55143    55388      991 
## 4.572613 4.514594 4.418218 4.144075 3.876258 3.677857
geneList %>% tail
##     10551     10974     79838     79901     57758      4969 
## -3.330760 -3.416798 -3.457221 -3.603467 -3.945467 -4.302655
# change ENTREZID to gene symbols
names(geneList) <- mapIds(org.Hs.eg.db, keys=names(geneList), keytype="ENTREZID", columns="SYMBOL",column="SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
genes_list <- list()
genes_list[[1]] <- names(geneList)[geneList > 2]
genes_list[[2]] <- names(geneList)[geneList < -2]

names(genes_list) <- paste("comparison", 1:length(genes_list), sep = "_")

genes_list %>% lapply(head)
## $comparison_1
## [1] "MMP1"  "CDC45" "NMU"   "CDCA8" "MCM10" "CDC20"
## 
## $comparison_2
## [1] NA        "COL16A1" "CACNA1D" "MAOB"    "ADIPOQ"  "VSTM4"
genes_list %>% lapply(length)
## $comparison_1
## [1] 110
## 
## $comparison_2
## [1] 97
hypergeo_output <- genes_list %>% lapply(function(x){
  x %>% enricher(TERM2GENE = MSigDB.ofinterest, pvalueCutoff = 1, qvalueCutoff = 1, universe = names(geneList))
})

hypergeo_df <- hypergeo_output %>% lapply(function(x){ x %>% as_tibble})
hypergeo_df$comparison_1 %>% datatable()
hypergeo_df$comparison_2 %>% datatable()

plots

#  get top genesets for each comparison
top_genesets <- lapply(hypergeo_df, function(x){
  x %>% filter(p.adjust < 0.05) %>% .$Description %>%  head(n=15)
}) %>% unlist %>% unique
hypergeo_plot <- data.frame()
for(i in 1:length(hypergeo_df)){
  temp <- hypergeo_df[[i]] %>% dplyr::slice(match(top_genesets, ID)) %>% mutate(group= names(hypergeo_df)[i])
  hypergeo_plot <- rbind(hypergeo_plot, temp)
}

hypergeo_plot$group <- factor(hypergeo_plot$group)
hypergeo_plot$ID <- factor(hypergeo_plot$ID , levels = top_genesets)
hypergeo_plot$GeneRatio %>% head 

# GeneRatio is a character vector that looks like "82/2838". need to divde the first with second element to get a numeric value
hypergeo_plot$GeneRatio_number <- hypergeo_plot$GeneRatio %>% str_split(pattern = "/") %>% lapply(function(x){ 
  x <- x %>% as.numeric
  x[[1]]/x[[2]]
  }) %>% unlist

hypergeo_plot$group <- factor(hypergeo_plot$group, levels = unique(hypergeo_plot$group))

# any p,adjust value higher than 0.05 are removed from the plot using NA
hypergeo_plot$GeneRatio_number[hypergeo_plot$p.adjust > 0.05] <- NA
hypergeo_plot$p.adjust[hypergeo_plot$p.adjust > 0.05] <- NA
hypergeo_plot %>% ggplot(aes(x = group, y = ID,  col =  -log10(p.adjust))) +
  scale_colour_gradientn(colours = c("orange","red", "darkred")) +
  # scale_colour_gradientn(colours = c("orange", "yellow", "white")) +
  # scale_color_gradient2(midpoint=2, low="blue", mid="white", high="red") +
  geom_point(aes(size = GeneRatio_number)) + 
  theme_bw() +
  theme(axis.title.y = element_blank(),axis.title.x = element_blank(), axis.text.x = element_text(angle=45, hjust =1))