Gene ID Conversion

2024/01/07

Various ways to convert ENTREZ ID to gene symbols and vice versa

library(tidyverse)
library(DOSE)

load dataset

# Load the example `geneList` dataset: a named numeric vector whose names are
# ENTREZ gene IDs, then preview its first entries
data(geneList)
head(geneList)
##     4312     8318    10874    55143    55388      991 
## 4.572613 4.514594 4.418218 4.144075 3.876258 3.677857
# Number of ENTREZ IDs (vector names) available for conversion
geneList %>% names() %>% length()
## [1] 12495

biomart

library(biomaRt)
# List the available Ensembl archive releases (name, date, host URL, version);
# the URL of a release can be passed as `host` when connecting to a mart
biomaRt::listEnsemblArchives()
##              name     date                                 url version
## 1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37
## 2     Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org     110
## 3     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109
## 4     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108
## 5     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107
## 6     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106
## 7     Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105
## 8     Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104
## 9     Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org     103
## 10    Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org     102
## 11    Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org     101
## 12    Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org     100
## 13     Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org      99
## 14     Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org      98
## 15     Ensembl 97 Jul 2019 https://jul2019.archive.ensembl.org      97
## 16     Ensembl 96 Apr 2019 https://apr2019.archive.ensembl.org      96
## 17     Ensembl 95 Jan 2019 https://jan2019.archive.ensembl.org      95
## 18     Ensembl 94 Oct 2018 https://oct2018.archive.ensembl.org      94
## 19     Ensembl 93 Jul 2018 https://jul2018.archive.ensembl.org      93
## 20     Ensembl 80 May 2015 https://may2015.archive.ensembl.org      80
## 21     Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org      77
## 22     Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org      75
## 23     Ensembl 54 May 2009 https://may2009.archive.ensembl.org      54
##    current_release
## 1                 
## 2                *
## 3                 
## 4                 
## 5                 
## 6                 
## 7                 
## 8                 
## 9                 
## 10                
## 11                
## 12                
## 13                
## 14                
## 15                
## 16                
## 17                
## 18                
## 19                
## 20                
## 21                
## 22                
## 23
# Show the BioMart databases offered by the default Ensembl host
biomaRt::listMarts()
##                biomart                version
## 1 ENSEMBL_MART_ENSEMBL      Ensembl Genes 110
## 2   ENSEMBL_MART_MOUSE      Mouse strains 110
## 3     ENSEMBL_MART_SNP  Ensembl Variation 110
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 110
# Connect to the human gene dataset of the Ensembl genes mart, using the
# Aug 2020 archive host rather than the current release
mart_aug2020 <- biomaRt::useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = "https://aug2020.archive.ensembl.org"
)

# Preview the first 10 attributes (columns) that can be requested from the mart
listAttributes(mart_aug2020) %>% head(n = 10)
##                             name                  description         page
## 1                ensembl_gene_id               Gene stable ID feature_page
## 2        ensembl_gene_id_version       Gene stable ID version feature_page
## 3          ensembl_transcript_id         Transcript stable ID feature_page
## 4  ensembl_transcript_id_version Transcript stable ID version feature_page
## 5             ensembl_peptide_id            Protein stable ID feature_page
## 6     ensembl_peptide_id_version    Protein stable ID version feature_page
## 7                ensembl_exon_id               Exon stable ID feature_page
## 8                    description             Gene description feature_page
## 9                chromosome_name     Chromosome/scaffold name feature_page
## 10                start_position              Gene start (bp) feature_page
# Find the mart attribute names that contain "entrez"; `entrezgene_id` is the
# one holding the ENTREZ gene ID.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
grep("entrez", listAttributes(mart_aug2020)$name, value = TRUE)
## [1] "entrezgene_trans_name"  "entrezgene_description" "entrezgene_accession"  
## [4] "entrezgene_id"
# Download the ID-mapping table from the Aug 2020 mart: ENTREZ ID, gene symbol
# and Ensembl gene ID, plus biotype, location, transcript and RefSeq attributes
t2g_aug2020 <- biomaRt::getBM(
  attributes = c(
    "entrezgene_id",
    "external_gene_name",
    "ensembl_gene_id",
    "ensembl_gene_id_version",
    "gene_biotype",
    "transcript_biotype",
    "chromosome_name",
    "band",
    "transcript_length",
    "start_position",
    "end_position",
    "transcription_start_site",
    "strand",
    "refseq_mrna",
    "refseq_ncrna"
  ),
  mart = mart_aug2020
)

# Give the ID columns shorter names
t2g_aug2020 <- t2g_aug2020 %>%
  dplyr::rename(
    entrez_gene = entrezgene_id,
    ext_gene = external_gene_name,
    ens_gene = ensembl_gene_id,
    ens_gene_ver = ensembl_gene_id_version
  )

# biomaRt gene symbols: locate each ENTREZ ID's first match in the table
# (NA when the ID is absent) and take the symbol at that position, so the
# result stays aligned with geneList
index <- match(names(geneList), t2g_aug2020$entrez_gene)
biomart_gs <- t2g_aug2020$ext_gene[index]
# A slice()-based version could not be used because slice() ignores the NA
# positions returned by match(), so unmatched IDs would be dropped:
# biomart_gs <- t2g_aug2020 %>% slice(match(names(geneList), entrez_gene)) %>% .$ext_gene

annotationhub

library(AnnotationHub)
# these codes are from https://github.com/hbctraining/scRNA-seq/blob/master/lessons/mitoRatio.md

# Open a connection to AnnotationHub
# (the snapshot is saved at /home/aahn/.cache/R/AnnotationHub)
ah <- AnnotationHub()

# Search the hub for human ("Homo sapiens") EnsDb records, ignoring case
ahDb <- query(ah, pattern = c("Homo sapiens", "EnsDb"), ignore.case = TRUE)

# Before fetching anything, list the metadata of the matching records to see
# which annotation versions are available
mcols(ahDb)

# We want the most recent annotation, so take the AnnotationHub ID of the last
# record returned by the query
id <- tail(rownames(mcols(ahDb)), n = 1)

# Download the EnsDb database for that ID. The source tutorial expected this to
# be version GRCh38.92; the record actually retrieved is whichever one is
# listed last in the query above.
# This step took ages, so the derived results should be saved as an rds.
edb <- ah[[id]]

# this did not work ... i guess you cant save a ENsDb object? 
#saveRDS(edb, "/researchers/antonio.ahn/resources/R_resources/annotationhub/data/edb_AH104864.rds")

# Pull the gene-level annotations out of the EnsDb with ensembldb's genes(),
# returned as a plain data frame
annotations <- genes(edb, return.type = "data.frame")

# Save the gene annotations because the steps above take quite a while; the
# saved table is then read back in as `annoHub`.
annotations_rds <- "/researchers/antonio.ahn/resources/R_resources/annotationhub/data/annotations_AH104864.rds"
saveRDS(annotations, annotations_rds)
annoHub <- readRDS(annotations_rds)

# NOTE(review): ensembldb can return `entrezid` as a list column, because one
# Ensembl gene may map to several ENTREZ IDs -- confirm with
# class(annoHub$entrezid). match() converts a list to character, so a gene
# holding more than one ID would never be matched. Flatten to one entry per
# (gene row, ENTREZ ID) pair first; for an atomic column this is a no-op.
entrez_flat <- as.character(unlist(annoHub$entrezid, use.names = FALSE))
entrez_row <- rep(seq_len(nrow(annoHub)), times = lengths(annoHub$entrezid))

# `index` is the row of `annoHub` holding each ENTREZ ID (NA when not found),
# so the symbols stay aligned with geneList
index <- entrez_row[match(names(geneList), entrez_flat)]
annoHub_gs <- annoHub$symbol[index]

mapIds

library(org.Hs.eg.db)
# Map every ENTREZ ID to a gene symbol with mapIds(). The result is aligned
# with `keys`, with NA for IDs that have no symbol.
# mapIds() takes a single `column` argument; `columns` belongs to select() and
# is not a mapIds() parameter, so it is not passed here.
mapIds_gs <- mapIds(
  org.Hs.eg.db,
  keys = names(geneList),
  column = "SYMBOL",
  keytype = "ENTREZID"
)

summary

in this example, the mapID method gave the most non NA’s

# One row per ENTREZ ID, with the gene symbol returned by each method
summary_df <- tibble(
  entrezID = names(geneList),
  biomart_gs = biomart_gs,
  mapIds_gs = mapIds_gs,
  annoHub_gs = annoHub_gs
)
# biomaRt: number of IDs with (FALSE) and without (TRUE) a gene symbol
summary(is.na(summary_df$biomart_gs))
##    Mode   FALSE    TRUE 
## logical   12263     232
# mapIds: number of IDs with (FALSE) and without (TRUE) a gene symbol
summary(is.na(summary_df$mapIds_gs))
##    Mode   FALSE    TRUE 
## logical   12440      55
# AnnotationHub: number of IDs with (FALSE) and without (TRUE) a gene symbol
summary(is.na(summary_df$annoHub_gs))
##    Mode   FALSE    TRUE 
## logical   12214     281