生信基础 | 人-小鼠基因之间的比较 / 四六文摘

先看看MSigDB中的基因ENTREZID是否可以全部转化为SYMBOL。

library(biomaRt)library(clusterProfiler)library(org.Hs.eg.db)library(org.Mm.eg.db)library(AnnotationDbi)load("F:/BioInfoStudy/data/genesets/mouse_H_v5p2.rdata")load("F:/BioInfoStudy/data/genesets/human_H_v5p2.rdata")hsa_ent <- c()for(x in names(Hs.H)){ hsa_ent <- c(hsa_ent,Hs.H[[x]])}hsa_ent <- unique(hsa_ent)length(hsa_ent)

hsa_ent2symb <- bitr(hsa_ent, fromType = "ENTREZID", toType = "SYMBOL", OrgDb = org.Hs.eg.db)dim(hsa_ent2symb)

diff_ent <- setdiff(hsa_ent,hsa_ent2symb$ENTREZID)

ent2symb <- toTable(org.Hs.egSYMBOL)ent2symb[match(diff_ent,ent2symb$gene_id),]

AnnotationDbi::select(org.Hs.eg.db, keys=diff_ent,columns=c("SYMBOL"), keytype="ENTREZID" )mget(diff_ent, org.Hs.egSYMBOL, ifnotfound=NA

通过多种转换方式，发现人的ENTREZID有3个无法找到对应的SYMBOL，NCBI上检索了一下，也没找到这3个基因。

> diff_ent[1] "285464" "11217" "80761"

再看看小鼠的。

###--------------------------------------小鼠mus_ent <- c()for(x in names(Mm.H)){ mus_ent <- c(mus_ent,Mm.H[[x]])}mus_ent <- unique(mus_ent)length(mus_ent)

mus_ent2symb <- bitr(mus_ent, fromType = "ENTREZID", toType = "SYMBOL", OrgDb = org.Mm.eg.db)dim(mus_ent2symb)

diff_ent <- setdiff(mus_ent,mus_ent2symb$ENTREZID)

ent2symb <- toTable(org.Hs.egSYMBOL)ent2symb[match(diff_ent,ent2symb$gene_id),]

AnnotationDbi::select(org.Hs.eg.db, keys=diff_ent,columns=c("SYMBOL"), keytype="ENTREZID" )

#org.Mm.egMAPCOUNTSmget(diff_ent, org.Mm.egSYMBOL, ifnotfound=NA)

居然有15个没有找到对应的symbol。

> diff_ent [1] "675165" "100038993" "100861969" "100043385" "100504735" "105245585" "546061" [8] "625758" "668131" "665343" "667941" "11641" "100039053" "100043918"[15] "100043921"

写一个函数，用于人-小鼠同源基因的转换；

####------------------------------conversion_BtHsaAndMusGenes <- function(genes,species){ human <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl") mouse <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl") if(species == "human"){ gt <- biomaRt::getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = genes, mart = human, attributesL = c("mgi_symbol"), martL = mouse,uniqueRows = T) }else if(species == "mouse"){ gt <- biomaRt::getLDS(attributes = c("mgi_symbol"), filters = "mgi_symbol", values = genes , mart = mouse, attributesL = c("hgnc_symbol"), martL = human, uniqueRows=T) }else{stop("Wrong species information")} return(gt)}

加载我自己从gtf文件读入的所有基因信息进行转换。

load(file = "K:/BioInfoFiles/human/gencode.v35_human_hsaGeneInfo.Rdata")head(hsaGeneInfo)[1:3,]# > head(hsaGeneInfo)# ensembl symbol gene_type geneLength# 1 ENSG00000000003 TSPAN6 protein_coding 4536# 2 ENSG00000000005 TNMD protein_coding 1476# 3 ENSG00000000419 DPM1 protein_coding 1207

load(file = "K:/BioInfoFiles//mouse/gencode.vM27_mouse_musGeneInfo.Rdata")head(musGeneInfo)# > head(musGeneInfo)[1:4,]# ensembl symbol gene_type geneLength# 1 ENSMUSG00000000001 Gnai3 protein_coding 3262# 2 ENSMUSG00000000003 Pbsn protein_coding 902# 3 ENSMUSG00000000028 Cdc45 protein_coding 3506# 4 ENSMUSG00000000031 H19 lncRNA 2460

###把小鼠基因转换为人的同源基因m2h <- conversion_BtHsaAndMusGenes(genes = musGeneInfo$symbol,species = "mouse")

head(m2h)# > head( m2h)[1:2,]# MGI.symbol HGNC.symbol# 1 mt-Atp6 MT-ATP6# 2 mt-Co2 MT-CO2dim(m2h)# > dim(m2h)# [1] 23075 2###把人基因转换为小鼠的同源基因h2m <- conversion_BtHsaAndMusGenes(genes = hsaGeneInfo$symbol,species = "human")h2m[1:2,]# > h2m[1:2,]# HGNC.symbol MGI.symbol# 1 MT-CO2 mt-Co2# 2 MT-CO1 mt-Co1dim(h2m)# > dim(h2m)# [1] 22714 2

##有重复的length(unique(m2h$HGNC.symbol))length(unique(m2h$MGI.symbol))# > length(unique(m2h$HGNC.symbol))# [1] 18411# > length(unique(m2h$MGI.symbol))# [1] 19308

这里比较MSigDB数据集中小鼠与人基因之间的差异并统计。

statgeneinfo <- data.frame()identical(names(Hs.H),names(Mm.H))#gs <- names(Mm.H)[1]for(gs in names(Mm.H)){ total_pre_hsa <- length(Hs.H[[gs]]) total_pre_mus <- length(Mm.H[[gs]]) hsa_symbol <- bitr(Hs.H[[gs]], fromType = "ENTREZID", toType = "SYMBOL", OrgDb = org.Hs.eg.db)[,2] %>% na.omit() total_after_hsa <- length(hsa_symbol) mus_symbol <- bitr(Mm.H[[gs]], fromType = "ENTREZID", toType = "SYMBOL", OrgDb = org.Mm.eg.db)[,2] %>% na.omit() total_after_mus <- length(mus_symbol)

hsa_mus_con_g <- intersect(hsa_symbol,m2h[match(mus_symbol,mus_ent2symb$SYMBOL),2]) hsa_mus_con_len <- length(hsa_mus_con_g) hsa_uniq <- total_pre_hsa - hsa_mus_con_len mus_uniq <- total_pre_mus - hsa_mus_con_len hsa_na <- total_pre_hsa - total_after_hsa mus_na <- total_pre_mus - total_after_mus

info <- data.frame(dataset = gs, total_pre_hsa = total_pre_hsa, total_pre_mus = total_pre_mus, total_after_hsa = total_after_hsa, total_after_mus = total_after_mus, hsa_mus_con_len = hsa_mus_con_len, hsa_uniq = hsa_uniq, mus_uniq = mus_uniq, hsa_na = hsa_na, mus_na = mus_na) statgeneinfo <- rbind(statgeneinfo,info)}