一文构建微生物群落sparcc网络 / 四六文摘

数据代码文件打包
欢迎加入微生信生物讨论群，扫描下方二维码添加小编微信，小编带你入伙啦，大牛如云，让交流变得简单。（记得备注姓名-研究方向-单位，防止不小心忽略）
微生物群落构建sparcc网络-一文就够了
sparcc网络：过滤OTU
sparcc代码矩阵计算
转入R中构建网络
sparcc网络可视化
展望

最近工作做不动，学点东西，释放释放压力，同时来还可以为大家带来技术福利。

文章翻译自google，地址为：https://colab.mpi-bremen.de/micro-b3/svn/EBI2014/Thursday/，有兴趣的大家可以阅读原文。

我在这里对代码进行了大量注释并添了网络可视化方案。

最终完整的sparcc教程送给大家。

sparcc网络：过滤OTU

library(igraph) library(ggplot2)


# 作者改变工作路径，这里我们不进行改变

# setwd("~/Desktop/EBI2014/Thursday/coocurrences")
# 导入环境，里面有环境变量和一张OTU表格

load("OTU_env_forAntonio_1.RData")
# Check how many OTUs we have

dim(OTU_table)
# 过滤序列数量少于50条的的OTU

OTU.table.filtered <- OTU_table[,colSums(OTU_table) >= 50]
# 查看行列

dim(OTU.table.filtered)

#修改列名

OTU.table.filtered.colnames <- colnames(OTU.table.filtered)

OTU.table.filtered.sparcc <- cbind(rownames(OTU.table.filtered), OTU.table.filtered)

colnames(OTU.table.filtered.sparcc) <- c("OTU_id", OTU.table.filtered.colnames)
# 输出文件，作为sparCC的输入

# We need to transpose the table
write.table(t(OTU.table.filtered.sparcc), file = "otu_table_filtered_50.tsv", row.names = T, sep = "\t", quote = F, col.names = F)

####################################### # 这个过程省略，因为800多个OTU也得一会 #######################################

sparcc代码矩阵计算

#!/usr/bin/bash


#切换到我们的工作路径

cd ./coocurrences
#建立文件夹

mkdir -p pvals
#运行 sparCC

SparCC.py otu_table_filtered_50.tsv -i 5 --cor_file=icomm_cor_sparcc_5.out
#随机抽样 10次

MakeBootstraps.py otu_table_filtered_50.tsv -n 5 -o pvals/boot
# 置换

for i in {0..10}; do printf "Bootstrap number: $i\n";SparCC.py pvals/boot_${i}.txt -i 5 --cor_file=pvals/sim_cor_${i}.txt 2&>1; done
#p值

PseudoPvals.py icomm_cor_sparcc_5.out pvals/sim_cor 10 -o icomm_pvals_sparcc_5.out -t 'two_sided'
#完成之后返回R中继续计算

转入R中构建网络

# Occupancy–abundance relationship (Endemism-cosmopolitanism) # Example from http://ecology.msu.montana.edu/labdsv/R/labs/lab1/lab1.html


# For comodity

otus <- (OTU.table.filtered)
# We set a minimum abundance to be present

minabu <- 50
# 设定阈值为50，统计序列数量大于50条的OTU有多少个

otu.site <- apply(otus > minabu, 1, sum)
# 统计每个OTU在多少个样品中含量超过了50个。

site.otu <- apply(otus > minabu, 2, sum)
## 下面这个循环求取超过50条序列的样品中，每个样品平均有多少条

if (minabu == 0) {

#

mean.abu <- apply(otus, 2, sum)/site.otu

} else {

mean.abu <- rep(0, ncol(otus))

# If not we have to find where there are the OTU with a value larger than minabu

for (i in 1:ncol(otus)) {

mask <- otus[, i] > minabu

# And calculate the mean abundance

mean.abu[i] <- sum(otus[mask, i])/max(1, site.otu[i])

}

}

mean.abu[is.na(mean.abu)] <- 0
# Let's plot the Abundance vs. Occurrence
#l列一张表，表明每个OTU多少个样品中平均有多少条序列

distr.otus<-as.data.frame(cbind(site.otu[mean.abu > minabu], mean.abu[mean.abu > minabu]))

colnames(distr.otus)<-c("counts", "mean")

head(distr.otus)

# 这里根据OTU出现的频率，对OTU进行了定义

#第一个就是广泛存在的

distr.otus$com[distr.otus$counts >= 20]<-"Cosmopolitan"

#虽然在少数样品中存在，但是丰度较高的，定义为地域性的OTU。

distr.otus$com[distr.otus$counts <= 8 & distr.otus$mean >= 500]<-"Endemic"

distr.otus$com[is.na(distr.otus$counts)]<-"NA"
##下面就是一张丰度大于50条的全部OTU在多少个样品中出现的分布散点图

ggplot(data=distr.otus) +

geom_point(aes(counts, mean, colour=com)) +

xlab("Number of sites") +

ylab("Mean abundance") +

xlim(c(1,39)) +

ylim(c(min(mean.abu[mean.abu > minabu]),2000)) +

coord_trans(y = "log10") +

guides(colour=FALSE) +

theme(legend.position = "bottom") +

theme_bw()
#拿到这些定义好的OTU的编号

cosmopolitan<- rownames(distr.otus)[distr.otus$counts >= 20]

endemic <- rownames(distr.otus)[distr.otus$counts <= 8 & distr.otus$mean >= 500]
#######################################

# Back from sparCC

#######################################
# 让我们构建共线性网络

# 首先定义一些功能

makeNet <- function(X){

a<-matrix(nrow=1,ncol=4)

a[1,1] <- rownames(sparcc$r)[X[1]]

a[1,2] <- colnames(sparcc$r)[X[2]]

a[1,3] <- sparcc$r[X[1], X[2]]

a[1,4] <- sparcc$P[X[1],X[2]]

return(a)

}
graph.transform.weights <- function (X) {

require(igraph)

data.tmp <- matrix(0, nrow=nrow(X), ncol=2)

dimnames(data.tmp)[[2]] <- c("i", "j")

data.tmp[,1] <- X[,1]

data.tmp[,2] <- X[,2]

data <- data.frame(data.tmp)

graph.data <- graph.data.frame(data, directed=F)

E(graph.data)$weight <- abs(as.numeric(X[,3]))

E(graph.data)$cor <- as.numeric(X[,3])

E(graph.data)$pvalue <- as.numeric(X[,3])

summary(graph.data)

cat("Average degree:",ecount(graph.data)/vcount(graph.data)*2)

return(graph.data)

}
# 提取sparcc相关性矩阵

sparcc.cor <- read.table(file = "icomm_cor_sparcc_5.out", sep = "\t", header = T, row.names = 1)

dim(sparcc.cor)
# 提取sparCC pseudo p-values

sparcc.pval <- read.table(file = "icomm_pvals_sparcc_5.out", sep = "\t", header = T, row.names = 1)

dim(sparcc.pval)
# 构建一个列表同时包含这两个文件

sparcc <- structure(list(r = sparcc.cor, P = sparcc.pval))
# 设定显著性阈值为0.01，相关性阈值为0.7

pval <- 0.01

r <- 0.7

#选择符合要求的行列

selrp<-which((abs(sparcc$r) > r & sparcc$P < pval) & lower.tri(sparcc$r == TRUE), arr.ind = TRUE)
# 使用makeNet功能构建邻接矩阵

sparcc.graph.df <- t(apply(selrp,1, makeNet))
# 构建igraph网络对象

sparcc.graph <- graph.transform.weights(sparcc.graph.df)
# We will add the distribution information to the nodes

sparcc.graph.names <- as.data.frame(V(sparcc.graph)$name, stringsAsFactors = F)

colnames(sparcc.graph.names) <- "names"
otu.distr <- rbind(cbind(cosmopolitan, "cosmopolitan"),cbind(endemic, "endemic"))

colnames(otu.distr)<-c("names","distribution")

head(otu.distr )

#合并网络中的OTU名称和我们上面定义的标签

sparcc.graph.atr <- as.data.frame(merge(sparcc.graph.names, otu.distr, by="names", all.x=T, all.y = F), stringsAsFactors = F)

head(sparcc.graph.atr)

sparcc.graph.atr <- data.frame(lapply(sparcc.graph.atr, as.character), stringsAsFactors=FALSE)

sparcc.graph.atr[is.na(sparcc.graph.atr)]<-'FALSE'
# 这条命令调整顺序

sparcc.graph.atr.sorted <- sparcc.graph.atr[match(sparcc.graph.names$names,sparcc.graph.atr$names),]

head(sparcc.graph.atr.sorted)

# 将上面编号的节点注释文件编入igraph网络的节点属性中。

V(sparcc.graph)$distribution <- sparcc.graph.atr.sorted$distribution
# 保存网络

write.graph(sparcc.graph, "sparcc_graph_50.graphml", format="graphml")
# 提取自网络，这里我们根据上上面分组，提取包含这每个分组otu的子网络 注意这里的联系可能包含这个类别之外的节点
# We combine the node names from the graph and for each type of distribution

sparcc.graph.ids.cosm <- unlist(list(V(sparcc.graph)$name, unique(cosmopolitan)))

sparcc.graph.ids.end <- unlist(list(V(sparcc.graph)$name, unique(endemic)))
# We keep those present in both

sparcc.graph.ids.cosm <- sparcc.graph.ids.cosm[duplicated(sparcc.graph.ids.cosm)]

sparcc.graph.ids.end <- sparcc.graph.ids.end[duplicated(sparcc.graph.ids.end)]
# We extract the first order neighbors

sparcc.graph.cosm.nei<-neighborhood(sparcc.graph, 1, nodes=sparcc.graph.ids.cosm)

sparcc.graph.end.nei<-neighborhood(sparcc.graph, 1, nodes=sparcc.graph.ids.end)
# We create the subgraph only containing the first order neighbors

sparcc.graph.cosm<-induced.subgraph(sparcc.graph, unlist(sparcc.graph.cosm.nei))

sparcc.graph.end<-induced.subgraph(sparcc.graph, unlist(sparcc.graph.end.nei))
# We write the network to a file

write.graph(sparcc.graph.cosm, "sparcc_graph_cosm_nei_50.graphml", format="graphml")

write.graph(sparcc.graph.end, "sparcc_graph_end_nei_50.graphml", format="graphml")
# 同样挑选自网络，但是只保留同一个类别网络中的连线

# We extract the first order neighbors

sparcc.graph.cosm.nei<-neighborhood(sparcc.graph, 0, nodes=sparcc.graph.ids.cosm)

sparcc.graph.end.nei<-neighborhood(sparcc.graph, 0, nodes=sparcc.graph.ids.end)
# 提取自网络

sparcc.graph.cosm<-induced.subgraph(sparcc.graph, unlist(sparcc.graph.cosm.nei))

sparcc.graph.end<-induced.subgraph(sparcc.graph, unlist(sparcc.graph.end.nei))
# 去除孤立的节点

sparcc.graph.cosm <- delete.vertices(sparcc.graph.cosm, which(degree(sparcc.graph.cosm) < 1))

sparcc.graph.end <- delete.vertices(sparcc.graph.end, which(degree(sparcc.graph.end) < 1))
# 文件写入

write.graph(sparcc.graph.cosm, "sparcc_graph_cosm_50.graphml", format="graphml")

write.graph(sparcc.graph.end, "sparcc_graph_end_50.graphml", format="graphml")

sparcc网络可视化

这里为大家介绍一下可以改变节点边框粗细的函数。这个功能一般是没有。大家注意划重点。

###改变网络节点边框宽度 mycircle <- function(coords, v=NULL, params) { vertex.color <- params("vertex", "color") if (length(vertex.color) != 1 && !is.null(v)) { vertex.color <- vertex.color[v] } vertex.size <- 1/200 * params("vertex", "size") if (length(vertex.size) != 1 && !is.null(v)) { vertex.size <- vertex.size[v] } vertex.frame.color <- params("vertex", "frame.color") if (length(vertex.frame.color) != 1 && !is.null(v)) { vertex.frame.color <- vertex.frame.color[v] } vertex.frame.width <- params("vertex", "frame.width") if (length(vertex.frame.width) != 1 && !is.null(v)) { vertex.frame.width <- vertex.frame.width[v] }


mapply(coords[,1], coords[,2], vertex.color, vertex.frame.color,

vertex.size, vertex.frame.width,

FUN=function(x, y, bg, fg, size, lwd) {

symbols(x=x, y=y, bg=bg, fg=fg, lwd=lwd,

circles=size, add=TRUE, inches=FALSE)

})

}
add.vertex.shape("fcircle", clip=igraph.shape.noclip,

plot=mycircle, parameters=list(vertex.frame.color=1,

vertex.frame.width=1))
igraph = sparcc.graph.cosm

# igraph = sparcc.graph.end
l <- layout_nicely(igraph)

l <- norm_coords(l, ymin=-1, ymax=1, xmin=-1, xmax=1)

plot(igraph)
plot(igraph,

layout=l,

vertex.shape="fcircle",

vertex.label=NA ,

vertex.frame.color="#984EA3",

vertex.color="#984EA3",

vertex.size =2,

vertex.frame.alpha =0.5,

edge.width=0.5,

edge.lty=2,

edge.curved=F,

margin=c(0,0,0,0),

vertex.frame.width=5

)
igraph = sparcc.graph.end
l <- layout_nicely(igraph)

l <- norm_coords(l, ymin=-1, ymax=1, xmin=-1, xmax=1)

plot(igraph)
plot(igraph,

layout=l,

vertex.shape="fcircle",

vertex.label=NA ,

vertex.frame.color="#984EA3",

vertex.color="#984EA3",

vertex.size =2,

vertex.frame.alpha =0.5,

edge.width=0.5,

edge.lty=2,

edge.curved=F,

margin=c(0,0,0,0),

vertex.frame.width=5

)

展望

联合细菌真菌包括各种环境变量一同构建网络之后，我们有开发了基于ggplot的网络可视化方方案，最突出的特点就是简单，就像使用ggplot做散点图和折线图一样简单。尽情期待。（欢迎交流）

一文构建微生物群落sparcc网络

sparcc网络：过滤OTU

sparcc代码 矩阵计算

转入R中构建网络

sparcc网络可视化

展望

数据代码文件打包

相关推荐

sparcc代码矩阵计算