随机森林分析-Random Forest
随机森林分析是一种使用决策树的方法对变量重要性进行评估的分析方法。相信不少人或多或少的都有所了解。具体随机森林分析的原理可以参考百度百科和R语言实战,本节我们只讲如何在R中实现随机森林分析并绘图。
一般来说,通过随机森林评估其它变量对目标变量的重要性可以因目标变量类型的差异而分为两种类型的随机森林。第一种类型(classification),当目标变量是分组/处理(factors)时,此时通过随机森林分析能够预测分组的准确性(即袋外误差OOB[out-of-big] error的高低)以及其它变量对该分组的重要性;第二种类型(regression),当目标变量是连续(数值)变量时,此时通过随机森林分析能够预测其它变量构成的模型对目标变量的解释量(% Var Explained)以及其它变量对该目标变量的重要性。接下来,我们分别阐述这两种类型的随机森林在R中实现的过程。
1、目标变量为分组/处理
#rm(list=ls())
library(tidyverse)
library("randomForest")
library("rfUtilities")
library("rfPermute")
加载数据如下
load("RFdata1.RData")
head(RFdata3[,1:6])
RFdata3 %>%
mutate(treatments = as.factor(treatments)) -> RFdata3.1#目标变量转变为factors
set.seed(123)
treat_rf <- randomForest(treatments ~ ., data= RFdata3.1,
importance=TRUE,proximity=TRUE)
treat_rf
set.seed(123)
treat_perm <- rf.significance(treat_rf, RFdata3.1[,-1], nperm=99, ntree=501)
treat_perm
treat_rfP<- rfPermute(treatments ~ ., data = RFdata3.1, ntree = 500,
na.action = na.omit, nrep = 100,num.cores = 1)
treat_dat <- rp.importance(treat_rfP, sort.by = NULL, decreasing = TRUE)
treat_dat[,c("MeanDecreaseAccuracy","MeanDecreaseAccuracy.pval")] %>%
as_tibble(rownames = "names") %>%
mutate(label = if_else(MeanDecreaseAccuracy.pval<0.001,"***",
if_else(MeanDecreaseAccuracy.pval<0.01,"**",
if_else(MeanDecreaseAccuracy.pval<0.05,"*","ns")))) %>%
arrange(MeanDecreaseAccuracy) %>%
mutate(group = if_else(label=="ns","In_sig","Sig"),
names = forcats::fct_inorder(names)) %>%
ggplot(aes(x = names, y = MeanDecreaseAccuracy))+
geom_bar(aes(fill = group),stat = "identity")+
geom_text(aes(y = MeanDecreaseAccuracy+0.5,label = label))+
labs(x = "", y = "% Mean decrease accuracy")+
coord_flip()
2、目标变量为连续变量/数值
load("RFdata2.RData")
head(RFdata2)
set.seed(123)
richness_rf <- randomForest(Richness ~ ., data= RFdata2,
importance=TRUE,proximity=TRUE)
richness_rf
set.seed(123)
richness_perm <- rf.significance(richness_rf, RFdata2[,-1], nperm=99, ntree=501)
richness_perm
set.seed(123)
richness_rfP<- rfPermute(Richness ~ ., data = RFdata2, ntree = 500,
na.action = na.omit, nrep = 100,num.cores = 1)
richness_dat <- rp.importance(richness_rfP, sort.by = NULL, decreasing = TRUE)
richness_dat %>%
as_tibble(rownames = "names") %>%
data.frame() %>%
mutate(label = if_else(X.IncMSE.pval < 0.001,"***",
if_else(X.IncMSE.pval <0.01,"**",
if_else(X.IncMSE.pval<0.05,"*","ns"))),
X.IncMSE = as.numeric(X.IncMSE)) %>%
arrange(X.IncMSE) %>%
mutate(group = if_else(label=="ns","In_sig","Sig"),
names = forcats::fct_inorder(names)) %>%
ggplot(aes(x = names, y = X.IncMSE))+
geom_bar(aes(fill = group),stat = "identity")+
geom_text(aes(y = X.IncMSE + 1,label = label))+
labs(x = "", y = "%IncMSE")+
coord_flip()
赞 (0)