R语言相关系数计算与可视化
欢迎来到医科研,这里是白介素2的读书笔记,跟我一起聊临床与科研的故事, 生物医学数据挖掘,R语言,TCGA、GEO数据挖掘。
如何解释相关系数
相关系数的值在 -1~1之间,1表示强正相关,0不相关,-1表示负相关 相关性矩阵可同时研究多个变量之间的相关性,其结果是一个同时展示一个变量与其它变量的表格
示例代码
x表示一个矩阵或者数据框
#cor(x, method = c("pearson", "kendall", "spearman"))
## 若存在缺失值,用以下代码
#cor(x, method = "pearson", use = "complete.obs")
简单相关性矩阵的计算
cor函数计算,但仅仅计算出相关系数,没有pvalue
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## 选择部分变量进入分析
my_data <- select(mtcars, mpg, disp, hp, drat, wt, qsec)
head(my_data)
## mpg disp hp drat wt qsec
## Mazda RX4 21.0 160 110 3.90 2.620 16.46
## Mazda RX4 Wag 21.0 160 110 3.90 2.875 17.02
## Datsun 710 22.8 108 93 3.85 2.320 18.61
## Hornet 4 Drive 21.4 258 110 3.08 3.215 19.44
## Hornet Sportabout 18.7 360 175 3.15 3.440 17.02
## Valiant 18.1 225 105 2.76 3.460 20.22
## 计算相关性矩阵
cor_1 <- round(cor(my_data), 2)
cor_1
## mpg disp hp drat wt qsec
## mpg 1.00 -0.85 -0.78 0.68 -0.87 0.42
## disp -0.85 1.00 0.79 -0.71 0.89 -0.43
## hp -0.78 0.79 1.00 -0.45 0.66 -0.71
## drat 0.68 -0.71 -0.45 1.00 -0.71 0.09
## wt -0.87 0.89 0.66 -0.71 1.00 -0.17
## qsec 0.42 -0.43 -0.71 0.09 -0.17 1.00
Hmisc包中的rcorr函数可同时返回相关系数,以及pvalue
简单版本
#rcorr(x, type = c("pearson","spearman"))##代码还是非常简单
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
cor_2 <- rcorr(as.matrix(my_data))
cor_2
## mpg disp hp drat wt qsec
## mpg 1.00 -0.85 -0.78 0.68 -0.87 0.42
## disp -0.85 1.00 0.79 -0.71 0.89 -0.43
## hp -0.78 0.79 1.00 -0.45 0.66 -0.71
## drat 0.68 -0.71 -0.45 1.00 -0.71 0.09
## wt -0.87 0.89 0.66 -0.71 1.00 -0.17
## qsec 0.42 -0.43 -0.71 0.09 -0.17 1.00
##
## n= 32
##
##
## P
## mpg disp hp drat wt qsec
## mpg 0.0000 0.0000 0.0000 0.0000 0.0171
## disp 0.0000 0.0000 0.0000 0.0000 0.0131
## hp 0.0000 0.0000 0.0100 0.0000 0.0000
## drat 0.0000 0.0000 0.0100 0.0000 0.6196
## wt 0.0000 0.0000 0.0000 0.0000 0.3389
## qsec 0.0171 0.0131 0.0000 0.6196 0.3389
参数解释
r 表示相关性矩阵 n表示矩阵的观测 P表示pvalue,检验水平
提取相应的值
str(cor_2)#同样的是一个列表结构
## List of 3
## $ r: num [1:6, 1:6] 1 -0.848 -0.776 0.681 -0.868 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6] "mpg" "disp" "hp" "drat" ...
## .. ..$ : chr [1:6] "mpg" "disp" "hp" "drat" ...
## $ n: int [1:6, 1:6] 32 32 32 32 32 32 32 32 32 32 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6] "mpg" "disp" "hp" "drat" ...
## .. ..$ : chr [1:6] "mpg" "disp" "hp" "drat" ...
## $ P: num [1:6, 1:6] NA 9.38e-10 1.79e-07 1.78e-05 1.29e-10 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6] "mpg" "disp" "hp" "drat" ...
## .. ..$ : chr [1:6] "mpg" "disp" "hp" "drat" ...
## - attr(*, "class")= chr "rcorr"
##提取pvalue
cor_2$P
## mpg disp hp drat wt
## mpg NA 9.380328e-10 1.787835e-07 1.776240e-05 1.293958e-10
## disp 9.380328e-10 NA 7.142679e-08 5.282022e-06 1.222311e-11
## hp 1.787835e-07 7.142679e-08 NA 9.988772e-03 4.145827e-05
## drat 1.776240e-05 5.282022e-06 9.988772e-03 NA 4.784260e-06
## wt 1.293958e-10 1.222311e-11 4.145827e-05 4.784260e-06 NA
## qsec 1.708199e-02 1.314404e-02 5.766253e-06 6.195826e-01 3.388683e-01
## qsec
## mpg 1.708199e-02
## disp 1.314404e-02
## hp 5.766253e-06
## drat 6.195826e-01
## wt 3.388683e-01
## qsec NA
提取相关系数矩阵
cor_2$r
## mpg disp hp drat wt qsec
## mpg 1.0000000 -0.8475514 -0.7761684 0.68117191 -0.8676594 0.41868403
## disp -0.8475514 1.0000000 0.7909486 -0.71021393 0.8879799 -0.43369788
## hp -0.7761684 0.7909486 1.0000000 -0.44875912 0.6587479 -0.70822339
## drat 0.6811719 -0.7102139 -0.4487591 1.00000000 -0.7124406 0.09120476
## wt -0.8676594 0.8879799 0.6587479 -0.71244065 1.0000000 -0.17471588
## qsec 0.4186840 -0.4336979 -0.7082234 0.09120476 -0.1747159 1.00000000
整理数据格式-方便可视化
该部分内容引用至custom function for convinient formatting of the correlation matrix
输入参数1:cor_r=相关系数矩阵
输入参数2:cor_p=pvalue矩阵
flat_cor_mat <- function(cor_r, cor_p){
#作者编写了一个用于整理相关性矩阵的函数
#into a table with 4 columns containing :
# Column 1 : 行名 (变量1)
# Column 2 : 列名(变量2)
# Column 3 : 相关系数
# Column 4 : pvalue
library(tidyr)
library(tibble)
cor_r <- rownames_to_column(as.data.frame(cor_r), var = "row")##行名变为列
cor_r <- gather(cor_r, column, cor, -1)##除row列,其它各列gather,将值提取为变量
cor_p <- rownames_to_column(as.data.frame(cor_p), var = "row")
cor_p <- gather(cor_p, column, p, -1)
cor_p_matrix <- left_join(cor_r, cor_p, by = c("row", "column"))##left_join优先保留左侧
cor_p_matrix##得到矩阵
}
应用函数示例
cor_3 <- rcorr(as.matrix(mtcars[, 1:7]))##
cor_3
## mpg cyl disp hp drat wt qsec
## mpg 1.00 -0.85 -0.85 -0.78 0.68 -0.87 0.42
## cyl -0.85 1.00 0.90 0.83 -0.70 0.78 -0.59
## disp -0.85 0.90 1.00 0.79 -0.71 0.89 -0.43
## hp -0.78 0.83 0.79 1.00 -0.45 0.66 -0.71
## drat 0.68 -0.70 -0.71 -0.45 1.00 -0.71 0.09
## wt -0.87 0.78 0.89 0.66 -0.71 1.00 -0.17
## qsec 0.42 -0.59 -0.43 -0.71 0.09 -0.17 1.00
##
## n= 32
##
##
## P
## mpg cyl disp hp drat wt qsec
## mpg 0.0000 0.0000 0.0000 0.0000 0.0000 0.0171
## cyl 0.0000 0.0000 0.0000 0.0000 0.0000 0.0004
## disp 0.0000 0.0000 0.0000 0.0000 0.0000 0.0131
## hp 0.0000 0.0000 0.0000 0.0100 0.0000 0.0000
## drat 0.0000 0.0000 0.0000 0.0100 0.0000 0.6196
## wt 0.0000 0.0000 0.0000 0.0000 0.0000 0.3389
## qsec 0.0171 0.0004 0.0131 0.0000 0.6196 0.3389
my_cor_matrix <- flat_cor_mat(cor_3$r, cor_3$P)
head(my_cor_matrix)
## row column cor p
## 1 mpg mpg 1.0000000 NA
## 2 cyl mpg -0.8521620 6.112688e-10
## 3 disp mpg -0.8475514 9.380328e-10
## 4 hp mpg -0.7761684 1.787835e-07
## 5 drat mpg 0.6811719 1.776240e-05
## 6 wt mpg -0.8676594 1.293958e-10
相关性矩阵的可视化
symnum()函数,使用符号象征性的表示相关系数 例如一个简单的格式如下:
symnum(x, cutpoints = c(0.3, 0.6, 0.8, 0.9, 0.95), symbols = c(" “,”.“,”,“,”+“,”*“,”B"), abbr.colnames = TRUE)
其中x表示相关性矩阵
cutpoints表示相关系数的界值,0-0.3用空格表示,0.3-06以.表示 symbols符号
abbr.colnames:列名是否缩写
cor_4 <- cor(mtcars[1:6])
symnum(cor_4, abbr.colnames = FALSE)
## mpg cyl disp hp drat wt
## mpg 1
## cyl + 1
## disp + * 1
## hp , + , 1
## drat , , , . 1
## wt + , + , , 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
corrplot函数的可视化
简单的形式如下 corrplot(corr, method=“circle”) corr指的是可视化矩阵 method指可视化方法,包括“circle”, “square”, “ellipse”, “number”, “shade”, “color”, “pie”.
M<-cor(mtcars)
head(round(M,2))
## mpg cyl disp hp drat wt qsec vs am gear carb
## mpg 1.00 -0.85 -0.85 -0.78 0.68 -0.87 0.42 0.66 0.60 0.48 -0.55
## cyl -0.85 1.00 0.90 0.83 -0.70 0.78 -0.59 -0.81 -0.52 -0.49 0.53
## disp -0.85 0.90 1.00 0.79 -0.71 0.89 -0.43 -0.71 -0.59 -0.56 0.39
## hp -0.78 0.83 0.79 1.00 -0.45 0.66 -0.71 -0.72 -0.24 -0.13 0.75
## drat 0.68 -0.70 -0.71 -0.45 1.00 -0.71 0.09 0.44 0.71 0.70 -0.09
## wt -0.87 0.78 0.89 0.66 -0.71 1.00 -0.17 -0.55 -0.69 -0.58 0.43
circle法
require(corrplot)
## Loading required package: corrplot
## corrplot 0.84 loaded
corrplot(M, method = "circle")
corrplot(M, method = "ellipse")
corrplot(M, method = "pie")
corrplot(M, method = "color")
corrplot(M, method = "number")
输出布局
“full” (default) :完整矩阵 “upper”: 上三角 “lower”: 下三角
# upper triangular
corrplot(M, type = "upper")
#lower triangular
corrplot(M, type = "lower")
# correlogram with hclust reordering
corrplot(M, order = "hclust")
corrplot(M, type = "upper", order = "hclust")
# Change background color to lightgreen and color of the circles to darkorange and steel blue
corrplot(M, type = "upper", order = "hclust", col = c("darkorange", "steelblue"),
bg = "lightgreen")
# use "colorRampPallete" to obtain contionus color scales
col <- colorRampPalette(c("darkorange", "white", "steelblue"))(20)
corrplot(M, type = "upper", order = "hclust", col = col)
# Or use "RColorBrewer" package
library(RColorBrewer)
corrplot(M, type = "upper", order = "hclust",
col = brewer.pal(n = 9, name = "PuOr"), bg = "darkgreen")
## tl.col文字标签颜色设置,tl.srt文字旋转角度
corrplot(M, type = "upper", order = "hclust", tl.col = "darkblue", tl.srt = 45)
# 标记出设定检验水平下有意义的点
cor_5 <- rcorr(as.matrix(mtcars))
M <- cor_5$r
p_mat <- cor_5$P
corrplot(M, type = "upper", order = "hclust",
p.mat = p_mat, sig.level = 0.01)
# Leave blank on no significant coefficient
corrplot(M, type = "upper", order = "hclust",
p.mat = p_mat, sig.level = 0.05, insig = "blank")
参考资料