你以为我发现Seurat包PCA图与别人的方向不一致挖到C/C++就完了么?
学徒和学员已经陆续出师,是时候把生信技能树的舞台交给后辈了!
事情前提回顾:为什么你画的Seurat包PCA图与别人的方向不一致?
发现了是随机种子后,你难道没有其他的疑问了么?问题简直是一环扣一环,堪比央视几套的走进科学节目:
为什么有了随机种子,图还能上下左右颠倒? 为什么正好是上下颠倒(意味着绘图的纵坐标取值正负符号正好与别人的相反),左右颠倒(横坐标绘图的取值正负符号正好与别人的相反),而其他地方都一样呢? 随机不应该随机出来很不一样的么?
接着上一篇文章:为什么你画的Seurat包PCA图与别人的方向不一致?我挖到了irlba函数,里面有个随机参数maxit:maximum number of iterations,即运行某一个函数并且迭代多少次,因为数据分布模型未知,这在统计学里基本上是用来模拟一个理论分布然后根据这个分布算某一个比如p值这样的用的。
但是发现这个函数最后使用的C/C++代码…
你以为我挖到人家是C/C++写的就挖不动了么?后来,我找了一个快速学习班,一天就学会了C/C++。
首先,我们换一个思路,看一下绘图用到的函数,PCA图的本质就是散点图嘛
Seurat包的绘图函数代码:https://github.com/satijalab/seurat/blob/master/R/visualization.R
绘制PCA散点图使用的是:
p <- DimPlot(gbm, reduction = "pca",group.by="patient.id",pt.size = 1.2)
p
找到DimPlot绘图使用的横坐标和纵坐标用的值是什么,代码如下:
DimPlot <- function(
object,
dims = c(1, 2),
cells = NULL,
cols = NULL,
pt.size = NULL,
reduction = NULL,
group.by = NULL,
split.by = NULL,
shape.by = NULL,
order = NULL,
shuffle = FALSE,
seed = 1,
label = FALSE,
label.size = 4,
label.color = 'black',
label.box = FALSE,
repel = FALSE,
cells.highlight = NULL,
cols.highlight = '#DE2D26',
sizes.highlight = 1,
na.value = 'grey50',
ncol = NULL,
combine = TRUE
) {
if (length(x = dims) != 2) {
stop("'dims' must be a two-length vector")
}
reduction <- reduction %||% DefaultDimReduc(object = object)
cells <- cells %||% colnames(x = object)
if (isTRUE(x = shuffle)) {
set.seed(seed = seed)
cells <- sample(x = cells)
}
data <- Embeddings(object = object[[reduction]])[cells, dims]
data <- as.data.frame(x = data)
dims <- paste0(Key(object = object[[reduction]]), dims)
object[['ident']] <- Idents(object = object)
orig.groups <- group.by
group.by <- group.by %||% 'ident'
data[, group.by] <- object[[group.by]][cells, , drop = FALSE]
for (group in group.by) {
if (!is.factor(x = data[, group])) {
data[, group] <- factor(x = data[, group])
}
}
if (!is.null(x = shape.by)) {
data[, shape.by] <- object[[shape.by, drop = TRUE]]
}
if (!is.null(x = split.by)) {
data[, split.by] <- object[[split.by, drop = TRUE]]
}
plots <- lapply(
X = group.by,
FUN = function(x) {
plot <- SingleDimPlot(
data = data[, c(dims, x, split.by, shape.by)],
dims = dims,
col.by = x,
cols = cols,
pt.size = pt.size,
shape.by = shape.by,
order = order,
label = FALSE,
cells.highlight = cells.highlight,
cols.highlight = cols.highlight,
sizes.highlight = sizes.highlight,
na.value = na.value
)
if (label) {
plot <- LabelClusters(
plot = plot,
id = x,
repel = repel,
size = label.size,
split.by = split.by,
box = label.box,
color = label.color
)
}
if (!is.null(x = split.by)) {
plot <- plot + FacetTheme() +
facet_wrap(
facets = vars(!!sym(x = split.by)),
ncol = if (length(x = group.by) > 1 || is.null(x = ncol)) {
length(x = unique(x = data[, split.by]))
} else {
ncol
}
)
}
return(plot)
}
)
if (!is.null(x = split.by)) {
ncol <- 1
}
if (combine) {
plots <- wrap_plots(plots, ncol = orig.groups %iff% ncol)
}
return(plots)
}
上面绘图使用的是SingleDimPlot函数,继续查看:
SingleDimPlot <- function(
data,
dims,
col.by = NULL,
cols = NULL,
pt.size = NULL,
shape.by = NULL,
alpha.by = NULL,
order = NULL,
label = FALSE,
repel = FALSE,
label.size = 4,
cells.highlight = NULL,
cols.highlight = '#DE2D26',
sizes.highlight = 1,
na.value = 'grey50'
) {
pt.size <- pt.size %||% AutoPointSize(data = data)
if (length(x = dims) != 2) {
stop("'dims' must be a two-length vector")
}
if (!is.data.frame(x = data)) {
data <- as.data.frame(x = data)
}
if (is.character(x = dims) && !all(dims %in% colnames(x = data))) {
stop("Cannot find dimensions to plot in data")
} else if (is.numeric(x = dims)) {
dims <- colnames(x = data)[dims]
}
if (!is.null(x = cells.highlight)) {
highlight.info <- SetHighlight(
cells.highlight = cells.highlight,
cells.all = rownames(x = data),
sizes.highlight = sizes.highlight %||% pt.size,
cols.highlight = cols.highlight,
col.base = cols[1] %||% '#C3C3C3',
pt.size = pt.size
)
order <- highlight.info$plot.order
data$highlight <- highlight.info$highlight
col.by <- 'highlight'
pt.size <- highlight.info$size
cols <- highlight.info$color
}
if (!is.null(x = order) && !is.null(x = col.by)) {
if (typeof(x = order) == "logical") {
if (order) {
data <- data[order(data[, col.by]), ]
}
} else {
order <- rev(x = c(
order,
setdiff(x = unique(x = data[, col.by]), y = order)
))
data[, col.by] <- factor(x = data[, col.by], levels = order)
new.order <- order(x = data[, col.by])
data <- data[new.order, ]
if (length(x = pt.size) == length(x = new.order)) {
pt.size <- pt.size[new.order]
}
}
}
if (!is.null(x = col.by) && !col.by %in% colnames(x = data)) {
warning("Cannot find ", col.by, " in plotting data, not coloring plot")
col.by <- NULL
} else {
# col.index <- grep(pattern = col.by, x = colnames(x = data), fixed = TRUE)
col.index <- match(x = col.by, table = colnames(x = data))
if (grepl(pattern = '^\\d', x = col.by)) {
# Do something for numbers
col.by <- paste0('x', col.by)
} else if (grepl(pattern = '-', x = col.by)) {
# Do something for dashes
col.by <- gsub(pattern = '-', replacement = '.', x = col.by)
}
colnames(x = data)[col.index] <- col.by
}
if (!is.null(x = shape.by) && !shape.by %in% colnames(x = data)) {
warning("Cannot find ", shape.by, " in plotting data, not shaping plot")
}
if (!is.null(x = alpha.by) && !alpha.by %in% colnames(x = data)) {
warning(
"Cannot find alpha variable ",
alpha.by,
" in data, setting to NULL",
call. = FALSE,
immediate. = TRUE
)
alpha.by <- NULL
}
plot <- ggplot(data = data) +
geom_point(
mapping = aes_string(
x = dims[1],
y = dims[2],
color = paste0("`", col.by, "`"),
shape = shape.by,
alpha = alpha.by
),
size = pt.size
) +
guides(color = guide_legend(override.aes = list(size = 3))) +
labs(color = NULL)
if (label && !is.null(x = col.by)) {
plot <- LabelClusters(
plot = plot,
id = col.by,
repel = repel,
size = label.size
)
}
if (!is.null(x = cols)) {
if (length(x = cols) == 1 && (is.numeric(x = cols) || cols %in% rownames(x = brewer.pal.info))) {
scale <- scale_color_brewer(palette = cols, na.value = na.value)
} else if (length(x = cols) == 1 && (cols %in% c('alphabet', 'alphabet2', 'glasbey', 'polychrome', 'stepped'))) {
colors <- DiscretePalette(length(unique(data[[col.by]])), palette = cols)
scale <- scale_color_manual(values = colors, na.value = na.value)
} else {
scale <- scale_color_manual(values = cols, na.value = na.value)
}
plot <- plot + scale
}
plot <- plot + theme_cowplot()
return(plot)
}
在这里我们看到了绘图的核心,激动,使用的ggplot2,数据为data,几何对象geom_point,即点图:
plot <- ggplot(data = data) +
geom_point(
mapping = aes_string(
x = dims[1],
y = dims[2],
color = paste0("`", col.by, "`"),
shape = shape.by,
alpha = alpha.by
),
size = pt.size
) +
guides(color = guide_legend(override.aes = list(size = 3))) +
labs(color = NULL)
那么回过头去看data怎么得到的
reduction <- reduction %||% DefaultDimReduc(object = object)
cells <- cells %||% colnames(x = object)
data <- Embeddings(object = object[[reduction]])[cells, dims]
data <- as.data.frame(x = data)
找到关键的地方,省略花里胡哨的修饰得到如下:
cells <- colnames(x = gbm)
dims <- c(1,2)
reduction <- "pca"
data <- Embeddings(object = gbm[[reduction]])[cells, dims]
data <- as.data.frame(x = data)
dims <- paste0(Key(object = gbm[[reduction]]), dims)
plot <- ggplot(data = data) +
geom_point(
mapping = aes_string(
x = dims[1],
y = dims[2]
),
size = 1.2
)
plot
看看如何通过Embeddings函数计算得到坐标
library(irlba)
npcs = 50
# PCA分析是基于pbmc[["RNA"]]@scale.data这个数据的
temp <- gbm[["RNA"]]@scale.data
temp[1:4,1:4]
dim(temp)
pca.results <- irlba(A = t(x = temp), nv = npcs)
cell.embeddings <- pca.results$u %*% diag(pca.results$d)
data1 <- cell.embeddings[,1:2]
# 比较
head(data)
PC_1 PC_2
1001000173.G8 -0.03682265 2.699819
1001000173.D4 4.34592621 -17.398409
1001000173.B4 9.10210569 5.652316
1001000173.A2 11.40656422 0.578273
1001000173.E2 5.38025168 2.970779
1001000173.F6 6.52216224 1.860002
head(data1)
[,1] [,2]
[1,] -0.03682265 -2.699819
[2,] 4.34592621 17.398409
[3,] 9.10210569 -5.652316
[4,] 11.40656422 -0.578273
[5,] 5.38025168 -2.970779
[6,] 6.52216224 -1.860002
嗯,验证了上面得到的data得到的方法。
绕了一大圈,这下还是回到了irlba函数。至少我们现在知道了这两个坐标是PCA结果的什么值对吧。并且也看见绘制这幅图得核心代码,排除了有一个人的回答,显然不是因为绘图得原因导致得:
后面我花了很长时间去扣这个irlba包的irlba函数,总结一下:
这个函数使用的一个初始向量为norm()随机得到的,也就是用set.seed用到的地方,后面根据这个初识向量进行迭代运算 irlba函数其中有个函数norm2来自于Spatiotemporal这个包,但是这个包现在由于有问题已经被移除了。
irlba函数计算PCA主成分的核心是svd。
基于以上发现每次运算正负号不一样的地方在于svd中的u矩阵:
library(irlba)
npcs = 50
# PCA分析是基于pbmc[["RNA"]]@scale.data这个数据的
temp <- gbm[["RNA"]]@scale.data
temp[1:4,1:4]
dim(temp)
#set.seed(1)
pca.results <- irlba(A = t(x = temp), nv = npcs,verbose = T)
cell.embeddings <- pca.results$u %*% diag(pca.results$d) # pca.results$u每次运行正负不一样
pca.results$u[1:4,1:4]
总结:
既然这个irlba函数计算PCA主成分现在有问题了,那么为了避免这个坑,我们可以设置RunPCA函数中的一个参数approx = F,让其调用prcomp函数计算主成分分析。
运行结果如下,当然坐标范围已经不一样了:
gbm <- RunPCA(gbm, features = VariableFeatures(object = gbm),approx = F)
p <- DimPlot(gbm, reduction = "pca",group.by="patient.id",pt.size = 1.2)
p