Seurat4.0系列教程17：Mixscape / 四六文摘

概述

本教程演示了如何使用mixscape 分析 single-cell pooled CRSIPR screens。我们介绍新的seurat功能：

计算每个细胞的特异性扰动特征。
识别和去除"逃逸"CRISPR扰动的细胞。
可视化跨不同扰动的相似性/差异。

加载所需的包

# Load packages. library(Seurat) library(SeuratData) library(ggplot2) library(patchwork) library(scales) library(dplyr) library(reshape2)


# Download dataset using SeuratData.

InstallData(ds = "thp1.eccite")

# Setup custom theme for plotting. custom_theme <- theme( plot.title = element_text(size=16, hjust = 0.5), legend.key.size = unit(0.7, "cm"), legend.text = element_text(size = 14))

加载包含 ECCITE-seq 数据集的seurat对象

我们使用一个包含111个 gRNA 的ECCITE-seq数据集，该数据集来自受刺激的THP-1细胞，发表在bioRxiv 2020^[1]。此数据集可从SeuratData包下载。

# Load object. eccite <- LoadData(ds = "thp1.eccite")

# Normalize protein. eccite <- NormalizeData( object = eccite, assay = "ADT", normalization.method = "CLR", margin = 2)

基于RNA的聚类是由混淆的变异驱动的

在这里，我们按照标准的Seurat工作流，根据细胞的基因表达特征对细胞进行聚类。我们期望获得特定扰动的亚群，但是我们看到聚类主要由细胞周期和重复的 ID 驱动。只观察到一个特定扰动的群，其中包含表达 IFN-gamma通路的gRNA的细胞。

# Prepare RNA assay for dimensionality reduction: # Normalize data, find variable features and scale data. DefaultAssay(object = eccite) <- 'RNA' eccite <- NormalizeData(object = eccite) %>% FindVariableFeatures() %>% ScaleData()


# Run Principle Component Analysis (PCA) to reduce the dimensionality of the data.

eccite <- RunPCA(object = eccite)
# Run Uniform Manifold Approximation and Projection (UMAP) to visualize clustering in 2-D.

eccite <- RunUMAP(object = eccite, dims = 1:40)
# Generate plots to check if clustering is driven by biological replicate ID, 

# cell cycle phase or target gene class.

p1 <- DimPlot(

  object = eccite, 

  group.by = 'replicate', 

  label = F, 

  pt.size = 0.2, 

  reduction = "umap", cols = "Dark2", repel = T) +

  scale_color_brewer(palette = "Dark2") +

  ggtitle("Biological Replicate") +

  xlab("UMAP 1") +

  ylab("UMAP 2") +

  custom_theme
p2 <- DimPlot(

  object = eccite, 

  group.by = 'Phase', 

  label = F, pt.size = 0.2, 

  reduction = "umap", repel = T) + 

  ggtitle("Cell Cycle Phase") +

  ylab("UMAP 2") +

  xlab("UMAP 1") +

  custom_theme
p3 <- DimPlot(

  object = eccite, 

  group.by = 'crispr', 

  pt.size = 0.2, 

  reduction = "umap", 

  split.by = "crispr", 

  ncol = 1, 

  cols = c("grey39","goldenrod3")) + 

  ggtitle("Perturbation Status") +

  ylab("UMAP 2") +

  xlab("UMAP 1") +

  custom_theme

# Visualize plots. ((p1 / p2 + plot_layout(guides = 'auto')) | p3 )

计算局部扰动特征可减轻混淆效果

为了计算局部扰动特征，我们设置non-targeting Nearest Neighbors（NNs）相当于 k=20 的数量，我们建议用户从以下范围选择 k：20 < k < 30。直觉上，用户不希望将 k 设置为非常小或大的数字，因为这很可能不会从数据集中删除技术误差。使用PRTB特征来聚类细胞去除了所有技术误差，并显示一个额外的特定扰动的群。

# Calculate perturbation signature (PRTB). eccite<- CalcPerturbSig( object = eccite, assay = "RNA", slot = "data", gd.class ="gene", nt.cell.class = "NT", reduction = "pca", ndims = 40, num.neighbors = 20, split.by = "replicate", new.assay.name = "PRTB")


# Prepare PRTB assay for dimensionality reduction: 

# Normalize data, find variable features and center data.

DefaultAssay(object = eccite) <- 'PRTB'
# Use variable features from RNA assay.

VariableFeatures(object = eccite) <- VariableFeatures(object = eccite[["RNA"]])

eccite <- ScaleData(object = eccite, do.scale = F, do.center = T)
# Run PCA to reduce the dimensionality of the data.

eccite <- RunPCA(object = eccite, reduction.key = 'prtbpca', reduction.name = 'prtbpca')
# Run UMAP to visualize clustering in 2-D.

eccite <- RunUMAP(

  object = eccite, 

  dims = 1:40, 

  reduction = 'prtbpca', 

  reduction.key = 'prtbumap', 

  reduction.name = 'prtbumap')
# Generate plots to check if clustering is driven by biological replicate ID, 

# cell cycle phase or target gene class.

q1 <- DimPlot(

  object = eccite, 

  group.by = 'replicate', 

  reduction = 'prtbumap', 

  pt.size = 0.2, cols = "Dark2", label = F, repel = T) +

  scale_color_brewer(palette = "Dark2") +

  ggtitle("Biological Replicate") +

  ylab("UMAP 2") +

  xlab("UMAP 1") +

  custom_theme
q2 <- DimPlot(

  object = eccite, 

  group.by = 'Phase', 

  reduction = 'prtbumap', 

  pt.size = 0.2, label = F, repel = T) +

  ggtitle("Cell Cycle Phase") +

  ylab("UMAP 2") +

  xlab("UMAP 1") + 

  custom_theme
q3 <- DimPlot(

  object = eccite,

  group.by = 'crispr',

  reduction = 'prtbumap', 

  split.by = "crispr", 

  ncol = 1, 

  pt.size = 0.2, 

  cols = c("grey39","goldenrod3")) +

  ggtitle("Perturbation Status") +

  ylab("UMAP 2") +

  xlab("UMAP 1") +

  custom_theme

# Visualize plots. (q1 / q2 + plot_layout(guides = 'auto') | q3)

Mixscape 识别没有检测到扰动的细胞

在这里，我们假设每个目标基因类是两个高斯分布的混合物，一个代表敲除（KO），另一个代表非扰动（NP）细胞。我们进一步假设 NP 细胞的分布与表示非靶向 gRNA （NT）的细胞的分布相同，我们尝试使用Mixscape 包中的normalmixEM()功能来估计 KO 细胞的分布。接下来，我们计算细胞属于KO分布的概率，并将概率高于0.5的细胞归类为KO。应用此方法，我们识别出 11 个目标基因类别中的 KOs，并检测每个类别中 gRNA 靶向效率的变化。

# Run mixscape. eccite <- RunMixscape( object = eccite, assay = "PRTB", slot = "scale.data", labels = "gene", nt.class.name = "NT", min.de.genes = 5, iter.num = 10, de.assay = "RNA", verbose = F, prtb.type = "KO")


# Calculate percentage of KO cells for all target gene classes.

df <- prop.table(table(eccite$mixscape_class.global, eccite$NT),2)
df2 <- reshape2::melt(df)

df2$Var2 <- as.character(df2$Var2)

test <- df2[which(df2$Var1 == "KO"),]

test <- test[order(test$value, decreasing = T),]

new.levels <- test$Var2

df2$Var2 <- factor(df2$Var2, levels = new.levels )

df2$Var1 <- factor(df2$Var1, levels = c("NT", "NP", "KO"))

df2$gene <- sapply(as.character(df2$Var2), function(x) strsplit(x, split = "g")[[1]][1])

df2$guide_number <- sapply(as.character(df2$Var2), 

                           function(x) strsplit(x, split = "g")[[1]][2])

df3 <- df2[-c(which(df2$gene == "NT")),]
p1 <- ggplot(df3, aes(x = guide_number, y = value*100, fill= Var1)) +

  geom_bar(stat= "identity") +

  theme_classic()+

  scale_fill_manual(values = c("grey49", "grey79","coral1")) + 

  ylab("% of cells") +

  xlab("sgRNA")

p1 + theme(axis.text.x = element_text(size = 18, hjust = 1), axis.text.y = element_text(size = 18), axis.title = element_text(size = 16), strip.text = element_text(size=16, face = "bold")) + facet_wrap(vars(gene),ncol = 5, scales = "free") + labs(fill = "mixscape class") +theme(legend.title = element_text(size = 14), legend.text = element_text(size = 12))

检查mixscape结果

为了确保mixscape为细胞分配正确的扰动状态，我们可以使用下面的功能来查看目标基因类别（例如 IFNGR2）中细胞的扰动分数分布和概率，并将其与 NT 细胞的概率进行比较。此外，我们可以进行差异表达（DE）分析，并表明只有 IFNGR2 基因KO 的细胞降低了 IFNG 通路的表达。最后，作为一项独立的检查，我们可以查看 NP 和 KO 细胞中的 PD-L1 蛋白质表达值，寻找已知的调节 PD-L1的目标基因。

# Explore the perturbation scores of cells. PlotPerturbScore(object = eccite, target.gene.ident = "IFNGR2", group.by = "mixscape_class", col = "coral2") +labs(fill = "mixscape class")

# Inspect the posterior probability values in NP and KO cells. VlnPlot(eccite, "mixscape_class_p_ko", idents = c("NT", "IFNGR2 KO", "IFNGR2 NP")) + theme(axis.text.x = element_text(angle = 0, hjust = 0.5),axis.text = element_text(size = 16) ,plot.title = element_text(size = 20)) + NoLegend() + ggtitle("mixscape posterior probabilities")

# Run DE analysis and visualize results on a heatmap ordering cells by their posterior # probability values. Idents(object = eccite) <- "gene" MixscapeHeatmap(object = eccite, ident.1 = "NT", ident.2 = "IFNGR2", balanced = F, assay = "RNA", max.genes = 20, angle = 0, group.by = "mixscape_class", max.cells.group = 300, size=6.5) + NoLegend() +theme(axis.text.y = element_text(size = 16))

# Show that only IFNG pathway KO cells have a reduction in PD-L1 protein expression. VlnPlot( object = eccite, features = "adt_PDL1", idents = c("NT","JAK2","STAT1","IFNGR1","IFNGR2", "IRF1"), group.by = "gene", pt.size = 0.2, sort = T, split.by = "mixscape_class.global", cols = c("coral3","grey79","grey39")) + ggtitle("PD-L1 protein") + theme(axis.text.x = element_text(angle = 0, hjust = 0.5), plot.title = element_text(size = 20), axis.text = element_text(size = 16))

通过线性判别分析（LDA）可视化扰动响应

我们使用 LDA 作为一种降维方法来可视化特定扰动的集群。LDA正试图利用基因表达和标签作为输入，最大限度地提高已知标记（mixscape类别）的可分离性。

# Remove non-perturbed cells and run LDA to reduce the dimensionality of the data. Idents(eccite) <- "mixscape_class.global" sub <- subset(eccite, idents = c("KO", "NT"))


# Run LDA.

sub <- MixscapeLDA(

  object = sub, 

  assay = "RNA", 

  pc.assay = "PRTB", 

  labels = "gene", 

  nt.label = "NT", 

  npcs = 10, 

  logfc.threshold = 0.25, 

  verbose = F)
# Use LDA results to run UMAP and visualize cells on 2-D. 

# Here, we note that the number of the dimensions to be used is equal to the number of 

# labels minus one (to account for NT cells).

sub <- RunUMAP(

  object = sub,

  dims = 1:11,

  reduction = 'lda',

  reduction.key = 'ldaumap',

  reduction.name = 'ldaumap')
# Visualize UMAP clustering results.

Idents(sub) <- "mixscape_class"

sub$mixscape_class <- as.factor(sub$mixscape_class)
# Set colors for each perturbation.

col = setNames(object = hue_pal()(12),nm = levels(sub$mixscape_class))

names(col) <- c(names(col)[1:7], "NT", names(col)[9:12])

col[8] <- "grey39"
p <- DimPlot(object = sub, 

             reduction = "ldaumap", 

             repel = T, 

             label.size = 5, 

             label = T, 

             cols = col) + NoLegend()

p2 <- p+ scale_color_manual(values=col, drop=FALSE) + ylab("UMAP 2") + xlab("UMAP 1") + custom_theme p2

文中链接

[1]

2020: https://www.biorxiv.org/content/10.1101/2020.06.28.175596v1

Seurat4.0系列教程17：Mixscape

概述