Seurat - Guided Clustering Tutorial of 2,700 PBMCs

Binder

This notebook was created using the codes and documentations from the following Seurat tutorial: Seurat - Guided Clustering Tutorial. This notebook provides a basic overview of Seurat including the the following:

  • QC and pre-processing
  • Dimension reduction
  • Clustering
  • Differential expression

Downloading data from 10X Genomics

[1]:
system("cd /tmp;\
        wget -q http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz;\
        tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz")

Setup the Seurat Object

[2]:
library(dplyr)
library(Seurat)
library(patchwork)

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "/tmp/filtered_gene_bc_matrices/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
An object of class Seurat
13714 features across 2700 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
[3]:
# Lets examine a few genes in the first thirty cells
pbmc.data[c("CD3D", "TCL1A", "MS4A1"), 1:30]
3 x 30 sparse Matrix of class "dgCMatrix"

CD3D  4 . 10 . . 1 2 3 1 . . 2 7 1 . . 1 3 . 2  3 . . . . . 3 4 1 5
TCL1A . .  . . . . . . 1 . . . . . . . . . . .  . 1 . . . . . . . .
MS4A1 . 6  . . . . . . 1 1 1 . . . . . . . . . 36 1 2 . . 2 . . . .
[4]:
dense.size <- object.size(as.matrix(pbmc.data))
dense.size
709591472 bytes
[5]:
sparse.size <- object.size(pbmc.data)
sparse.size
29905192 bytes
[6]:
dense.size/sparse.size
23.7 bytes

QC and selecting cells for further analysis

[7]:
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
[8]:
# Show QC metrics for the first 5 cells
head(pbmc@meta.data, 5)
A data.frame: 5 × 4
orig.identnCount_RNAnFeature_RNApercent.mt
<fct><dbl><int><dbl>
AAACATACAACCAC-1pbmc3k2419 7793.0177759
AAACATTGAGCTAC-1pbmc3k490313523.7935958
AAACATTGATCAGC-1pbmc3k314711290.8897363
AAACCGTGCTTCCG-1pbmc3k2639 9601.7430845
AAACCGTGTATGCG-1pbmc3k 980 5211.2244898
[9]:
# Visualize QC metrics as a violin plot
options(repr.plot.width=20, repr.plot.height=8)
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
../_images/examples_seurat_pbmc_v0.0.1_12_0.png
[10]:
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
options(repr.plot.width=20, repr.plot.height=10)
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
../_images/examples_seurat_pbmc_v0.0.1_13_0.png
[11]:
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

Normalizing the data

[12]:
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
[13]:
pbmc <- NormalizeData(pbmc)

Identification of highly variable features (feature selection)

[14]:
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = FALSE)
plot1 + plot2
../_images/examples_seurat_pbmc_v0.0.1_19_0.png

Scaling the data

[15]:
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)

Perform linear dimensional reduction

[16]:
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
[17]:
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
PC_ 1
Positive:  CST3, TYROBP, LST1, AIF1, FTL
Negative:  MALAT1, LTB, IL32, IL7R, CD2
PC_ 2
Positive:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1
Negative:  NKG7, PRF1, CST7, GZMB, GZMA
PC_ 3
Positive:  HLA-DQA1, CD79A, CD79B, HLA-DQB1, HLA-DPB1
Negative:  PPBP, PF4, SDPR, SPARC, GNG11
PC_ 4
Positive:  HLA-DQA1, CD79B, CD79A, MS4A1, HLA-DQB1
Negative:  VIM, IL7R, S100A6, IL32, S100A8
PC_ 5
Positive:  GZMB, NKG7, S100A8, FGFBP2, GNLY
Negative:  LTB, IL7R, CKB, VIM, MS4A7
[18]:
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
../_images/examples_seurat_pbmc_v0.0.1_25_0.png
[19]:
options(repr.plot.width=7, repr.plot.height=7)
DimPlot(pbmc, reduction = "pca")
../_images/examples_seurat_pbmc_v0.0.1_26_0.png
[20]:
options(repr.plot.width=14, repr.plot.height=7)
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
../_images/examples_seurat_pbmc_v0.0.1_27_0.png
[21]:
options(repr.plot.width=12, repr.plot.height=25)
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE,ncol = 2)
../_images/examples_seurat_pbmc_v0.0.1_28_0.png
../_images/examples_seurat_pbmc_v0.0.1_28_1.png

Determine the ‘dimensionality’ of the dataset

[22]:
# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
[23]:
options(repr.plot.width=12, repr.plot.height=7)
JackStrawPlot(pbmc, dims = 1:15)
../_images/examples_seurat_pbmc_v0.0.1_31_0.png
[24]:
options(repr.plot.width=12, repr.plot.height=7)
ElbowPlot(pbmc)
../_images/examples_seurat_pbmc_v0.0.1_32_0.png

Cluster the cells

[25]:
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.5)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 2638
Number of edges: 96033

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8720
Number of communities: 9
Elapsed time: 0 seconds
[26]:
# Look at cluster IDs of the first 5 cells
head(Idents(pbmc), 5)
AAACATACAACCAC-1
1
AAACATTGAGCTAC-1
3
AAACATTGATCAGC-1
1
AAACCGTGCTTCCG-1
2
AAACCGTGTATGCG-1
6
Levels:
  1. '0'
  2. '1'
  3. '2'
  4. '3'
  5. '4'
  6. '5'
  7. '6'
  8. '7'
  9. '8'

Run non-linear dimensional reduction (UMAP/tSNE)

[27]:
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
pbmc <- RunUMAP(pbmc, dims = 1:10)
[28]:
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
options(repr.plot.width=10, repr.plot.height=8)
DimPlot(pbmc, reduction = "umap",label=TRUE)
../_images/examples_seurat_pbmc_v0.0.1_38_0.png

Finding differentially expressed features (cluster biomarkers)

[29]:
# find all markers of cluster 1
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
A data.frame: 5 × 5
p_valavg_logFCpct.1pct.2p_val_adj
<dbl><dbl><dbl><dbl><dbl>
IL321.894810e-920.83738720.9480.4642.598542e-88
LTB7.953303e-890.89211700.9810.6421.090716e-84
CD3D1.655937e-700.64362860.9190.4312.270951e-66
IL7R3.688893e-680.81470820.7470.3255.058947e-64
LDHB2.292819e-670.62531100.9500.6133.144372e-63
[30]:
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
A data.frame: 5 × 5
p_valavg_logFCpct.1pct.2p_val_adj
<dbl><dbl><dbl><dbl><dbl>
FCGR3A7.583625e-2092.9631440.9750.0371.040018e-204
IFITM32.500844e-1992.6981870.9750.0463.429657e-195
CFD1.763722e-1952.3623810.9380.0372.418768e-191
CD684.612171e-1922.0873660.9260.0366.325132e-188
RP11-290F20.31.846215e-1881.8862880.8400.0162.531900e-184
[31]:
# find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25);
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC);
A grouped_df: 18 × 7
p_valavg_logFCpct.1pct.2p_val_adjclustergene
<dbl><dbl><dbl><dbl><dbl><fct><chr>
1.963031e-1070.73006350.9010.5942.692101e-1030LDHB
1.606796e-820.92191350.4360.110 2.203560e-780CCR7
7.953303e-890.89211700.9810.642 1.090716e-841LTB
1.851623e-600.85860340.4220.110 2.539316e-561AQP3
0.000000e+003.86087330.9960.215 0.000000e+002S100A9
0.000000e+003.79664030.9750.121 0.000000e+002S100A8
0.000000e+002.98758330.9360.041 0.000000e+003CD79A
9.481783e-2712.48949320.6220.0221.300332e-2663TCL1A
2.958181e-1892.12205550.9850.2404.056849e-1854CCL5
2.568683e-1582.04616870.5870.0593.522691e-1544GZMK
3.511192e-1842.29549310.9750.1344.815249e-1805FCGR3A
2.025672e-1252.13881251.0000.3152.778007e-1215LST1
7.949981e-2693.34622780.9610.0681.090260e-2646GZMB
3.132281e-1913.68989960.9610.1314.295609e-1876GNLY
1.480764e-2202.68327710.8120.0112.030720e-2167FCER1A
1.665286e-211.99242751.0000.513 2.283773e-177HLA-DPB1
7.731180e-2005.02072621.0000.0101.060254e-1958PF4
3.684548e-1105.94433471.0000.0245.052989e-1068PPBP
[32]:
cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
[33]:
options(repr.plot.width=20, repr.plot.height=8)
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
../_images/examples_seurat_pbmc_v0.0.1_44_0.png
[34]:
# you can plot raw counts as well
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
../_images/examples_seurat_pbmc_v0.0.1_45_0.png
[35]:
options(repr.plot.width=20, repr.plot.height=20)
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"))
../_images/examples_seurat_pbmc_v0.0.1_46_0.png
[36]:
options(repr.plot.width=20, repr.plot.height=14)
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
../_images/examples_seurat_pbmc_v0.0.1_47_0.png

Assigning cell type identity to clusters

[37]:
options(repr.plot.width=10, repr.plot.height=7)
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono",
    "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
../_images/examples_seurat_pbmc_v0.0.1_49_0.png
[ ]: