# $Score_g = \sum_{i=1}^n w_i \cdot e^{-\frac{Rank_i-1}{\lambda}}$
# Consensus ranking with exponential decay
# using rank-based metrics to make the result more robust
# log2FC: abundance metric
# p-value: significance (reliability) metric
# pct_query: coverage metric in query cells
# pct_reference: coverage metric in reference cells
# lambda: decay parameter
library(tidyverse)
library(vroom)
trans_rank <- function(r, lambda = 500) {
exp(-(r - 1) / lambda)
}
markers_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/cell_subclass_one_vs_all_markers/cell_subclass_markers.tsv"
# background signal
# for datasets with severely ambient RNA contamination
# which leads to higher background signal (i.e., higher pct_reference)
# you can set a larger epsilon
epsilon <- 0.1
# Lower pct rewards genes excelling in few metrics, e.g., 0.05
# higher pct rewards genes excelling in all metrics, e.g., 0.5
pct <- 0.5
log2fc_cap <- 3
markers_df <- vroom(markers_file)
markers_df <- markers_df %>%
mutate(group = gsub("/", "_", group))
for (g in unique(markers_df$group)) {
g_df <- markers_df %>%
filter(group == g)
L <- nrow(g_df) * pct
g_df <- g_df %>%
mutate(
pct_nz_group_pseudocount = pmax(pct_nz_group, epsilon),
pct_nz_reference_pseudocount = pmax(pct_nz_reference, epsilon),
logfoldchanges_capped = pmin(logfoldchanges, log2fc_cap),
rank_fc = rank(-logfoldchanges_capped, ties.method = "average"),
rank_pval = rank(pvals, ties.method = "average"),
rank_pct_query = rank(-pct_nz_group_pseudocount, ties.method = "average"),
rank_pct_ref = rank(pct_nz_reference_pseudocount, ties.method = "average"),
# rank_pct_query = rank(-pct_nz_group, ties.method = "average"),
# rank_pct_ref = rank(pct_nz_reference, ties.method = "average"),
score_fc = trans_rank(rank_fc, lambda = L),
score_pval = trans_rank(rank_pval, lambda = L),
score_pct_query = trans_rank(rank_pct_query, lambda = L),
score_pct_ref = trans_rank(rank_pct_ref, lambda = L),
consensus_score = score_fc + score_pval + score_pct_query + score_pct_ref,
consensus_rank = rank(-consensus_score, ties.method = "average")
) %>%
arrange(desc(consensus_score))
vroom_write(
g_df,
file = gsub("\\.tsv$", paste0(".ordered.", g, ".pct", pct, ".epsilon", epsilon, ".tsv"), markers_file),
col_names = TRUE,
append = FALSE
)
}1 Main workflow (from raw data to annotation)
1.1 Sort gene markers
1.2 Predict doublets using scDblFinder
# predict doublets using scDblFinder
library(anndataR)
library(scDblFinder)
library(vroom)
library(tidyverse)
# estimate doublet rate using linear interpolation
estimate_doublet_rate <- function(n, ref_df) {
x_ref <- ref_df$n
y_ref <- ref_df$doublet_rate
n_constrained <- pmax(min(x_ref), pmin(max(x_ref), n))
res <- approx(x = x_ref, y = y_ref, xout = n_constrained, method = "linear")
return(res$y)
}
doublet_rate_file <- "/data/database/data/misc/scOmics_doublet_rates/mobi_snRNA-seq_v20260327.tsv"
h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Glioma.filtered_by_lower_bounds_for_doublet_detection.h5ad"
doublet_rate_df <- vroom(doublet_rate_file) %>%
mutate(
n = recovered_cells,
doublet_rate = doublet_rate_in_percent / 100
)
adata <- read_h5ad(h5ad_file)
doublet_rate <- estimate_doublet_rate(n = adata$n_obs(), ref_df = doublet_rate_df)
counts <- t(adata$X)
sce <- scDblFinder(counts, dbr = doublet_rate)
res <- as.data.frame(sce@colData)
res$cell_barcode <- rownames(res)
vroom_write(
res,
paste0(h5ad_file, ".scDblFinder_res.tsv"),
col_names = TRUE,
append = FALSE,
delim = "\t"
)1.3 Run Seurat SCTransform
# perform Seurat SCTransform
library(anndataR)
library(Seurat)
library(tidyverse)
library(YRUtils)
library(vroom)
h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/filtered_by_upper_bounds_for_sctransform.h5ad"
# set continuous variables to regress
# regressing out sequencing depth again is not recommended
# as it can lead to overfitting
# SCTransform is designed to handle sequencing depth
vars_to_regress <- c("total_counts", "pct_counts_MT", "pct_counts_RIBO")
n_hvgs <- 3000
scheme <- "seurat-sctransform.full-regression"
batch_cols <- c("sample")
seu <- read_h5ad(h5ad_file, as = "Seurat")
# save raw counts in `X` layer to `counts` layer
LayerData(seu, assay = "RNA", layer = "counts") <- LayerData(seu, assay = "RNA", layer = "X")
# set `counts` as default layer
DefaultLayer(seu[["RNA"]]) <- "counts"
# remove `X` layer
LayerData(seu, assay = "RNA", layer = "X") <- NULL
seu <- SCTransform(
seu,
batch_var = batch_cols,
vars.to.regress = vars_to_regress,
variable.features.n = n_hvgs,
do.correct.umi = TRUE,
do.scale = FALSE,
do.center = TRUE,
vst.flavor = "v2",
return.only.var.genes = FALSE
)
DefaultAssay(seu) <- "RNA"
# save all layers in `SCT` assay to `RNA` assay
LayerData(seu, assay = "RNA", layer = "sct.corrected.counts") <- LayerData(seu, assay = "SCT", layer = "counts")
LayerData(seu, assay = "RNA", layer = "sct.corrected.data") <- LayerData(seu, assay = "SCT", layer = "data")
LayerData(seu, assay = "RNA", layer = "sct.corrected.scale.data") <- LayerData(seu, assay = "SCT", layer = "scale.data")
# mark highly variable features
seu[["RNA"]]@meta.data[["highly_variable"]] <- seu[["RNA"]]@meta.data[["gene_name"]] %in% seu[["SCT"]]@var.features
adata <- as_AnnData(
seu,
assay_name = "RNA",
x_mapping = "corrected.scale.data",
layers_mapping = names(seu[["RNA"]]@layers),
obs_mapping = names(seu@meta.data),
var_mapping = names(seu[["RNA"]]@meta.data)
)
adata$uns$highly_variable_features <- seu[["SCT"]]@var.features
adata$write_h5ad(paste0(h5ad_file, ".", scheme, ".h5ad"))
noise_genes <- seu[["SCT"]]@var.features[grepl(seu[["SCT"]]@var.features, pattern = "^(mt-|Rps|Rpl)", ignore.case = TRUE)]
# plot highly variable feature distribution
# using mean-variance relationship
DefaultAssay(seu) <- "SCT"
p <- VariableFeaturePlot(seu, log = TRUE) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey") +
scale_y_log10() +
labs(
x = "log10(Geometric Mean of Expression)",
y = "log10(Residual Variance)",
title = paste0("Highly Variable Genes - ", n_hvgs - length(noise_genes), "/", n_hvgs)
)
ppreview(
plot = p,
file = paste0(h5ad_file, ".", scheme, ".highly_variable_genes.pdf"),
width = 600,
height = 350
)
df <- p$data
df$gene_name <- row.names(df)
vroom_write(
df,
file = paste0(h5ad_file, ".", scheme, ".highly_variable_genes.tsv"),
delim = "\t",
col_names = TRUE,
append = FALSE
)1.4 Main workflow
# %%
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import gc
from pathlib import Path
import scanpy as sc
import anndata as ad
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
root_dir = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res"
adata_files = {
"Astro": "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176043-250818-Astro_20250927_221710/JZ25176043-250818-Astro_outs/JZ25176043-250818-Astro_filtered.h5ad",
"Glioma": "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/quant_res/JZ25176044-250818-Glioma_20250928_004852/JZ25176044-250818-Glioma_outs/JZ25176044-250818-Glioma_filtered.h5ad",
}
# %%
work_dir = Path(root_dir, "_".join(adata_files.keys()))
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %% Satge 1:
# Filter cells and genes with lower bounds
# Prepare for doublet detection
# adatas = {}
# for k, v in adata_files.items():
# adata_tmp = sc.read_h5ad(v)
# adata_tmp.var = adata_tmp.var.assign(gene_name=adata_tmp.var.index).rename(
# columns={"gene_ids": "gene_id"}
# )
# adata_tmp.obs = adata_tmp.obs.assign(cell_barcode=adata_tmp.obs.index).rename(
# columns={"n_count": "orig.total_counts"}
# )
# adatas[k] = adata_tmp
# adata = ad.concat(adatas, label="sample", join="inner", merge="same", index_unique="_")
# del adatas
# %%
sample = "Astro"
# %%
# adata = sc.read_h5ad(adata_files[sample])
# %%
adata.obs["sample"] = sample
adata.var = adata.var.assign(gene_name=adata.var.index).rename(
columns={"gene_ids": "gene_id"}
)
adata.obs = adata.obs.assign(cell_barcode=adata.obs.index).rename(
columns={"n_count": "orig.total_counts"}
)
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
print(adata.var_names[adata.var_names.str.startswith("mt-")])
print(adata.var_names[adata.var_names.str.startswith(("Rps", "Rpl"))])
print(adata.var_names[adata.var_names.str.contains("^Hb[^(p)]")])
# %%
adata.var["MT"] = adata.var_names.str.contains("^mt-", case=False, na=False)
adata.var["RIBO"] = adata.var_names.str.contains("^(Rps|Rpl)", case=False, na=False)
adata.var["HB"] = adata.var_names.str.contains("^Hb[^(p)]", case=False, na=False)
# %%
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"])
# %%
sc.set_figure_params(figsize=(6, 6))
sc.pl.scatter(adata, x="total_counts", y="pct_counts_MT")
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts")
# %%
sc.set_figure_params(figsize=(4, 4))
sc.pl.violin(adata, "n_genes_by_counts", jitter=0.4)
sc.pl.violin(adata, "total_counts", jitter=0.4)
sc.pl.violin(adata, "pct_counts_MT", jitter=0.4)
sc.pl.violin(adata, "pct_counts_RIBO", jitter=0.4)
sc.pl.violin(adata, "pct_counts_HB", jitter=0.4)
# %% evaluate cell quality based on lower bounds
min_counts = 500
min_genes = 250
min_cells = 10
max_mt_pct = 10
n_cells = adata.shape[0]
n_genes = adata.shape[1]
min_total_counts = min(adata.obs["total_counts"])
min_gene_number = min(adata.obs["n_genes_by_counts"])
n_cells_left_min_counts = sum(adata.obs["total_counts"] >= min_counts)
n_cells_left_min_genes = sum(adata.obs["n_genes_by_counts"] >= min_genes)
n_cells_left_mt = sum(adata.obs["pct_counts_MT"] <= max_mt_pct)
n_genes_left_min_cells = sum(adata.var["n_cells_by_counts"] >= min_cells)
print(
f"""
Total cells: {n_cells}
Total genes: {n_genes}
Minimum total counts: {min_total_counts}
Minimum gene number: {min_gene_number}
% cells with total counts >= {min_counts} [{n_cells-n_cells_left_min_counts} filtered]: {n_cells_left_min_counts/n_cells * 100}
% cells with gene number >= {min_genes} [{n_cells-n_cells_left_min_genes} filtered]: {n_cells_left_min_genes/n_cells * 100}
% cells with MT percent <= {max_mt_pct} [{n_cells-n_cells_left_mt} filtered]: {n_cells_left_mt/n_cells * 100}
# genes detected in >= {min_cells} cells [{n_genes-n_genes_left_min_cells} filtered]: {n_genes_left_min_cells}
"""
)
# %% only filtering low-quality cells and genes
# do not filter potential doublets
# prepare for doublet detection
adata = adata[adata.obs["total_counts"] >= min_counts, :]
adata = adata[adata.obs["n_genes_by_counts"] >= min_genes, :]
adata = adata[adata.obs["pct_counts_MT"] <= max_mt_pct, :]
adata = adata[:, adata.var["n_cells_by_counts"] >= min_cells]
adata.shape
# %%
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"])
# %% Stage 2:
# Filter cells and genes with upper bounds
# Remove doublets (optional)
# Prepare for Seurat SCTransform
adata_out_file = f"{sample}.filtered_by_lower_bounds_for_doublet_detection.h5ad"
# adata.write(adata_out_file)
# adata = sc.read_h5ad(adata_out_file)
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %% import doublet detection results of scDblFinder
import pandas as pd
doublet_res = pd.read_csv(adata_out_file + ".scDblFinder_res.tsv", sep="\t")
doublet_res = doublet_res.set_index("cell_barcode")
doublet_res = doublet_res[
[c for c in doublet_res.columns if c.startswith("scDblFinder.")]
]
print(
f"""
# cells in total: {adata.shape[0]}
# doublets detected by scDblFinder: {sum(doublet_res["scDblFinder.class"] == "doublet")}
% doublets detected by scDblFinder: {sum(doublet_res["scDblFinder.class"] == "doublet")/doublet_res.shape[0] * 100}
"""
)
adata_obs_orig = adata.obs.copy()
adata.obs = adata.obs.join(doublet_res, how="left")
adata.obs["scDblFinder.class"] = adata.obs["scDblFinder.class"].astype("category")
assert adata.obs.index.equals(adata_obs_orig.index)
# %%
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"])
# %%
sc.set_figure_params(figsize=(6, 6))
sc.pl.scatter(adata, x="total_counts", y="pct_counts_MT")
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts")
# %%
sc.set_figure_params(figsize=(4, 4))
sc.pl.violin(adata, "n_genes_by_counts", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "total_counts", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "pct_counts_MT", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "pct_counts_RIBO", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "pct_counts_HB", groupby="scDblFinder.class", jitter=0.4)
# %% filter cells based on upper bounds
max_counts = 15000
max_genes = 5000
max_mt_pct = 10
min_cells = 10
print(
f"""
filtered by total counts > {max_counts}: {sum(adata.obs["total_counts"]>max_counts)}
filtered by gene number > {max_genes}: {sum(adata.obs["n_genes_by_counts"]>max_genes)}
filtered by MT percent > {max_mt_pct}: {sum(adata.obs["pct_counts_MT"]>max_mt_pct)}
"""
)
# %%
adata = adata[adata.obs["total_counts"] <= max_counts, :]
adata = adata[adata.obs["n_genes_by_counts"] <= max_genes, :]
adata = adata[adata.obs["pct_counts_MT"] <= max_mt_pct, :]
# optional: depending on your datasets
adata = adata[~(adata.obs["scDblFinder.class"] == "doublet"), :]
adata.shape
# %%
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"])
# %%
adata = adata[:, adata.var["n_cells_by_counts"] >= min_cells]
adata.shape
# %%
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"])
# %% Stage 3:
# Prepare for downstream analysis
adata_out_file = f"{sample}.filtered_by_upper_bounds_for_sctransform.h5ad"
# adata.write(adata_out_file)
# adata = sc.read_h5ad(adata_out_file)
# %% merge all datasets before running Seurat SCTransform
adata_files = {
k: f"{k}.filtered_by_upper_bounds_for_sctransform.h5ad" for k in adata_files.keys()
}
# %%
adatas = {}
for k, v in adata_files.items():
adata_tmp = sc.read_h5ad(v)
adatas[k] = adata_tmp
# %%
set_a = set(adatas["Astro"].var_names)
set_b = set(adatas["Glioma"].var_names)
len_a = len(set_a)
len_b = len(set_b)
union_set = len(set_a | set_b)
intersect_set = len(set_a & set_b)
sym_diff = len(set_a ^ set_b)
print(
f"""
# of genes in A: {len_a}
# of genes in B: {len_b}
# of overlapping genes: {intersect_set}
# of unique genes: {union_set}
# of different genes: {sym_diff}
"""
)
# %%
import pandas as pd
var_dfs = []
for a in adatas.values():
tmp_df = a.var[["gene_name", "gene_id"]].copy()
var_dfs.append(tmp_df)
var_df = pd.concat(var_dfs).drop_duplicates()
var_df = var_df[~var_df.index.duplicated(keep="first")]
# %% merge AnnData objects
adata = ad.concat(adatas, axis="obs", join="outer", merge="same", index_unique="_")
# %%
adata.var = adata.var.join(var_df, how="left")
# %%
del adatas
gc.collect()
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
print(adata.var_names[adata.var_names.str.startswith("mt-")])
print(adata.var_names[adata.var_names.str.startswith(("Rps", "Rpl"))])
print(adata.var_names[adata.var_names.str.contains("^Hb[^(p)]")])
# %%
adata.var["MT"] = adata.var_names.str.contains("^mt-", case=False, na=False)
adata.var["RIBO"] = adata.var_names.str.contains("^(Rps|Rpl)", case=False, na=False)
adata.var["HB"] = adata.var_names.str.contains("^Hb[^(p)]", case=False, na=False)
# %%
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"])
# %%
sc.set_figure_params(figsize=(6, 6))
sc.pl.scatter(adata, x="total_counts", y="pct_counts_MT")
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts")
# %%
sc.set_figure_params(figsize=(4, 4))
sc.pl.violin(adata, "n_genes_by_counts", jitter=0.4)
sc.pl.violin(adata, "total_counts", jitter=0.4)
sc.pl.violin(adata, "pct_counts_MT", jitter=0.4)
sc.pl.violin(adata, "pct_counts_RIBO", jitter=0.4)
sc.pl.violin(adata, "pct_counts_HB", jitter=0.4)
# %%
adata = adata[:, adata.var["n_cells_by_counts"] >= 10]
adata.shape
# %%
rsc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"])
# %%
adata_out_file = "filtered_by_upper_bounds_for_sctransform.h5ad"
# adata.write(adata_out_file)
# adata = sc.read_h5ad(adata_out_file)
# %%
adata_file = "filtered_by_upper_bounds_for_sctransform.h5ad.seurat-sctransform.full-regression.h5ad"
# adata = sc.read_h5ad(adata_file)
# %%
adata.X = adata.layers["counts"].copy()
# %%
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# %%
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
adata.X = adata.layers["sct.corrected.scale.data"].copy()
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %%
rsc.get.anndata_to_GPU(adata)
# # %%
# adata.layers["counts"] = adata.X.copy()
# # %%
# rsc.pp.normalize_total(adata, target_sum=1e4)
# rsc.pp.log1p(adata)
# # %%
# adata.layers["data"] = adata.X.copy()
# # %%
# adata.raw = adata
# # %%
# nhvgs = 2000
# rsc.pp.highly_variable_genes(
# adata, n_top_genes=nhvgs, flavor="seurat_v3", layer="counts"
# )
# # %%
# import re
# noise_genes = [
# g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
# ]
# display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# # %%
# adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
# print(
# f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
# )
# # %%
# sc.set_figure_params(figsize=(8, 5))
# sc.pl.highly_variable_genes(adata, log=True, show=False)
# fig = plt.gcf()
# fig.suptitle(
# f"Highly Variable Genes - {sum(adata.var['highly_variable'])}/{nhvgs}",
# fontsize=20,
# y=1,
# )
# fig.savefig("highly_variable_genes.pdf", dpi=600, bbox_inches="tight")
# plt.show()
# # %%
# rsc.pp.regress_out(adata, keys=["total_counts", "pct_counts_MT", "pct_counts_RIBO"])
# # %%
# rsc.pp.scale(adata, max_value=10)
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100, show=False)
fig = plt.gcf()
fig.savefig("pca_variance_ratio.original.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save("pca_variance_ratio.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save("pc_tech_correlations.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save("tech_correlations.pdf", dpi=600)
p
# %% PC loading heatmap
import pandas as pd
import numpy as np
from plotnine import *
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
def pc_loading_heatmap(
data,
pc_loadings,
pc_embeddings,
pc_num,
n_genes=20,
n_cells=500,
figsize=(8, 6),
):
# select top/bottom gene loadings
loadings = pc_loadings[pc_num]
top_genes = loadings.sort_values(ascending=False).head(n_genes).index.tolist()
bottom_genes = (
loadings.sort_values(ascending=True).head(n_genes).index[::-1].tolist()
)
gene_order = top_genes + bottom_genes
# select top/bottom cell embeddings
pc_scores = pc_embeddings[pc_num]
top_cells = pc_scores.sort_values(ascending=False).head(n_cells).index.tolist()
bottom_cells = pc_scores.sort_values(ascending=True).head(n_cells).index.tolist()
cell_order = top_cells + bottom_cells
# log-normalized expression data
expr_mat = data.loc[cell_order, gene_order].copy()
m = expr_mat.mean(axis=0)
s = expr_mat.std(axis=0) + 1e-9
# centered and scaled
expr_df = ((expr_mat - m) / s).clip(lower=-2, upper=2)
# convert to long format
df_plot = expr_df.reset_index().melt(
id_vars="index", var_name="Gene", value_name="Zscore"
)
df_plot.columns = ["Cell", "Gene", "Zscore"]
df_plot["Cell"] = pd.Categorical(df_plot["Cell"], categories=cell_order)
df_plot["Gene"] = pd.Categorical(df_plot["Gene"], categories=gene_order[::-1])
df_plot["Cell_Group"] = np.where(
df_plot["Cell"].isin(top_cells), "Top Cells", "Bottom Cells"
)
df_plot["Gene_Group"] = np.where(
df_plot["Gene"].isin(top_genes), "Top Loadings", "Bottom Loadings"
)
df_plot["Gene_Group"] = pd.Categorical(
df_plot["Gene_Group"], categories=["Top Loadings", "Bottom Loadings"]
)
df_plot["Cell_Group"] = pd.Categorical(
df_plot["Cell_Group"], categories=["Top Cells", "Bottom Cells"]
)
p = (
ggplot(df_plot, aes(x="Cell", y="Gene", fill="Zscore"))
+ geom_tile()
+ scale_fill_gradient2(
low="#4575b4", mid="white", high="#d73027", midpoint=0, limits=[-2, 2]
)
+ facet_grid("Gene_Group ~ Cell_Group", scales="free", space="free")
+ theme_minimal()
+ labs(
title=f"{pc_num}",
x="Cells (ranked by score)",
y="Genes (ranked by loading)",
)
+ theme(
figure_size=figsize,
axis_text_x=element_blank(),
axis_ticks_major_x=element_blank(),
panel_grid=element_blank(),
strip_background=element_rect(fill="white", color=None),
strip_text=element_text(size=10, face="bold"),
)
)
return p
pcs = range(0, 100)
data_layer = "data"
pc_loadings = pd.DataFrame(
adata.varm["PCs"][:, pcs].copy(),
index=adata.var_names.copy(),
columns=[f"PC{i+1}" for i in pcs],
)
pc_embeddings = pd.DataFrame(
adata.obsm["X_pca"][:, pcs].copy(),
index=adata.obs_names.copy(),
columns=[f"PC{i+1}" for i in pcs],
)
data = adata.layers[data_layer].copy()
if hasattr(data, "get"):
data = data.get()
if hasattr(data, "toarray"):
data = data.toarray()
data = pd.DataFrame(
data,
index=adata.obs_names.copy(),
columns=adata.var_names.copy(),
)
with PdfPages("pc_loading_heatmap.pdf") as pdf:
for pn in pcs:
p = pc_loading_heatmap(
data=data,
pc_loadings=pc_loadings,
pc_embeddings=pc_embeddings,
pc_num=f"PC{pn+1}",
n_genes=20,
n_cells=500,
)
fig = p.draw(show=False)
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
del p, fig
gc.collect()
# %% test different PC numbers for UMAP
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
pcs = range(20, 101)
# here we set a resolution
# leading to over-clustering
resolution = 1
resolution_str = f"leiden_res{resolution}".replace(".", "_")
sc.set_figure_params(figsize=(8, 8))
with PdfPages(f"pc{min(pcs)}-{max(pcs)}.{resolution_str}.umap.pdf") as pdf:
for pn in pcs:
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=pn,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
)
rsc.tl.umap(adata, neighbors_key="neighbors", key_added="X_umap")
rsc.tl.leiden(
adata,
resolution=resolution,
key_added=resolution_str,
neighbors_key="neighbors",
)
sc.pl.umap(
adata,
color=resolution_str,
legend_loc="on data",
palette="tab20",
show=False,
title=f"{pn} PCs ({resolution_str})",
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
del fig
gc.collect()
# %%
adata_out_file = "proceed2pca.h5ad"
# adata.write(adata_out_file)
# adata = sc.read_h5ad(adata_out_file)
# %%
npcs = 34
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
)
# %%
rsc.tl.umap(adata, neighbors_key="neighbors", key_added="X_umap")
# %%
rsc.tl.tsne(adata, n_pcs=npcs, use_rep="X_pca", key_added="X_tsne")
# %%
resolutions = np.round(np.arange(0.2, 2.1, 0.1), 1)
for resolution in resolutions:
rsc.tl.leiden(
adata,
resolution=resolution,
key_added=f"leiden_res{resolution}".replace(".", "_"),
neighbors_key="neighbors",
)
# %%
adata_out_file = "proceed2umap.h5ad"
# adata.write(adata_out_file)
# adata = sc.read_h5ad(adata_out_file)
# %%
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
target_cols = sorted(
list(adata.obs.columns[adata.obs.columns.str.contains(r"^leiden_res")])
)
sc.set_figure_params(figsize=(8, 8))
with PdfPages(f"leiden_res.umap.pdf") as pdf:
for target_col in target_cols:
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
del fig
gc.collect()
# %%
group_keys = [
"leiden_res0_7",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
"sample",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig("tech_check.umap.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_column = "leiden_res0_7"
sc.set_figure_params(figsize=(25, 8))
sc.pl.violin(adata, "n_genes_by_counts", jitter=0.4, groupby=group_column, rotation=0)
sc.pl.violin(adata, "total_counts", jitter=0.4, groupby=group_column, rotation=0)
sc.pl.violin(adata, "pct_counts_MT", jitter=0.4, groupby=group_column, rotation=0)
sc.pl.violin(adata, "pct_counts_RIBO", jitter=0.4, groupby=group_column, rotation=0)
sc.pl.violin(adata, "pct_counts_HB", jitter=0.4, groupby=group_column, rotation=0)
# %%
import pandas as pd
markers_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/adult_mm.hippocampus.gene_markers.tsv"
name_mapping = {v.lower(): v for v in adata.var_names}
markers_df = pd.read_csv(markers_file, sep="\t")
markers_df["marker"] = markers_df["marker"].str.split("/")
markers_df = markers_df.explode("marker")
markers_df["marker"] = markers_df["marker"].str.strip()
markers_df["var_name"] = markers_df["marker"].str.lower().map(name_mapping)
missing_markers = markers_df[markers_df["var_name"].isna()]["marker"].unique()
print(f"Missing markers: {missing_markers}")
markers_df = markers_df.dropna(subset=["var_name"])
markers_dict = (
markers_df.groupby("cell_type")["var_name"]
.apply(lambda x: list(x.unique()))
.to_dict()
)
markers_dict = {
k: markers_dict[k]
for k in markers_df["cell_type"].unique()
if k in markers_dict.keys()
}
all_markers = list(set([m for sublist in markers_dict.values() for m in sublist]))
# %%
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
npcs = 34
target_cols = sorted(
list(adata.obs.columns[adata.obs.columns.str.contains(r"^leiden_res")])
)
sc.set_figure_params(figsize=(8, 8))
with PdfPages(f"leiden_res.canonical_gene_markers.dotplot.pdf") as pdf:
for target_col in target_cols:
sc.tl.dendrogram(
adata,
groupby=target_col,
# var_names=all_markers,
optimal_ordering=True,
use_rep="X_pca",
n_pcs=npcs,
)
sc.pl.dotplot(
adata,
var_names=markers_dict,
groupby=target_col,
dendrogram=True,
layer="data",
standard_scale="var",
show=False,
linewidth=0,
title=target_col,
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
del fig
gc.collect()
# %% identify marker genes for each cluster (one vs. all)
target_col = "leiden_res0_7"
sc.tl.rank_genes_groups(
adata,
groupby=target_col,
layer="data",
use_raw=False,
method="wilcoxon",
pts=True,
tie_correct=True,
groups="all",
reference="rest",
corr_method="benjamini-hochberg",
)
# %% extract marker genes as a data frame (one vs. all)
import math
target_col = "leiden_res0_7"
markers_df = sc.get.rank_genes_groups_df(
adata,
group=None,
key="rank_genes_groups",
pval_cutoff=1,
log2fc_min=0,
log2fc_max=math.inf,
gene_symbols="gene_id",
)
out_dir = f"{target_col}_one_vs_all_markers"
os.makedirs(out_dir, exist_ok=True)
markers_df.to_csv(f"{out_dir}/{target_col}_markers.tsv", sep="\t", index=False)
# %%
adata_out_file = "processed.h5ad"
# adata.write(adata_out_file)
# adata = sc.read_h5ad(adata_out_file)
# %%
import numpy as np
import pandas as pd
import re
from scipy.cluster.hierarchy import to_tree
def get_siblings(tree, siblings, categories):
if tree.is_leaf():
return
left = tree.get_left()
right = tree.get_right()
if left.is_leaf() and right.is_leaf():
siblings.append(
{
"left_id": categories[left.id],
"right_id": categories[right.id],
"distance": tree.dist,
"count": tree.count,
}
)
get_siblings(left, siblings, categories)
get_siblings(right, siblings, categories)
target_col = "leiden_res0_7"
categories = adata.obs[target_col].cat.categories
linkage_matrix = adata.uns[f"dendrogram_{target_col}"]["linkage"]
tree = to_tree(linkage_matrix)
siblings = []
get_siblings(tree, siblings, categories)
siblings_df = pd.DataFrame(siblings).sort_values("distance")
siblings_df.to_csv(f"{target_col}_siblings.tsv", sep="\t", index=False)
# %%
def trans_rank(r, lmbda=500):
return np.exp(-(r - 1) / lmbda)
# ordering parameters
epsilon = 0.1
log2fc_cap = 3
pct_thresholds = [0.05, 0.5]
target_col = "leiden_res0_7"
clusters = sorted(list(adata.obs[target_col].unique()))
for c1 in clusters:
out_dir = f"{target_col}_c{c1}_pairwise_markers"
os.makedirs(out_dir, exist_ok=True)
remaining_clusters = [c for c in clusters if c != c1]
for c2 in remaining_clusters:
sc.tl.rank_genes_groups(
adata,
groupby=target_col,
layer="data",
use_raw=False,
method="wilcoxon",
pts=True,
tie_correct=True,
groups=[c1],
reference=c2,
corr_method="benjamini-hochberg",
)
pairwise_markers_df = sc.get.rank_genes_groups_df(
adata,
group=None,
key="rank_genes_groups",
pval_cutoff=1,
log2fc_min=0,
log2fc_max=math.inf,
gene_symbols="gene_id",
)
pairwise_markers_df["pct_nz_reference"] = pairwise_markers_df["names"].map(
adata.uns["rank_genes_groups"]["pts"][c2]
)
for pct_threshold in pct_thresholds:
tmp_df = pairwise_markers_df.copy()
tmp_df["logfoldchanges_capped"] = tmp_df["logfoldchanges"].clip(
upper=log2fc_cap
)
tmp_df["pct_nz_group_pseudo"] = tmp_df["pct_nz_group"].clip(lower=epsilon)
tmp_df["pct_nz_reference_pseudo"] = tmp_df["pct_nz_reference"].clip(
lower=epsilon
)
L = len(tmp_df) * pct_threshold
tmp_df["rank_fc"] = tmp_df["logfoldchanges_capped"].rank(
ascending=False, method="average"
)
tmp_df["rank_pval"] = tmp_df["pvals"].rank(ascending=True, method="average")
tmp_df["rank_pct_query"] = tmp_df["pct_nz_group_pseudo"].rank(
ascending=False, method="average"
)
tmp_df["rank_pct_ref"] = tmp_df["pct_nz_reference_pseudo"].rank(
ascending=True, method="average"
)
tmp_df["score_fc"] = trans_rank(tmp_df["rank_fc"], lmbda=L)
tmp_df["score_pval"] = trans_rank(tmp_df["rank_pval"], lmbda=L)
tmp_df["score_pct_query"] = trans_rank(tmp_df["rank_pct_query"], lmbda=L)
tmp_df["score_pct_ref"] = trans_rank(tmp_df["rank_pct_ref"], lmbda=L)
tmp_df["consensus_score"] = (
tmp_df["score_fc"]
+ tmp_df["score_pval"]
+ tmp_df["score_pct_query"]
+ tmp_df["score_pct_ref"]
)
tmp_df = tmp_df.sort_values("consensus_score", ascending=False).reset_index(
drop=True
)
tmp_df.to_csv(
f"{out_dir}/{c1}_vs_{c2}_pct{pct_threshold}.markers.tsv",
sep="\t",
index=False,
)
del pairwise_markers_df, tmp_df
gc.collect()
# %% annotate cell types
work_dir = "Annotated"
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
target_col = "leiden_res0_7"
class_dict = {
"15": "OLs",
"17": "Pericytes",
"11": "VLMCs",
"20": "CP",
"19": "Ependymals",
"12": "Astrocytes",
"7": "Microglia",
"18": "OPC",
"16": "CR",
"13": "IN_Vip",
"8": "IN_Sst",
"9": "IN_Sst",
"2": "DG EN",
"10": "CA1 EN",
"3": "CA1 EN",
"14": "CA1 EN",
"1": "CA3 EN",
"6": "Proximal Sub EN",
"4": "Superficial Sub EN",
"5": "Deep Sub EN",
"0": "PLN",
}
if set(adata.obs[target_col].unique()) == set(class_dict.keys()):
print("OK")
else:
print("NO")
# %%
adata.obs["cell_subclass"] = adata.obs[target_col].map(class_dict)
# %% identify marker genes for each cluster (one vs. all)
target_col = "cell_subclass"
sc.tl.rank_genes_groups(
adata,
groupby=target_col,
layer="data",
use_raw=False,
method="wilcoxon",
pts=True,
tie_correct=True,
groups="all",
reference="rest",
corr_method="benjamini-hochberg",
)
# %% extract marker genes as a data frame (one vs. all)
import math
target_col = "cell_subclass"
markers_df = sc.get.rank_genes_groups_df(
adata,
group=None,
key="rank_genes_groups",
pval_cutoff=1,
log2fc_min=0,
log2fc_max=math.inf,
gene_symbols="gene_id",
)
out_dir = f"{target_col}_one_vs_all_markers"
os.makedirs(out_dir, exist_ok=True)
markers_df.to_csv(f"{out_dir}/{target_col}_markers.tsv", sep="\t", index=False)
# %%
def trans_rank(r, lmbda=500):
return np.exp(-(r - 1) / lmbda)
# ordering parameters
epsilon = 0.1
log2fc_cap = 3
pct_thresholds = [0.05, 0.5]
target_col = "cell_subclass"
clusters = sorted(list(adata.obs[target_col].unique()))
for c1 in clusters:
out_dir = f"{target_col}_c{c1.replace("/", "_")}_pairwise_markers"
os.makedirs(out_dir, exist_ok=True)
remaining_clusters = [c for c in clusters if c != c1]
for c2 in remaining_clusters:
sc.tl.rank_genes_groups(
adata,
groupby=target_col,
layer="data",
use_raw=False,
method="wilcoxon",
pts=True,
tie_correct=True,
groups=[c1],
reference=c2,
corr_method="benjamini-hochberg",
)
pairwise_markers_df = sc.get.rank_genes_groups_df(
adata,
group=None,
key="rank_genes_groups",
pval_cutoff=1,
log2fc_min=0,
log2fc_max=math.inf,
gene_symbols="gene_id",
)
pairwise_markers_df["pct_nz_reference"] = pairwise_markers_df["names"].map(
adata.uns["rank_genes_groups"]["pts"][c2]
)
for pct_threshold in pct_thresholds:
tmp_df = pairwise_markers_df.copy()
tmp_df["logfoldchanges_capped"] = tmp_df["logfoldchanges"].clip(
upper=log2fc_cap
)
tmp_df["pct_nz_group_pseudo"] = tmp_df["pct_nz_group"].clip(lower=epsilon)
tmp_df["pct_nz_reference_pseudo"] = tmp_df["pct_nz_reference"].clip(
lower=epsilon
)
L = len(tmp_df) * pct_threshold
tmp_df["rank_fc"] = tmp_df["logfoldchanges_capped"].rank(
ascending=False, method="average"
)
tmp_df["rank_pval"] = tmp_df["pvals"].rank(ascending=True, method="average")
tmp_df["rank_pct_query"] = tmp_df["pct_nz_group_pseudo"].rank(
ascending=False, method="average"
)
tmp_df["rank_pct_ref"] = tmp_df["pct_nz_reference_pseudo"].rank(
ascending=True, method="average"
)
tmp_df["score_fc"] = trans_rank(tmp_df["rank_fc"], lmbda=L)
tmp_df["score_pval"] = trans_rank(tmp_df["rank_pval"], lmbda=L)
tmp_df["score_pct_query"] = trans_rank(tmp_df["rank_pct_query"], lmbda=L)
tmp_df["score_pct_ref"] = trans_rank(tmp_df["rank_pct_ref"], lmbda=L)
tmp_df["consensus_score"] = (
tmp_df["score_fc"]
+ tmp_df["score_pval"]
+ tmp_df["score_pct_query"]
+ tmp_df["score_pct_ref"]
)
tmp_df = tmp_df.sort_values("consensus_score", ascending=False).reset_index(
drop=True
)
tmp_df.to_csv(
f"{out_dir}/{c1.replace("/", "_")}_vs_{c2.replace("/", "_")}_pct{pct_threshold}.markers.tsv",
sep="\t",
index=False,
)
del pairwise_markers_df, tmp_df
gc.collect()
# %%
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=34,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
rsc.tl.umap(
adata, min_dist=0.5, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
sc.set_figure_params(figsize=(5, 5))
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
target_col = "cell_subclass"
ax = sc.pl.embedding(
adata,
basis="X_umap",
color=target_col,
legend_loc="right margin",
show=False,
legend_fontsize=7,
legend_fontweight="normal",
legend_fontoutline=2,
palette="tab20",
)
ax.set_aspect("equal", adjustable="box")
ax.axis("on")
ax.grid(False)
ax.tick_params(
axis="both",
which="major",
labelbottom=True,
labelleft=True,
bottom=True,
left=True,
labelsize=10,
)
ax.xaxis.set_major_locator(MultipleLocator(5))
ax.yaxis.set_major_locator(MultipleLocator(5))
fig = plt.gcf()
fig.savefig(target_col + ".umap.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"n_genes_by_counts",
"cell_subclass",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
"sample",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig("cell_subclass.tech_check.umap.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
group_column = "cell_subclass"
sc.set_figure_params(figsize=(6, 3))
with PdfPages(f"{group_column}.qc_violin.pdf") as pdf:
sc.pl.violin(
adata,
"n_genes_by_counts",
jitter=0.4,
groupby=group_column,
rotation=90,
show=False,
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
sc.pl.violin(
adata, "total_counts", jitter=0.4, groupby=group_column, rotation=90, show=False
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
sc.pl.violin(
adata,
"pct_counts_MT",
jitter=0.4,
groupby=group_column,
rotation=90,
show=False,
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
sc.pl.violin(
adata,
"pct_counts_RIBO",
jitter=0.4,
groupby=group_column,
rotation=90,
show=False,
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
sc.pl.violin(
adata,
"pct_counts_HB",
jitter=0.4,
groupby=group_column,
rotation=90,
show=False,
)
fig = plt.gcf()
pdf.savefig(fig, bbox_inches="tight")
plt.close(fig)
# %%
adata_out_file = "annotated_by_cell_subclass.h5ad"
# adata.write(adata_out_file)
# adata = sc.read_h5ad(adata_out_file)
# %%
import pandas as pd
from plotnine import *
import numpy as np
import glob
import re
import os
work_dir = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/cell_subclass_one_vs_all_markers"
marker_file_pattern = "cell_subclass_markers.ordered.*.pct0.05.epsilon0.1.tsv"
top_n = 20
layer = "data"
group = "cell_subclass"
os.chdir(work_dir)
class_counts_df = (
adata.obs[group].value_counts(ascending=False).reset_index(name="count")
)
adata.obs[group] = pd.Categorical(
adata.obs[group], categories=class_counts_df[group].to_list(), ordered=True
)
marker_files = glob.glob(marker_file_pattern)
marker_dict = {k: [] for k in class_counts_df[group].to_list()}
key_mpt_dict = {k.replace("/", "_"): k for k in marker_dict.keys()}
for f in marker_files:
match = re.search(r"\.ordered\.(.+)\.pct", os.path.basename(f))
if match:
cell_type = match.group(1)
df = pd.read_csv(f, sep="\t")
top_genes = df["names"].head(top_n).tolist()
marker_dict[key_mpt_dict[cell_type]] = top_genes
sc.set_figure_params(figsize=(5, 10))
sc.pl.heatmap(
adata,
var_names=marker_dict,
groupby=group,
layer=layer,
use_raw=False,
swap_axes=True,
show_gene_labels=False,
standard_scale="var",
cmap="Oranges",
show=False,
)
fig = plt.gcf()
fig.savefig(
re.sub(
r"\.tsv$", f".top{top_n}.heatmap.pdf", marker_file_pattern.replace(".*.", ".")
),
dpi=600,
bbox_inches="tight",
)
plt.show()1.5 DE analysis
# calculate and visualize cell proportion statistics
# perform DE analysis
library(anndataR)
library(Seurat)
library(tidyverse)
library(YRUtils)
library(vroom)
library(unigd)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis"
setwd(work_dir)
h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/annotated_by_cell_subclass.h5ad"
sample_levels <- c("Glioma", "Astro")
target_cell_types <- c("CA1 EN", "CA3 EN", "Proximal Sub EN", "Superficial Sub EN", "Deep Sub EN", "DG EN", "IN_Sst", "IN_Vip", "PLN", "Astrocytes", "Microglia", "OLs", "OPC")
# seu <- read_h5ad(h5ad_file, as = "Seurat")
# seu.bak <- seu
seu$sample_cellSubclass <- paste0(seu$sample, ".", seu$cell_subclass)
Idents(seu) <- "sample_cellSubclass"
# cell proportion statistics
prop_df <- seu[[]] %>%
select(sample, cell_subclass) %>%
filter(cell_subclass %in% target_cell_types) %>%
group_by(sample, cell_subclass) %>%
reframe(n_cond = n()) %>%
group_by(cell_subclass) %>%
reframe(n_total = sum(n_cond), sample, n_cond) %>%
mutate(
prop_cond = n_cond / n_total * 100,
sample = factor(sample, levels = sample_levels),
cell_subclass = factor(cell_subclass, levels = target_cell_types)
)
vroom_write(
prop_df,
file = "cond_prop_for_each_cell_type.tsv",
col_names = TRUE, append = FALSE
)
gg_pt <- function(x) x / (ggplot2::.pt * 0.75)
p <- ggplot() +
geom_bar(
mapping = aes(cell_subclass, prop_cond, fill = sample),
data = prop_df, stat = "identity", color = NA
) +
labs(x = "Cell types", y = "Number of cells", fill = "Condition") +
scale_y_continuous(expand = expansion(0)) +
theme_classic(base_family = "Arial") +
theme(
axis.title = element_text(size = 12, color = "black"),
axis.text.y = element_text(size = 10, color = "black"),
axis.text.x = element_text(size = 10, color = "black", angle = 45, hjust = 1, vjust = 1),
legend.title = element_text(size = 10, color = "black"),
legend.text = element_text(size = 10, color = "black"),
axis.line = element_line(color = "black", linewidth = gg_pt(1)),
axis.ticks = element_line(color = "black", linewidth = gg_pt(1))
)
ugd_save_inline(
{
print(p)
},
file = "cond_prop_for_each_cell_type.barplot.pdf",
width = 400,
height = 200
)
# DE analysis
de_out_dir <- "de"
dir.create(de_out_dir)
for (cell_type in target_cell_types) {
tmp_prop_df <- prop_df %>%
filter(cell_subclass == cell_type) %>%
mutate(sample_cellSubclass = paste0(sample, ".", cell_subclass))
ident_1 <- tmp_prop_df$sample_cellSubclass[tmp_prop_df$sample == sample_levels[1]]
ident_2 <- tmp_prop_df$sample_cellSubclass[tmp_prop_df$sample == sample_levels[2]]
de_res <- FindMarkers(
seu,
ident.1 = ident_1,
ident.2 = ident_2,
logfc.threshold = 0,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = -Inf,
only.pos = FALSE,
random.seed = 123456
)
de_res <- de_res %>%
mutate(
gene_name = row.names(.),
log2fc_sign = if_else(avg_log2FC >= 0, 1L, -1L),
up_down = if_else(log2fc_sign == 1L, "up", "down"),
up_down = factor(up_down, levels = c("up", "down")),
signed_pval = p_val * log2fc_sign,
) %>%
group_by(up_down) %>%
arrange(up_down, signed_pval)
vroom_write(
de_res,
file = file.path(de_out_dir, paste0(gsub(" +", "_", cell_type), ".", sample_levels[1], "_vs_", sample_levels[2], ".tsv")),
col_names = TRUE, append = FALSE
)
}1.6 GO ORA & GSEA
# perform ORA and GSEA over GO terms for DE results
library(YRUtils)
library(tidyverse)
library(vroom)
library(clusterProfiler)
library(AnnotationDbi)
library(glue)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis"
setwd(work_dir)
orgdb_file <- "/data/database/annotation/release/mus_musculus/org.Mm.eg.db.sqlite"
de_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/de"
padj_th <- 0.1
gsea_output_dir <- "gsea"
go_out_dir <- "go"
dir.create(gsea_output_dir, showWarnings = FALSE)
dir.create(go_out_dir, showWarnings = FALSE)
orgdb <- loadDb(orgdb_file)
de_files <- list.files(de_dir, pattern = "\\.tsv$", full.names = TRUE, recursive = FALSE)
for (de_file in de_files) {
df <- vroom(de_file)
noise_genes <- df$gene_name[grepl("^(mt-|Rps|Rpl|Mrps|Mrpl|Hb[^(p)]|Gm\\d+$|Xist$|Ttr$|Malat1$|Meg3$|Rn\\d+s|Snhg\\d+)", df$gene_name)]
df <- df %>%
filter(!(gene_name %in% noise_genes)) %>%
mutate(signed_log10_pval = -log10(p_val) * sign(avg_log2FC)) %>%
arrange(desc(signed_log10_pval))
vroom_write(
df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".clean.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_list <- setNames(df$signed_log10_pval, df$gene_name)
gse <- gseGO(
geneList = gene_list,
OrgDb = orgdb,
ont = "ALL",
keyType = "SYMBOL",
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
seed = 1234
)
if (!is.null(gse)) {
saveRDS(gse, file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.rds", basename(de_file))))
vroom_write(
gse@result,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
}
tmp_df <- df %>%
filter(p_val_adj < padj_th)
vroom_write(
tmp_df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", glue(".{padj_th}.clean.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_ls <- lapply(split(tmp_df, tmp_df$up_down), function(x) unique(na.omit(x[["gene_name"]])))
ego <- compareCluster(
gene_ls,
OrgDb = orgdb,
keyType = "SYMBOL",
fun = "enrichGO",
ont = "ALL",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
readable = FALSE,
pool = FALSE
)
if (!is.null(ego)) {
saveRDS(ego, file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.rds"), basename(de_file))))
vroom_write(
ego@compareClusterResult,
file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
}
}1.7 GO bar plot
library(tidyverse)
library(vroom)
library(YRUtils)
go_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/go/DG_EN.Glioma_vs_Astro.0.1.oraGO.select_terms.tsv"
go_df <- vroom(go_file)
go_df <- go_df %>%
mutate(Cluster = factor(Cluster, levels = c("down", "up"))) %>%
arrange(Cluster, desc(p.adjust)) %>%
mutate(Description = factor(Description, levels = unique(Description)))
p <- ggplot(go_df, aes(x = -log10(p.adjust), y = Description, fill = Cluster)) +
geom_bar(stat = "identity") +
labs(fill = "Group") +
scale_fill_manual(
labels = c("up" = "Up regulation", "down" = "Down regulation"),
values = c("up" = "#D73027", "down" = "#4575B4")
) +
scale_x_continuous(expand = expansion(0)) +
theme_classic() +
theme(
legend.position = "top",
plot.margin = margin(5, 5, 5, 5, "cm")
)
ppreview(plot = p, file = gsub("\\.tsv$", ".pdf", go_file))1.8 Run pySCENIC
# %% run pySCENIC on the full dataset
import os
import gc
import pandas as pd
import numpy as np
import scanpy as sc
import loompy as lp
from pathlib import Path
# %%
root_dir = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis"
adata_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/annotated_by_cell_subclass.h5ad"
pyscenic_input_loom_path = "pyscenic_input.loom"
# %%
work_dir = Path(root_dir, "pyscenic")
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %% prepare input loom file for pySCENIC
# adata = sc.read_h5ad(adata_file)
# %%
adata.obs["sample_cellSubclass"] = (
adata.obs["sample"].astype(str) + "_" + adata.obs["cell_subclass"].astype(str)
)
obs_df = adata.obs[["sample", "cell_subclass", "sample_cellSubclass"]].reset_index()
obs_df.to_csv("adata_obs.tsv", sep="\t", index=False)
# %%
# adata.X = adata.layers["counts"].copy()
# %%
# row_attrs = {
# "Gene": np.array(adata.var_names),
# }
# col_attrs = {
# "CellID": np.array(adata.obs_names),
# "nGene": np.array(np.sum(adata.X.transpose() > 0, axis=0)).flatten(),
# "nUMI": np.array(np.sum(adata.X.transpose(), axis=0)).flatten(),
# }
# lp.create(pyscenic_input_loom_path, adata.X.transpose(), row_attrs, col_attrs)
# %%
tf_file = "/home/yangrui/softwares/pySCENIC/resources/mm_mgi_tfs.txt"
adj_file = str(Path(os.getcwd(), "adj.csv"))
reg_file = str(Path(os.getcwd(), "reg.csv"))
db_files = "/home/yangrui/softwares/pySCENIC_ranking_databases/mm10_refseq_r80_gene_based/mm10_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather /home/yangrui/softwares/pySCENIC_ranking_databases/mm10_refseq_r80_gene_based/mm10_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather"
motif_file = "/home/yangrui/softwares/pySCENIC_motif2tf_annotations/motifs-v10nr_clust-nr.mgi-m0.001-o0.0.tbl"
pyscenic_output_loom_path = str(Path(os.getcwd(), "pyscenic_output.loom"))
pyscenic_input_loom_path = str(Path(os.getcwd(), pyscenic_input_loom_path))
adata_obs_file = str(Path(os.getcwd(), "adata_obs.tsv"))
# %% GRN inference with GRNBoost2
# using the counts matrix without log transformation or further processing
# output a list of adjacencies between TFs and their targets
# !pyscenic grn {pyscenic_input_loom_path} {tf_file} -o {adj_file} --num_workers 64
# %% regulon prediction with cisTarget
# output a list of adjacencies between TFs and their targets
# !pyscenic ctx {adj_file} {db_files} --annotations_fname {motif_file} --expression_mtx_fname {pyscenic_input_loom_path} --output {reg_file} --mask_dropouts --num_workers 64
# %% cellular enrichment with AUCell
# !pyscenic aucell {pyscenic_input_loom_path} {reg_file} --output {pyscenic_output_loom_path} --num_workers 64
# %% extract results from output loom file
import json
import zlib
import base64
lf = lp.connect(pyscenic_output_loom_path, mode="r", validate=False)
# %%
meta = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData)))
regulon_thresholds_df = pd.json_normalize(meta["regulonThresholds"])
regulon_thresholds_df = regulon_thresholds_df.sort_values("regulon").reset_index(
drop=True
)
regulon_matrix = pd.DataFrame(lf.ra.Regulons)
regulon_matrix.index = lf.ra.Gene
regulon_df = regulon_matrix.stack().reset_index()
regulon_df.columns = ["Target", "TF", "Is_Target"]
regulon_df = regulon_df[regulon_df["Is_Target"] > 0].drop(columns=["Is_Target"])
regulon_df = regulon_df[["TF", "Target"]].reset_index(drop=True)
regulon_df = regulon_df.sort_values(by="TF").reset_index(drop=True)
auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)
lf.close()
# %%
# regulon_thresholds_df.to_csv("regulon_thresholds.tsv", index=False, sep="\t")
# regulon_df.to_csv("regulons.tsv", index=False, sep="\t")
# auc_mtx.to_csv("auc_mtx.tsv", index=True, sep="\t")
# %%
obs_df = pd.read_csv(adata_obs_file, sep="\t")
obs_df = obs_df.set_index("index")
# %% cell type-specific regulons - z-score
zscore_threshold = 1.96
combined_df = auc_mtx.join(obs_df["sample_cellSubclass"], how="left")
regulon_columns = auc_mtx.columns.tolist()
zscore_df = (
combined_df.groupby(by="sample_cellSubclass").mean()
- combined_df[regulon_columns].mean()
) / combined_df[regulon_columns].std()
zscore_df = (
zscore_df.stack().reset_index().rename(columns={"level_1": "regulon", 0: "zscore"})
)
zscore_df = zscore_df.set_index("sample_cellSubclass")
zscore_df = zscore_df.join(
obs_df.set_index("sample_cellSubclass").drop_duplicates(), how="left"
).sort_values(by=["cell_subclass", "sample", "zscore"], ascending=[True, True, False])
zscore_df = zscore_df.reset_index()
top_regulons = zscore_df[zscore_df["zscore"] >= zscore_threshold]
# zscore_df.to_csv("regulon_zscores.tsv", index=False, sep="\t")
# top_regulons.to_csv(
# f"top_regulons.z-score{zscore_threshold}.tsv", index=False, sep="\t"
# )
# %%
from plotnine import *
import pandas as pd
plot_df = top_regulons.copy()
plot_df["sample_cellSubclass"] = pd.Categorical(
plot_df["sample_cellSubclass"], categories=plot_df["sample_cellSubclass"].unique()
)
p = (
ggplot(plot_df, aes(x="regulon", y="sample_cellSubclass", fill="zscore"))
+ geom_tile(color="white", size=0.5)
+ geom_text(aes(label="zscore.round(1)"), size=6)
+ scale_fill_gradientn(colors=["#FFFFD9", "#41B6C4", "#081D58"])
+ theme_minimal()
+ theme(
figure_size=(18, 6),
axis_text_x=element_text(rotation=90, hjust=1),
panel_grid_major=element_blank(),
panel_grid_minor=element_blank(),
axis_title=element_blank(),
)
+ labs(fill="z-score")
)
# p.save(f"cell_type_specific_regulons.z-score{zscore_threshold}.pdf", dpi=600)
p
# %% cell type-specific regulons - rss
from pyscenic.rss import regulon_specificity_scores
import pandas as pd
from plotnine import *
top_n = 3
rss = regulon_specificity_scores(
auc_mtx, obs_df.reindex(auc_mtx.index)["sample_cellSubclass"]
)
rss = rss.join(obs_df.set_index("sample_cellSubclass").drop_duplicates(), how="left")
rss = rss.reset_index().rename(columns={"index": "sample_cellSubclass"})
rss_long = rss.melt(
id_vars=["sample_cellSubclass", "cell_subclass", "sample"],
var_name="regulon",
value_name="rss",
)
rss_long = rss_long.sort_values(
by=["cell_subclass", "sample", "rss"], ascending=[True, True, False]
)
rss_long["rank"] = rss_long.groupby("sample_cellSubclass")["rss"].rank(
ascending=False, method="first"
)
rss_long["label"] = rss_long.apply(
lambda x: x["regulon"] if x["rank"] <= top_n else "", axis=1
)
rss_long.to_csv(f"cell_type_specific_regulons.rss{top_n}.tsv", index=False, sep="\t")
p = (
ggplot(rss_long, aes(x="rank", y="rss", color="sample"))
+ geom_point(size=0.8, alpha=0.6)
+ geom_text(
aes(label="label"),
size=7,
va="bottom",
ha="left",
show_legend=False,
)
+ facet_wrap("~cell_subclass", scales="free", ncol=3)
+ scale_color_manual(values={"Astro": "#3182bd", "Glioma": "#e6550d"})
+ theme_minimal()
+ theme(
figure_size=(16, 20),
panel_spacing=0.01,
strip_text=element_text(size=10, face="bold"),
panel_grid_minor=element_blank(),
legend_position="top",
)
+ labs(
title="Regulon Specificity Score",
x="Regulon Rank",
y="RSS Score",
color="Condition",
)
)
p.save(f"cell_type_specific_regulons.rss{top_n}.pdf", dpi=600)
p1.9 Intersect GO terms with regulons
library(tidyverse)
library(vroom)
regulon_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/pyscenic/regulons.tsv"
go_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/go/DG_EN.Glioma_vs_Astro.0.1.oraGO.select_terms.tsv"
regulon_df <- vroom(regulon_file)
go_df <- vroom(go_file)
go_df <- go_df %>%
select(Cluster, Description, geneID) %>%
separate_longer_delim(cols = geneID, delim = "/")
net_df <- left_join(go_df, regulon_df, by = c("geneID" = "Target")) %>%
na.omit() %>%
arrange(Cluster, Description, geneID) %>%
mutate(TF = gsub("(+)", "", TF, fixed = TRUE)) %>%
select(TF, geneID, Description, Cluster)
node_df <- bind_rows(
net_df %>%
select(geneID, Description, Cluster) %>%
mutate(type = "TG") %>%
distinct(),
net_df %>%
select(TF) %>%
rename(geneID = TF) %>%
distinct() %>%
mutate(
type = "TF",
Description = "TF",
Cluster = "TF"
)
)
vroom_write(net_df, gsub("\\.tsv$", ".regulons.net.tsv", go_file))
vroom_write(node_df, gsub("\\.tsv$", ".regulons.node.tsv", go_file))2 Additional analysis on merged cell types
2.1 DE analysis
# perform DE analysis on merged cell types
library(anndataR)
library(Seurat)
library(tidyverse)
library(YRUtils)
library(vroom)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
setwd(work_dir)
h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/annotated_by_cell_subclass.h5ad"
sample_levels <- c("Glioma", "Astro")
merge_ls <- list(
"CA EN L1" = c("CA1 EN", "CA3 EN"),
"CA EN L2" = c("CA1 EN", "CA3 EN", "Proximal Sub EN", "Superficial Sub EN", "Deep Sub EN"),
"CA EN L3" = c("CA1 EN", "DG EN"),
"CA EN L4" = c("CA1 EN", "CA3 EN", "Proximal Sub EN", "Superficial Sub EN", "Deep Sub EN", "DG EN"),
"IN" = c("IN_Sst", "IN_Vip")
)
# seu <- read_h5ad(h5ad_file, as = "Seurat")
# seu.bak <- seu
seu <- seu.bak
for (k in names(merge_ls)) {
new_col_name <- gsub(" +", "_", k)
v <- merge_ls[[k]]
seu@meta.data[[new_col_name]] <- ifelse(
seu$cell_subclass %in% v, k,
as.character(seu$cell_subclass)
)
}
seu_meta <- seu@meta.data
seu_meta$index <- row.names(seu_meta)
vroom_write(seu_meta, file = "seurat_metadata.tsv", col_names = TRUE, append = FALSE, delim = "\t")
# DE analysis
de_out_dir <- "de_merged"
dir.create(de_out_dir)
for (k in names(merge_ls)) {
col_name <- gsub(" +", "_", k)
seu$de_col <- paste0(seu$sample, ".", seu@meta.data[[col_name]])
Idents(seu) <- "de_col"
ident_1 <- paste0(sample_levels[1], ".", k)
ident_2 <- paste0(sample_levels[2], ".", k)
de_res <- FindMarkers(
seu,
ident.1 = ident_1,
ident.2 = ident_2,
logfc.threshold = 0,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = -Inf,
only.pos = FALSE,
random.seed = 123456
)
de_res <- de_res %>%
mutate(
gene_name = row.names(.),
log2fc_sign = if_else(avg_log2FC >= 0, 1L, -1L),
up_down = if_else(log2fc_sign == 1L, "up", "down"),
up_down = factor(up_down, levels = c("up", "down")),
signed_pval = p_val * log2fc_sign,
) %>%
group_by(up_down) %>%
arrange(up_down, signed_pval)
vroom_write(
de_res,
file = file.path(de_out_dir, paste0(gsub(" +", "_", k), ".", sample_levels[1], "_vs_", sample_levels[2], ".tsv")),
col_names = TRUE, append = FALSE
)
}2.2 GO ORA & GSEA
# perform ORA and GSEA over GO terms for DE results
library(YRUtils)
library(tidyverse)
library(vroom)
library(clusterProfiler)
library(AnnotationDbi)
library(glue)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
setwd(work_dir)
orgdb_file <- "/data/database/annotation/release/mus_musculus/org.Mm.eg.db.sqlite"
de_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/de_merged"
padj_th <- 0.05
gsea_output_dir <- "gsea_merged"
go_out_dir <- "go_merged"
dir.create(gsea_output_dir, showWarnings = FALSE)
dir.create(go_out_dir, showWarnings = FALSE)
orgdb <- loadDb(orgdb_file)
de_files <- list.files(de_dir, pattern = "\\.tsv$", full.names = TRUE, recursive = FALSE)
for (de_file in de_files) {
df <- vroom(de_file)
noise_genes <- df$gene_name[grepl("^(mt-|Rps|Rpl|Mrps|Mrpl|Hb[^(p)]|Gm\\d+$|Xist$|Ttr$|Malat1$|Meg3$|Rn\\d+s|Snhg\\d+)", df$gene_name)]
df <- df %>%
filter(!(gene_name %in% noise_genes)) %>%
mutate(signed_log10_pval = -log10(p_val) * sign(avg_log2FC)) %>%
arrange(desc(signed_log10_pval))
vroom_write(
df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".clean.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_list <- setNames(df$signed_log10_pval, df$gene_name)
gse <- gseGO(
geneList = gene_list,
OrgDb = orgdb,
ont = "ALL",
keyType = "SYMBOL",
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
seed = 1234
)
if (!is.null(gse)) {
saveRDS(gse, file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.rds", basename(de_file))))
vroom_write(
gse@result,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
}
tmp_df <- df %>%
filter(p_val_adj < padj_th)
vroom_write(
tmp_df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", glue(".{padj_th}.clean.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_ls <- lapply(split(tmp_df, tmp_df$up_down), function(x) unique(na.omit(x[["gene_name"]])))
ego <- compareCluster(
gene_ls,
OrgDb = orgdb,
keyType = "SYMBOL",
fun = "enrichGO",
ont = "ALL",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
readable = FALSE,
pool = FALSE
)
if (!is.null(ego)) {
saveRDS(ego, file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.rds"), basename(de_file))))
vroom_write(
ego@compareClusterResult,
file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
}
}# extract genes belonging to select GO terms
library(tidyverse)
library(vroom)
file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/go_merged/CA_EN_L3.Glioma_vs_Astro.0.05.oraGO.select_syn-imm_terms.tsv"
df <- vroom(file)
df <- df %>%
select(Cluster, geneID) %>%
separate_longer_delim(cols = "geneID", delim = "/") %>%
distinct() %>%
arrange(Cluster)
vroom_write(df, file = gsub("\\.tsv$", ".only_genes.tsv", file), col_names = TRUE, append = FALSE, delim = "\t")2.3 Intersect top regulons with GO terms
# intersect top regulons with GO terms
library(tidyverse)
library(vroom)
regulon_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/pyscenic/regulons.tsv"
rss_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/pyscenic_merged/cell_type_specific_regulons.rss5.tsv"
go_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/go_merged/CA_EN_L3.Glioma_vs_Astro.0.05.oraGO.select_syn-imm_terms.tsv"
rss_df <- vroom(rss_file)
rss_df <- rss_df %>%
select(CA_EN_L3, sample, regulon, rss) %>%
filter(CA_EN_L3 %in% c("CA EN L3")) %>%
pivot_wider(names_from = "sample", values_from = "rss", values_fill = 0) %>%
mutate(
rss_diff = Astro - Glioma,
rss_diff_abs = abs(rss_diff),
rss_diff_abs_zscore = (rss_diff_abs - mean(rss_diff_abs)) / sd(rss_diff_abs)
) %>%
arrange(desc(rss_diff_abs_zscore))
vroom_write(rss_df, file = file.path(dirname(go_file), "CA_EN_L3.ordered_rss.tsv"), col_names = TRUE, append = FALSE, delim = "\t")
ggplot(rss_df, aes(rss_diff_abs_zscore)) +
geom_density() +
theme_classic()
top_rss_df <- rss_df %>%
filter(rss_diff_abs_zscore > 1)
regulon_df <- vroom(regulon_file)
regulon_df <- regulon_df %>%
filter(TF %in% top_rss_df$regulon) %>%
mutate(TF = gsub("(+)", "", TF, fixed = TRUE))
go_df <- vroom(go_file)
go_df <- go_df %>%
select(Cluster, geneID) %>%
separate_longer_delim(cols = geneID, delim = "/")
go_df$geneID[go_df$geneID %in% unique(regulon_df$TF)]
net_df <- left_join(go_df, regulon_df, by = c("geneID" = "Target"), relationship = "many-to-many") %>%
na.omit() %>%
arrange(Cluster, geneID) %>%
select(TF, geneID, Cluster)
node_df <- bind_rows(
net_df %>%
select(geneID, Cluster) %>%
mutate(type = "TG") %>%
distinct(),
net_df %>%
select(TF) %>%
rename(geneID = TF) %>%
distinct() %>%
mutate(
type = "TF",
Cluster = "TF"
)
)
node_df %>%
group_by(geneID) %>%
reframe(n = n()) %>%
filter(n > 1)
vroom_write(net_df, file = gsub("\\.tsv$", ".regulons.net.tsv", go_file), col_names = TRUE, append = FALSE, delim = "\t")
vroom_write(node_df, file = gsub("\\.tsv$", ".regulons.node.tsv", go_file), col_names = TRUE, append = FALSE, delim = "\t")2.4 Intersect DE genes with epilepsy genes
library(tidyverse)
library(vroom)
epi_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/epilepsy_genes/tidy.uniq.tsv"
de_genes_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/gsea_merged/CA_EN_L3.Glioma_vs_Astro.0.05.clean.tsv"
epi_df <- vroom(epi_file, delim = "\t")
de_genes_df <- vroom(de_genes_file)
df <- de_genes_df %>%
filter(str_to_lower(gene_name) %in% str_to_lower(epi_df$GN))
vroom_write(df, file = gsub("\\.tsv$", ".epi.tsv", de_genes_file), col_names = TRUE, append = FALSE, delim = "\t")2.5 Post-processing pySCENIC results on merged cell types
# %% post-processing pySCENIC results on merged cell types
import os
import gc
import pandas as pd
import numpy as np
import scanpy as sc
import loompy as lp
from pathlib import Path
# %%
root_dir = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
pyscenic_output_loom_path = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/pyscenic/pyscenic_output.loom"
adata_obs_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/seurat_metadata.tsv"
# %%
work_dir = Path(root_dir, "pyscenic_merged")
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %% extract results from output loom file
import json
import zlib
import base64
lf = lp.connect(pyscenic_output_loom_path, mode="r", validate=False)
# %%
meta = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData)))
regulon_thresholds_df = pd.json_normalize(meta["regulonThresholds"])
regulon_thresholds_df = regulon_thresholds_df.sort_values("regulon").reset_index(
drop=True
)
regulon_matrix = pd.DataFrame(lf.ra.Regulons)
regulon_matrix.index = lf.ra.Gene
regulon_df = regulon_matrix.stack().reset_index()
regulon_df.columns = ["Target", "TF", "Is_Target"]
regulon_df = regulon_df[regulon_df["Is_Target"] > 0].drop(columns=["Is_Target"])
regulon_df = regulon_df[["TF", "Target"]].reset_index(drop=True)
regulon_df = regulon_df.sort_values(by="TF").reset_index(drop=True)
auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)
lf.close()
# %%
cell_subclass_col = "CA_EN_L3"
obs_df = pd.read_csv(adata_obs_file, sep="\t")
obs_df = obs_df.set_index("index")
obs_df["sample_cellSubclass"] = (
obs_df["sample"].astype("str") + "_" + obs_df[cell_subclass_col].astype("str")
)
# %% cell type-specific regulons - z-score
zscore_threshold = 1.96
combined_df = auc_mtx.join(obs_df["sample_cellSubclass"], how="left")
regulon_columns = auc_mtx.columns.tolist()
zscore_df = (
combined_df.groupby(by="sample_cellSubclass").mean()
- combined_df[regulon_columns].mean()
) / combined_df[regulon_columns].std()
zscore_df = (
zscore_df.stack().reset_index().rename(columns={"level_1": "regulon", 0: "zscore"})
)
zscore_df = zscore_df.set_index("sample_cellSubclass")
zscore_df = zscore_df.join(
obs_df.set_index("sample_cellSubclass").drop_duplicates(), how="left"
).sort_values(by=["cell_subclass", "sample", "zscore"], ascending=[True, True, False])
zscore_df = zscore_df.reset_index()
top_regulons = zscore_df[zscore_df["zscore"] >= zscore_threshold]
zscore_df.to_csv("regulon_zscores.tsv", index=False, sep="\t")
top_regulons.to_csv(
f"top_regulons.z-score{zscore_threshold}.tsv", index=False, sep="\t"
)
# %%
from plotnine import *
import pandas as pd
plot_df = top_regulons.copy()
plot_df["sample_cellSubclass"] = pd.Categorical(
plot_df["sample_cellSubclass"], categories=plot_df["sample_cellSubclass"].unique()
)
p = (
ggplot(plot_df, aes(x="regulon", y="sample_cellSubclass", fill="zscore"))
+ geom_tile(color="white", size=0.5)
+ geom_text(aes(label="zscore.round(1)"), size=6)
+ scale_fill_gradientn(colors=["#FFFFD9", "#41B6C4", "#081D58"])
+ theme_minimal()
+ theme(
figure_size=(18, 6),
axis_text_x=element_text(rotation=90, hjust=1),
panel_grid_major=element_blank(),
panel_grid_minor=element_blank(),
axis_title=element_blank(),
)
+ labs(fill="z-score")
)
p.save(f"cell_type_specific_regulons.z-score{zscore_threshold}.pdf", dpi=600)
p
# %% cell type-specific regulons - rss
from pyscenic.rss import regulon_specificity_scores
import pandas as pd
from plotnine import *
top_n = 5
rss = regulon_specificity_scores(
auc_mtx, obs_df.reindex(auc_mtx.index)["sample_cellSubclass"]
)
rss = rss.join(
obs_df[["sample", "sample_cellSubclass", cell_subclass_col]]
.copy()
.set_index("sample_cellSubclass")
.drop_duplicates(),
how="left",
)
rss = rss.reset_index().rename(columns={"index": "sample_cellSubclass"})
rss_long = rss.melt(
id_vars=["sample_cellSubclass", cell_subclass_col, "sample"],
var_name="regulon",
value_name="rss",
)
rss_long = rss_long.sort_values(
by=[cell_subclass_col, "sample", "rss"], ascending=[True, True, False]
)
rss_long["rank"] = rss_long.groupby("sample_cellSubclass")["rss"].rank(
ascending=False, method="first"
)
rss_long["label"] = rss_long.apply(
lambda x: x["regulon"] if x["rank"] <= top_n else "", axis=1
)
rss_long.to_csv(f"cell_type_specific_regulons.rss{top_n}.tsv", index=False, sep="\t")
p = (
ggplot(rss_long, aes(x="rank", y="rss", color="sample"))
+ geom_point(size=0.8, alpha=0.6)
+ geom_text(
aes(label="label"),
size=7,
va="bottom",
ha="left",
show_legend=False,
)
+ facet_wrap(f"~{cell_subclass_col}", scales="free", ncol=3)
+ scale_color_manual(values={"Astro": "#3182bd", "Glioma": "#e6550d"})
+ theme_minimal()
+ theme(
figure_size=(16, 20),
panel_spacing=0.01,
strip_text=element_text(size=10, face="bold"),
panel_grid_minor=element_blank(),
legend_position="top",
)
+ labs(
title="Regulon Specificity Score",
x="Regulon Rank",
y="RSS Score",
color="Condition",
)
)
p.save(f"cell_type_specific_regulons.rss{top_n}.pdf", dpi=600)
p2.6 Calculate module scores over select GO terms
# %%
import os
import gc
from pathlib import Path
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
root_dir = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
adata_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/annotated_by_cell_subclass.h5ad"
gene_set_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/customized_gene_sets/deg_sets.tsv"
# %%
work_dir = Path(root_dir, "module_score_corr")
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %%
import pandas as pd
gene_set_df = pd.read_csv(gene_set_file, sep="\t")
gene_set_df["geneID"] = gene_set_df["geneID"].str.split("/")
gene_set_df = gene_set_df.explode("geneID")
gene_set_df["geneID"] = gene_set_df["geneID"].str.strip()
print(f"All genes are valid: {all(gene_set_df["geneID"].isin(adata.var_names))}")
gene_set_dict = (
gene_set_df.groupby("module")["geneID"].apply(lambda x: list(x.unique())).to_dict()
)
# %%
for k in gene_set_dict.keys():
sc.tl.score_genes(
adata,
gene_list=gene_set_dict[k],
score_name=f"{k}_ModuleScore",
layer="data",
ctrl_size=50,
n_bins=25,
)
# %%
from plotnine import *
import pandas as pd
import numpy as np
import re
from copy import copy
def squash(x, range=(0, 1), only_finite=True):
res = copy(x)
if not len(x):
return res
if only_finite:
try:
finite = np.isfinite(x)
except TypeError:
finite = np.repeat(True, len(x))
else:
finite = np.repeat(True, len(x))
res[finite] = np.clip(res[finite], range[0], range[1])
return res
module_score_keys = [k for k in adata.obs.keys() if re.match(r".+_ModuleScore$", k)]
for target_col in module_score_keys:
gene_set_size = len(gene_set_dict[re.sub(r"_ModuleScore$", "", target_col)])
plot_df = pd.DataFrame(
adata.obsm["X_umap"], columns=["UMAP1", "UMAP2"], index=adata.obs_names
)
plot_df = pd.concat([plot_df, adata.obs[["sample", target_col]]], axis=1)
plot_df["sample"] = pd.Categorical(
plot_df["sample"], categories=["Astro", "Glioma"], ordered=True
)
plot_df = plot_df.sort_values(by=target_col, ascending=True)
qs = plot_df[target_col].quantile(np.arange(0, 1.01, 0.01))
qs.index = qs.index.round(3)
p = (
ggplot(plot_df, aes(x="UMAP1", y="UMAP2", color=target_col))
+ geom_point(size=0.03, alpha=0.7)
+ facet_wrap("~ sample", ncol=2)
+ scale_color_gradient2(
low="#41B7C4",
mid="#FCFCFC",
high="#FF6347",
midpoint=0,
limits=(min(qs[0.050], 0), max(qs[0.950], 0)),
oob=squash,
name="Activity",
)
+ coord_equal()
+ theme_classic(base_family="Arial", base_size=10)
+ theme(
figure_size=(5, 5),
strip_background=element_blank(),
strip_text=element_text(size=10, fontweight="normal", color="black"),
panel_grid=element_blank(),
)
+ labs(title=f"Activity: {target_col} ({gene_set_size})")
)
p.save(f"{target_col}.umap.pdf", dpi=600)
# %%
adata.obs.to_csv("adata_obs_ModuleScore.tsv", sep="\t", index=False)
adata.write("adata_with_ModuleScore.h5ad")2.7 Calculate module scores over select GO terms (2024, Xin Wang, sci. adv.)
# %%
import os
import gc
from pathlib import Path
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
work_dir = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/2024_XinWang_SciAdv_snRNA/module_score_corr"
adata_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/2024_XinWang_SciAdv/SCSP0000482_snRNA.h5ad"
gene_set_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/customized_gene_sets/deg_sets.tsv"
# %%
adata = sc.read_h5ad(adata_file)
# %%
# check whether .X is a raw counts matrix
sample_size = min(10000, adata.X.shape[0])
sample_data = adata.X[:sample_size]
# for sparse matrix
values = sample_data.data
if np.any(values < 0):
print("X contains negative numbers and may be scaled data")
else:
is_integer = np.all(np.mod(values, 1) == 0)
if is_integer:
print(f"X may be raw counts (i.e., all are integers)")
else:
print(f"X may be normalized data")
# %%
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata.obs.groupby(["cell_type", "sample_name"]).size().unstack(fill_value=0)
# %%
adata.obs[["patient", "location"]] = adata.obs["sample_name"].str.extract(
r"^(.{2})(.*)$"
)
# %%
adata.obs.groupby(["cell_type", "location"]).size().unstack(fill_value=0)
# %%
import pandas as pd
gene_set_df = pd.read_csv(gene_set_file, sep="\t")
gene_set_df["geneID"] = gene_set_df["geneID"].str.split("/")
gene_set_df = gene_set_df.explode("geneID")
gene_set_df["geneID"] = gene_set_df["geneID"].str.strip()
gene_set_df["geneID_human"] = gene_set_df["geneID"].str.upper()
gene_set_df = gene_set_df[gene_set_df["geneID_human"].isin(adata.var_names)].copy()
gene_set_dict = (
gene_set_df.groupby("module")["geneID_human"]
.apply(lambda x: list(x.unique()))
.to_dict()
)
# %%
for k in gene_set_dict.keys():
sc.tl.score_genes(
adata,
gene_list=gene_set_dict[k],
score_name=f"{k}_ModuleScore",
ctrl_size=50,
n_bins=25,
)
# %%
from plotnine import *
import pandas as pd
import numpy as np
import re
from copy import copy
def squash(x, range=(0, 1), only_finite=True):
res = copy(x)
if not len(x):
return res
if only_finite:
try:
finite = np.isfinite(x)
except TypeError:
finite = np.repeat(True, len(x))
else:
finite = np.repeat(True, len(x))
res[finite] = np.clip(res[finite], range[0], range[1])
return res
module_score_keys = [k for k in adata.obs.keys() if re.match(r".+_ModuleScore$", k)]
for target_col in module_score_keys:
gene_set_size = len(gene_set_dict[re.sub(r"_ModuleScore$", "", target_col)])
plot_df = pd.DataFrame(
adata.obsm["X_umap"], columns=["UMAP1", "UMAP2"], index=adata.obs_names
)
plot_df = pd.concat([plot_df, adata.obs[["location", target_col]]], axis=1)
plot_df["location"] = pd.Categorical(
plot_df["location"], categories=["N21", "TC1"], ordered=True
)
plot_df = plot_df.sort_values(by=target_col, ascending=True)
qs = plot_df[target_col].quantile(np.arange(0, 1.01, 0.01))
qs.index = qs.index.round(3)
p = (
ggplot(plot_df, aes(x="UMAP1", y="UMAP2", color=target_col))
+ geom_point(size=0.03, alpha=0.7)
+ facet_wrap("~ location", ncol=2)
+ scale_color_gradient2(
low="#41B7C4",
mid="#FCFCFC",
high="#FF6347",
midpoint=0,
limits=(min(qs[0.050], 0), max(qs[0.950], 0)),
oob=squash,
name="Activity",
)
+ coord_equal()
+ theme_classic(base_family="Arial", base_size=10)
+ theme(
figure_size=(5, 5),
strip_background=element_blank(),
strip_text=element_text(size=10, fontweight="normal", color="black"),
panel_grid=element_blank(),
)
+ labs(title=f"Activity: {target_col} ({gene_set_size})")
)
p.save(f"{target_col}.umap.pdf", dpi=600)
# %%
sc.set_figure_params(figsize=(4, 4))
for loc in adata.obs["location"].unique():
sc.pl.umap(
adata[adata.obs["location"] == loc],
color=["cell_type"],
title=f"Location: {loc}",
# legend_loc="on data",
palette="tab20",
legend_fontsize=10,
legend_fontweight="normal",
legend_fontoutline=3,
show=False,
)
fig = plt.gcf()
fig.savefig(loc + ".cell_type.umap.pdf", dpi=600, bbox_inches="tight")
# %%
adata.obs.to_csv("adata_obs_ModuleScore.tsv", sep="\t", index=False)
adata.write("adata_with_ModuleScore.h5ad")2.8 Stratification of ENs by inflammatory burden
# To ensure statistical stringency and avoid circularity,
# we established activation thresholds based on the 99th
# percentile of the control group's score distribution,
# subsequently stratifying treatment-group neurons
# into high- and low-inflammatory states.
library(tidyverse)
library(vroom)
library(glue)
library(unigd)
obs_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/module_score_corr/adata_obs_ModuleScore.tsv"
target_cell_types <- c("CA1 EN", "CA3 EN", "Proximal Sub EN", "Superficial Sub EN", "Deep Sub EN", "DG EN")
target_module <- "up_immune_response_ModuleScore"
obs_df <- vroom(obs_file)
obs_df <- obs_df %>%
filter(cell_subclass %in% target_cell_types) %>%
rename(MS := !!sym(target_module)) %>%
select(cell_barcode, sample, cell_subclass, MS)
qs <- obs_df %>%
filter(sample == "Astro") %>%
pull(MS) %>%
quantile(probs = seq(0, 1, 0.01))
qs_th <- qs["99%"]
obs_df <- obs_df %>%
mutate(MS_active = if_else(MS > qs_th, "active", "inactive"))
vroom_write(
obs_df,
file = paste0(gsub("\\.tsv$", "", obs_file), ".", target_module, ".Q99_threshold.tsv"),
delim = "\t", col_names = TRUE, append = FALSE
)
stats <- table(obs_df$sample, obs_df$MS_active)
p <- ggplot(obs_df, aes(x = MS, color = sample)) +
geom_density() +
geom_vline(xintercept = qs_th, linetype = "dashed", color = "grey") +
scale_x_continuous(expand = expansion(0)) +
scale_y_continuous(expand = expansion(0)) +
labs(caption = glue("Quantile threshold: {round(qs_th, 6)}\nAA: {stats['Astro', 'active']}, AI: {stats['Astro', 'inactive']}, GA: {stats['Glioma', 'active']}, GI: {stats['Glioma', 'inactive']}")) +
theme_classic(base_family = "Arial") +
theme(legend.position = "top")
ugd_save_inline(
{
print(p)
},
file = paste0(gsub("\\.tsv$", "", obs_file), ".", target_module, ".Q99_threshold.pdf"),
width = 200,
height = 200
)2.9 Post-processing pySCENIC results on stritified ENs
# %% Post-processing pySCENIC results on stritified ENs
import os
import gc
import pandas as pd
import numpy as np
import scanpy as sc
import loompy as lp
from pathlib import Path
# %%
root_dir = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
pyscenic_output_loom_path = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/pyscenic/pyscenic_output.loom"
adata_obs_file = "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/module_score_corr/adata_obs_ModuleScore.up_immune_response_ModuleScore.Q99_threshold.seu_meta.tsv"
# %%
work_dir = Path(root_dir, "pyscenic_merged.up_immune_response_ModuleScore")
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %% extract results from output loom file
import json
import zlib
import base64
lf = lp.connect(pyscenic_output_loom_path, mode="r", validate=False)
# %%
meta = json.loads(zlib.decompress(base64.b64decode(lf.attrs.MetaData)))
regulon_thresholds_df = pd.json_normalize(meta["regulonThresholds"])
regulon_thresholds_df = regulon_thresholds_df.sort_values("regulon").reset_index(
drop=True
)
regulon_matrix = pd.DataFrame(lf.ra.Regulons)
regulon_matrix.index = lf.ra.Gene
regulon_df = regulon_matrix.stack().reset_index()
regulon_df.columns = ["Target", "TF", "Is_Target"]
regulon_df = regulon_df[regulon_df["Is_Target"] > 0].drop(columns=["Is_Target"])
regulon_df = regulon_df[["TF", "Target"]].reset_index(drop=True)
regulon_df = regulon_df.sort_values(by="TF").reset_index(drop=True)
auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)
lf.close()
# %%
cell_subclass_col = "new_cell_type"
obs_df = pd.read_csv(adata_obs_file, sep="\t")
obs_df = obs_df.set_index("index")
obs_df["sample_cellSubclass"] = (
obs_df["sample"].astype("str") + "_" + obs_df[cell_subclass_col].astype("str")
)
# %% cell type-specific regulons - z-score
zscore_threshold = 1.96
combined_df = auc_mtx.join(obs_df["sample_cellSubclass"], how="left")
regulon_columns = auc_mtx.columns.tolist()
zscore_df = (
combined_df.groupby(by="sample_cellSubclass").mean()
- combined_df[regulon_columns].mean()
) / combined_df[regulon_columns].std()
zscore_df = (
zscore_df.stack().reset_index().rename(columns={"level_1": "regulon", 0: "zscore"})
)
zscore_df = zscore_df.set_index("sample_cellSubclass")
zscore_df = zscore_df.join(
obs_df.set_index("sample_cellSubclass").drop_duplicates(), how="left"
).sort_values(by=[cell_subclass_col, "sample", "zscore"], ascending=[True, True, False])
zscore_df = zscore_df.reset_index()
top_regulons = zscore_df[zscore_df["zscore"] >= zscore_threshold]
zscore_df.to_csv("regulon_zscores.tsv", index=False, sep="\t")
top_regulons.to_csv(
f"top_regulons.z-score{zscore_threshold}.tsv", index=False, sep="\t"
)
# %%
from plotnine import *
import pandas as pd
plot_df = top_regulons.copy()
plot_df["sample_cellSubclass"] = pd.Categorical(
plot_df["sample_cellSubclass"], categories=plot_df["sample_cellSubclass"].unique()
)
p = (
ggplot(plot_df, aes(x="regulon", y="sample_cellSubclass", fill="zscore"))
+ geom_tile(color="white", size=0.5)
+ geom_text(aes(label="zscore.round(1)"), size=6)
+ scale_fill_gradientn(colors=["#FFFFD9", "#41B6C4", "#081D58"])
+ theme_minimal()
+ theme(
figure_size=(18, 6),
axis_text_x=element_text(rotation=90, hjust=1),
panel_grid_major=element_blank(),
panel_grid_minor=element_blank(),
axis_title=element_blank(),
)
+ labs(fill="z-score")
)
p.save(f"cell_type_specific_regulons.z-score{zscore_threshold}.pdf", dpi=600)
p
# %% cell type-specific regulons - rss
from pyscenic.rss import regulon_specificity_scores
import pandas as pd
from plotnine import *
top_n = 5
rss = regulon_specificity_scores(
auc_mtx, obs_df.reindex(auc_mtx.index)["sample_cellSubclass"]
)
rss = rss.join(
obs_df[["sample", "sample_cellSubclass", cell_subclass_col]]
.copy()
.set_index("sample_cellSubclass")
.drop_duplicates(),
how="left",
)
rss = rss.reset_index().rename(columns={"index": "sample_cellSubclass"})
rss_long = rss.melt(
id_vars=["sample_cellSubclass", cell_subclass_col, "sample"],
var_name="regulon",
value_name="rss",
)
rss_long = rss_long.sort_values(
by=[cell_subclass_col, "sample", "rss"], ascending=[True, True, False]
)
rss_long["rank"] = rss_long.groupby("sample_cellSubclass")["rss"].rank(
ascending=False, method="first"
)
rss_long["label"] = rss_long.apply(
lambda x: x["regulon"] if x["rank"] <= top_n else "", axis=1
)
rss_long.to_csv(f"cell_type_specific_regulons.rss{top_n}.tsv", index=False, sep="\t")
p = (
ggplot(rss_long, aes(x="rank", y="rss", color="sample"))
+ geom_point(size=0.8, alpha=0.6)
+ geom_text(
aes(label="label"),
size=7,
va="bottom",
ha="left",
show_legend=False,
)
+ facet_wrap(f"~{cell_subclass_col}", scales="free", ncol=3)
+ scale_color_manual(values={"Astro": "#3182bd", "Glioma": "#e6550d"})
+ theme_minimal()
+ theme(
figure_size=(16, 20),
panel_spacing=0.01,
strip_text=element_text(size=10, face="bold"),
panel_grid_minor=element_blank(),
legend_position="top",
)
+ labs(
title="Regulon Specificity Score",
x="Regulon Rank",
y="RSS Score",
color="Condition",
)
)
p.save(f"cell_type_specific_regulons.rss{top_n}.pdf", dpi=600)
p2.10 DE analysis on stritified ENs
# perform DE analysis on stritified ENs
library(anndataR)
library(Seurat)
library(tidyverse)
library(YRUtils)
library(vroom)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
setwd(work_dir)
h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/annotated_by_cell_subclass.h5ad"
ms_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/module_score_corr/adata_obs_ModuleScore.up_immune_response_ModuleScore.Q99_threshold.tsv"
# seu <- read_h5ad(h5ad_file, as = "Seurat")
# seu.bak <- seu
ms_df <- vroom(ms_file)
ms_df <- ms_df %>%
filter(sample == "Glioma") %>%
mutate(
rowname = paste0(cell_barcode, "_", sample),
new_cell_type = paste0(sample, "_", MS_active)
) %>%
select(rowname, new_cell_type)
ms_ls <- split(ms_df, ms_df$new_cell_type) %>%
lapply(function(x) x$rowname)
seu <- seu.bak
seu$sample_cellBarcode <- row.names(seu@meta.data)
seu$new_cell_type <- ifelse(seu$sample_cellBarcode %in% ms_ls[["Glioma_active"]], "Glioma_active",
ifelse(seu$sample_cellBarcode %in% ms_ls[["Glioma_inactive"]], "Glioma_inactive", "Other")
)
# DE analysis
Idents(seu) <- "new_cell_type"
ident_1 <- "Glioma_active"
ident_2 <- "Glioma_inactive"
de_res <- FindMarkers(
seu,
ident.1 = ident_1,
ident.2 = ident_2,
logfc.threshold = 0,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = -Inf,
only.pos = FALSE,
random.seed = 123456
)
de_res <- de_res %>%
mutate(
gene_name = row.names(.),
log2fc_sign = if_else(avg_log2FC >= 0, 1L, -1L),
up_down = if_else(log2fc_sign == 1L, "up", "down"),
up_down = factor(up_down, levels = c("up", "down")),
signed_pval = p_val * log2fc_sign,
) %>%
group_by(up_down) %>%
arrange(up_down, signed_pval)
vroom_write(
de_res,
file = gsub("\\.tsv$", paste0(".", ident_1, "_vs_", ident_2, ".tsv"), ms_file),
col_names = TRUE, append = FALSE
)2.11 GO ORA & GSEA on DE results of stritified ENs
# perform ORA and GSEA over GO terms for DE results
library(YRUtils)
library(tidyverse)
library(vroom)
library(clusterProfiler)
library(AnnotationDbi)
library(glue)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
setwd(work_dir)
orgdb_file <- "/data/database/annotation/release/mus_musculus/org.Mm.eg.db.sqlite"
de_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/module_score_corr/adata_obs_ModuleScore.up_immune_response_ModuleScore.Q99_threshold.Glioma_active_vs_Glioma_inactive.tsv"
padj_th <- 0.05
gsea_output_dir <- "gsea_merged"
go_out_dir <- "go_merged"
dir.create(gsea_output_dir, showWarnings = FALSE)
dir.create(go_out_dir, showWarnings = FALSE)
orgdb <- loadDb(orgdb_file)
df <- vroom(de_file)
noise_genes <- df$gene_name[grepl("^(mt-|Rps|Rpl|Mrps|Mrpl|Hb[^(p)]|Gm\\d+$|Xist$|Ttr$|Malat1$|Meg3$|Rn\\d+s|Snhg\\d+)", df$gene_name)]
df <- df %>%
filter(!(gene_name %in% noise_genes)) %>%
mutate(signed_log10_pval = -log10(p_val) * sign(avg_log2FC)) %>%
arrange(desc(signed_log10_pval))
vroom_write(
df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".clean.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_list <- setNames(df$signed_log10_pval, df$gene_name)
gse <- gseGO(
geneList = gene_list,
OrgDb = orgdb,
ont = "ALL",
keyType = "SYMBOL",
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
seed = 1234
)
if (!is.null(gse)) {
saveRDS(gse, file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.rds", basename(de_file))))
vroom_write(
gse@result,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
}
tmp_df <- df %>%
filter(p_val_adj < padj_th)
vroom_write(
tmp_df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", glue(".{padj_th}.clean.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_ls <- lapply(split(tmp_df, tmp_df$up_down), function(x) unique(na.omit(x[["gene_name"]])))
ego <- compareCluster(
gene_ls,
OrgDb = orgdb,
keyType = "SYMBOL",
fun = "enrichGO",
ont = "ALL",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
readable = FALSE,
pool = FALSE
)
if (!is.null(ego)) {
saveRDS(ego, file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.rds"), basename(de_file))))
vroom_write(
ego@compareClusterResult,
file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
}2.12 Attach metadata of strified ENs to Seurat metedata
library(tidyverse)
library(vroom)
seu_meta_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/seurat_metadata.tsv"
ms_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/module_score_corr/adata_obs_ModuleScore.up_immune_response_ModuleScore.Q99_threshold.tsv"
ms_df <- vroom(ms_file)
ms_df <- ms_df %>%
select(sample, cell_barcode, MS_active)
seu_meta <- vroom(seu_meta_file)
seu_meta <- seu_meta %>%
select(sample, cell_barcode, CA_EN_L4)
df <- left_join(seu_meta, ms_df, by = c("sample", "cell_barcode")) %>%
mutate(
MS_active = if_else(is.na(MS_active), "", MS_active),
new_cell_type = paste0(CA_EN_L4, " ", MS_active),
new_cell_type = trimws(new_cell_type),
index = paste0(cell_barcode, "_", sample)
) %>%
select(index, sample, new_cell_type)
vroom_write(
df,
file = gsub("\\.tsv$", ".seu_meta.tsv", ms_file),
col_names = TRUE, append = FALSE, delim = "\t"
)2.13 DE analysis on stritified ENs (2024, Xin Wang, sci. adv.)
library(anndataR)
library(Seurat)
library(tidyverse)
library(YRUtils)
library(vroom)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
setwd(work_dir)
h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/2024_XinWang_SciAdv_snRNA/module_score_corr/adata_with_ModuleScore.h5ad"
target_module_score <- "up_immune_response_ModuleScore"
# seu <- read_h5ad(h5ad_file, as = "Seurat")
# seu.bak <- seu
seu <- seu.bak
LayerData(seu, assay = "RNA", layer = "data") <- LayerData(seu, assay = "RNA", layer = "X")
DefaultLayer(seu[["RNA"]]) <- "data"
LayerData(seu, assay = "RNA", layer = "X") <- NULL
df <- seu@meta.data
df <- df %>%
rename(MS := !!sym(target_module_score)) %>%
select(cell_type, sample_name, patient, location, MS) %>%
mutate(rowname = row.names(.)) %>%
as_tibble()
ggplot(df, aes(MS)) +
geom_density() +
theme_classic()
ggplot(filter(df, cell_type == "Excitatory neuron"), aes(MS)) +
geom_density() +
theme_classic()
df <- df %>%
mutate(
MS_active = if_else(MS > 0, "active", "inactive"),
is_EN = if_else(cell_type == "Excitatory neuron", TRUE, FALSE)
)
table(df$is_EN, df$MS_active)
df <- filter(df, is_EN)
ls <- split(df, df$MS_active) %>%
lapply(function(x) x$rowname)
seu$rowname <- row.names(seu@meta.data)
seu$new_cell_type <- ifelse(seu$rowname %in% ls[["active"]], "EN_Active",
ifelse(seu$rowname %in% ls[["inactive"]], "EN_Inactive", "Other")
)
# DE analysis
Idents(seu) <- "new_cell_type"
ident_1 <- "EN_Active"
ident_2 <- "EN_Inactive"
de_res <- FindMarkers(
seu,
ident.1 = ident_1,
ident.2 = ident_2,
logfc.threshold = 0,
test.use = "wilcox",
min.pct = 0.1,
min.diff.pct = -Inf,
only.pos = FALSE,
random.seed = 123456
)
de_res <- de_res %>%
mutate(
gene_name = row.names(.),
log2fc_sign = if_else(avg_log2FC >= 0, 1L, -1L),
up_down = if_else(log2fc_sign == 1L, "up", "down"),
up_down = factor(up_down, levels = c("up", "down")),
signed_pval = p_val * log2fc_sign,
) %>%
group_by(up_down) %>%
arrange(up_down, signed_pval)
vroom_write(
de_res,
file = gsub("\\.h5ad$", paste0(".", target_module_score, ".", ident_1, "_vs_", ident_2, ".DE.tsv"), h5ad_file),
col_names = TRUE, append = FALSE
)2.14 GO ORA & GSEA on DE results of stritified ENs (2024, Xin Wang, sci. adv.)
# perform ORA and GSEA over GO terms for DE results
library(YRUtils)
library(tidyverse)
library(vroom)
library(clusterProfiler)
library(AnnotationDbi)
library(glue)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis"
setwd(work_dir)
orgdb_file <- "/data/database/annotation/release/homo_sapiens/org.Hs.eg.db.sqlite"
de_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/2024_XinWang_SciAdv_snRNA/module_score_corr/adata_with_ModuleScore.up_immune_response_ModuleScore.EN_Active_vs_EN_Inactive.DE.tsv"
padj_th <- 0.05
gsea_output_dir <- "gsea_merged"
go_out_dir <- "go_merged"
dir.create(gsea_output_dir, showWarnings = FALSE)
dir.create(go_out_dir, showWarnings = FALSE)
orgdb <- loadDb(orgdb_file)
df <- vroom(de_file)
noise_genes <- df$gene_name[grepl("^(MT-|RPS|RPL|MRPS|MRPL|HB[^(P)]|GM\\D+$|XIST$|TTR$|MALAT1$|MEG3$|RN\\D+S|SNHG\\D+)", df$gene_name)]
df <- df %>%
filter(!(gene_name %in% noise_genes)) %>%
mutate(signed_log10_pval = -log10(p_val + .Machine$double.xmin) * sign(avg_log2FC)) %>%
arrange(desc(signed_log10_pval))
vroom_write(
df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".clean.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_list <- setNames(df$signed_log10_pval, df$gene_name)
gse <- gseGO(
geneList = gene_list,
OrgDb = orgdb,
ont = "ALL",
keyType = "SYMBOL",
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
seed = 1234
)
if (!is.null(gse)) {
saveRDS(gse, file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.rds", basename(de_file))))
vroom_write(
gse@result,
file = file.path(gsea_output_dir, gsub("\\.tsv$", ".gseGO.tsv", basename(de_file))),
col_names = TRUE, append = FALSE
)
}
tmp_df <- df %>%
filter(p_val_adj < padj_th)
vroom_write(
tmp_df,
file = file.path(gsea_output_dir, gsub("\\.tsv$", glue(".{padj_th}.clean.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
gene_ls <- lapply(split(tmp_df, tmp_df$up_down), function(x) unique(na.omit(x[["gene_name"]])))
ego <- compareCluster(
gene_ls,
OrgDb = orgdb,
keyType = "SYMBOL",
fun = "enrichGO",
ont = "ALL",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
minGSSize = 10,
maxGSSize = 500,
readable = FALSE,
pool = FALSE
)
if (!is.null(ego)) {
saveRDS(ego, file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.rds"), basename(de_file))))
vroom_write(
ego@compareClusterResult,
file = file.path(go_out_dir, gsub("\\.tsv$", glue(".{padj_th}.oraGO.tsv"), basename(de_file))),
col_names = TRUE, append = FALSE
)
}3 Appendix
3.1 DoRothRA
library(dorothea)
library(decoupleR)
library(vroom)
library(tidyverse)
net_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/data/dorothea_mouse_A2D.tsv"
# net <- get_dorothea(
# organism = "mouse",
# levels = c("A", "B", "C", "D")
# )
# vroom_write(net, file = net_file, delim = "\t")3.2 Figures (v20260506)
3.2.1 UMAP plot & cell proportion bar plot
library(Seurat)
library(anndataR)
library(unigd)
library(tidyverse)
library(scCustomize)
library(vroom)
library(patchwork)
gg_pt <- function(x) x / (ggplot2::.pt * 0.75)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/figures_v20260506"
setwd(work_dir)
h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/Annotated/annotated_by_cell_subclass.h5ad"
cell_prop_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/initial_analysis/cond_prop_for_each_cell_type.tsv"
# seu <- read_h5ad(h5ad_file, as = "Seurat")
# seu.bak <- seu
cell_prop_df <- vroom(cell_prop_file)
colors <- setNames(
DiscretePalette_scCustomize(num_colors = 40, palette = "ditto_seq")[c(2:3, 5:7, 17:20, 21:26, 29:31)],
levels(seu@meta.data$cell_subclass)
)
p1 <- DimPlot_scCustom(
seu,
reduction = "X_umap",
figure_plot = TRUE,
group.by = "cell_subclass",
colors_use = colors,
pt.size = 0.1,
aspect_ratio = 1
) +
labs(x = "UMAP1", y = "UMAP2") +
theme(text = element_text(family = "Arial", size = 10, color = "black"))
ugd_save_inline(
{
print(p1)
},
file = "cell_subclass.umap.pdf",
width = 400,
height = 400
)
p2 <- ggplot(cell_prop_df) +
geom_bar(aes(cell_subclass, prop_cond, fill = cell_subclass, alpha = sample),
stat = "identity", color = NA, position = "stack"
) +
geom_hline(yintercept = 50, linetype = "dashed", color = "grey25", linewidth = gg_pt(1)) +
labs(x = NULL, y = "Number of cells", alpha = "Condition") +
guides(fill = "none") +
scale_fill_manual(values = colors) +
scale_alpha_manual(values = c("Astro" = 0.6, "Glioma" = 1)) +
scale_y_continuous(expand = expansion(0)) +
theme_classic(base_family = "Arial") +
theme(
axis.title = element_text(size = 12, color = "black"),
axis.text.y = element_text(size = 10, color = "black"),
axis.text.x = element_blank(),
legend.title = element_text(size = 10, color = "black"),
legend.text = element_text(size = 10, color = "black"),
axis.line.x = element_blank(),
axis.ticks.x = element_blank()
)
ugd_save_inline(
{
print(p2)
},
file = "cell_subclass.cell_prop.barplot.pdf",
width = 400,
height = 200
)3.2.2 GO bar plot
library(tidyverse)
library(vroom)
library(unigd)
gg_pt <- function(x) x / (ggplot2::.pt * 0.75)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/figures_v20260506"
setwd(work_dir)
go_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/go_merged/CA_EN_L4.Glioma_vs_Astro.0.05.oraGO.select_terms.tsv"
go_df <- vroom(go_file)
go_df <- go_df %>%
mutate(Cluster = factor(Cluster, levels = c("down", "up"))) %>%
arrange(Cluster, desc(p.adjust)) %>%
mutate(Description = factor(Description, levels = unique(Description)))
p <- ggplot(go_df, aes(x = -log10(p.adjust), y = Description, fill = Cluster)) +
geom_bar(stat = "identity", width = 0.8, alpha = 0.6) +
geom_text(
aes(x = 0.05, label = Description),
hjust = 0,
size = gg_pt(10),
family = "Arial",
color = "black"
) +
scale_fill_manual(
values = c("down" = "#377EB8", "up" = "#E41A1C"),
labels = c("down" = "Down", "up" = "Up")
) +
labs(y = NULL, fill = "DE direction") +
scale_x_continuous(expand = expansion(0), position = "top") +
theme_classic(base_family = "Arial") +
theme(
legend.position = "bottom",
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line = element_line(color = "black", linewidth = gg_pt(1)),
axis.ticks.x = element_line(color = "black", linewidth = gg_pt(1)),
axis.text.x = element_text(size = 10, color = "black"),
axis.title = element_text(size = 12, color = "black"),
legend.title = element_text(size = 10, color = "black"),
legend.text = element_text(size = 10, color = "black"),
)
ugd_save_inline(
{
print(p)
},
file = "select_go_terms.barplot.pdf",
width = 300,
height = 300
)3.2.3 Module score UMAP plot
library(Seurat)
library(anndataR)
library(unigd)
library(tidyverse)
library(scCustomize)
library(vroom)
library(patchwork)
library(ggrastr)
gg_pt <- function(x) x / (ggplot2::.pt * 0.75)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/figures_v20260506"
setwd(work_dir)
mm_h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/module_score_corr/adata_with_ModuleScore.h5ad"
hs_h5ad_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/2024_XinWang_SciAdv_snRNA/module_score_corr/adata_with_ModuleScore.h5ad"
# mm_seu <- read_h5ad(mm_h5ad_file, as = "Seurat")
# mm_seu.bak <- mm_seu
umap_df <- mm_seu@reductions$X_umap@cell.embeddings %>%
as.data.frame() %>%
mutate(cell_id = row.names(.))
ms_df <- mm_seu@meta.data %>%
select(sample, up_immune_response_ModuleScore) %>%
mutate(cell_id = row.names(.))
plot_data <- inner_join(umap_df, ms_df, by = "cell_id") %>%
mutate(sample = factor(sample, levels = c("Astro", "Glioma"))) %>%
arrange(up_immune_response_ModuleScore)
qs <- quantile(plot_data$up_immune_response_ModuleScore, probs = seq(0, 1, 0.01))
p <- ggplot(plot_data, aes(x = Xumap_1, y = Xumap_2, color = up_immune_response_ModuleScore)) +
rasterise(geom_point(size = 0.3), dpi = 600) +
scale_color_gradientn(
colors = c("#41B7C4", "#FCFCFC", "#FF6347"),
values = scales::rescale(c(qs["5%"], 0, qs["95%"])),
limits = c(qs["5%"], qs["95%"]),
oob = scales::squish
) +
facet_wrap(~sample, nrow = 1) +
labs(x = NULL, y = NULL, color = "Module Score") +
theme_void(base_family = "Arial") +
coord_equal() +
theme(
text = element_text(size = 10, color = "black"),
legend.title = element_text(size = 10, color = "black"),
legend.text = element_text(size = 10, color = "black")
)
ugd_save_inline(
{
print(p)
},
file = "mm_HC.module_score.umap.pdf",
width = 440,
height = 220
)
# hs_seu <- read_h5ad(hs_h5ad_file, as = "Seurat")
# hs_seu.bak <- hs_seu
hs_seu <- subset(hs_seu, location == "N21")
umap_df <- hs_seu@reductions$X_umap@cell.embeddings %>%
as.data.frame() %>%
mutate(cell_id = row.names(.))
ms_df <- hs_seu@meta.data %>%
select(up_immune_response_ModuleScore) %>%
mutate(cell_id = row.names(.))
plot_data <- inner_join(umap_df, ms_df, by = "cell_id") %>%
arrange(up_immune_response_ModuleScore)
qs <- quantile(plot_data$up_immune_response_ModuleScore, probs = seq(0, 1, 0.01))
p1 <- ggplot(plot_data, aes(x = Xumap_1, y = Xumap_2, color = up_immune_response_ModuleScore)) +
rasterise(geom_point(size = 0.1), dpi = 600) +
scale_color_gradientn(
colors = c("#41B7C4", "#FCFCFC", "#FF6347"),
values = scales::rescale(c(qs["5%"], 0, qs["95%"])),
limits = c(qs["5%"], qs["95%"]),
oob = scales::squish,
breaks = scales::breaks_extended(10)
) +
labs(x = NULL, y = NULL, color = "Module Score") +
theme_void(base_family = "Arial") +
coord_equal() +
theme(
legend.title = element_text(size = 10, color = "black"),
legend.text = element_text(size = 10, color = "black")
)
colors <- setNames(
DiscretePalette_scCustomize(num_colors = 40, palette = "ditto_seq")[c(3, 5, 6, 18, 19, 29:31, 32)],
levels(hs_seu@meta.data$cell_type)
)
p2 <- DimPlot_scCustom(
hs_seu,
reduction = "X_umap",
figure_plot = TRUE,
group.by = "cell_type",
colors_use = colors,
pt.size = 0.1,
aspect_ratio = 1
) +
labs(x = "UMAP1", y = "UMAP2") +
theme(text = element_text(family = "Arial", size = 10, color = "black"))
ugd_save_inline(
{
print(p2 | p1)
},
file = "hs_PTB.cell_type.module_score.umap.pdf",
width = 700,
height = 350
)3.2.4 Module score thresholding plot
library(tidyverse)
library(vroom)
library(unigd)
gg_pt <- function(x) x / (ggplot2::.pt * 0.75)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/figures_v20260506"
setwd(work_dir)
hs_ms_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/2024_XinWang_SciAdv_snRNA/module_score_corr/adata_obs_ModuleScore.tsv"
mm_ms_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/module_score_corr/adata_obs_ModuleScore.tsv"
mm_en_types <- c("CA1 EN", "CA3 EN", "Proximal Sub EN", "Superficial Sub EN", "Deep Sub EN", "DG EN")
mm_ms_df <- vroom(mm_ms_file)
mm_ms_df <- mm_ms_df %>%
filter(cell_subclass %in% mm_en_types) %>%
rename(MS = up_immune_response_ModuleScore) %>%
select(sample, cell_subclass, MS)
mm_ctrl_qs <- mm_ms_df %>%
filter(sample == "Astro") %>%
pull(MS) %>%
quantile(probs = seq(0, 1, 0.01))
mm_stats <- table(mm_ms_df$sample, mm_ms_df$MS > mm_ctrl_qs["99%"]) %>%
as.data.frame() %>%
rename(sample = Var1, MS_active = Var2) %>%
arrange(sample, MS_active)
vroom_write(
mm_stats,
file = "mm_module_score.threshold_stats.tsv",
col_names = TRUE, append = FALSE, delim = "\t"
)
p <- ggplot(mm_ms_df, aes(x = MS, color = sample)) +
geom_density(linewidth = gg_pt(1)) +
geom_vline(xintercept = mm_ctrl_qs["99%"], linetype = "dashed", color = "#B3B3B3", linewidth = gg_pt(1)) +
scale_color_manual(values = c("Astro" = "#B3B3B3", "Glioma" = "#030303")) +
scale_x_continuous(expand = expansion(0)) +
scale_y_continuous(expand = expansion(0)) +
labs(
x = "Module Score", y = "Density", color = "Condition",
title = "Control quantile threshold: Q99"
) +
theme_classic(base_family = "Arial") +
theme(
plot.title = element_text(size = 12, color = "black", hjust = 0.5),
axis.title = element_text(size = 12, color = "black"),
axis.text.y = element_text(size = 10, color = "black"),
axis.text.x = element_text(size = 10, color = "black"),
legend.title = element_text(size = 10, color = "black"),
legend.text = element_text(size = 10, color = "black"),
axis.line = element_line(color = "black", linewidth = gg_pt(1)),
axis.ticks = element_line(color = "black", linewidth = gg_pt(1))
)
ugd_save_inline(
{
print(p)
},
file = "mm_module_score.threshold_plot.pdf",
width = 250,
height = 150
)
hs_ms_df <- vroom(hs_ms_file)
hs_ms_df <- hs_ms_df %>%
rename(MS = up_immune_response_ModuleScore) %>%
select(cell_type, sample_name, patient, location, MS) %>%
as_tibble()
hs_stats <- table(hs_ms_df$cell_type, hs_ms_df$MS > 0) %>%
as.data.frame() %>%
rename(cell_type = Var1, MS_active = Var2) %>%
arrange(cell_type, MS_active)
vroom_write(
hs_stats,
file = "hs_module_score.threshold_stats.tsv",
col_names = TRUE, append = FALSE, delim = "\t"
)
p <- ggplot(hs_ms_df, aes(x = MS, color = "All")) +
geom_density(linewidth = gg_pt(1)) +
geom_density(
data = filter(hs_ms_df, cell_type == "Excitatory neuron"),
mapping = aes(x = MS, color = "EN"),
linewidth = gg_pt(1),
inherit.aes = FALSE
) +
geom_vline(xintercept = 0, linetype = "dashed", color = "#B3B3B3", linewidth = gg_pt(1)) +
scale_color_manual(values = c("All" = "#B3B3B3", "EN" = "#030303")) +
scale_x_continuous(expand = expansion(0)) +
scale_y_continuous(expand = expansion(0)) +
labs(
x = "Module Score", y = "Density", color = "Cell Type",
title = "Global biomodal threshold: 0"
) +
theme_classic(base_family = "Arial") +
theme(
plot.title = element_text(size = 12, color = "black", hjust = 0.5),
axis.title = element_text(size = 12, color = "black"),
axis.text.y = element_text(size = 10, color = "black"),
axis.text.x = element_text(size = 10, color = "black"),
legend.title = element_text(size = 10, color = "black"),
legend.text = element_text(size = 10, color = "black"),
axis.line = element_line(color = "black", linewidth = gg_pt(1)),
axis.ticks = element_line(color = "black", linewidth = gg_pt(1))
)
ugd_save_inline(
{
print(p)
},
file = "hs_module_score.threshold_plot.pdf",
width = 250,
height = 150
)3.2.5 GSEA line plot
library(tidyverse)
library(vroom)
library(unigd)
library(clusterProfiler)
library(enrichplot)
gg_pt <- function(x) x / (ggplot2::.pt * 0.75)
work_dir <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/figures_v20260506"
setwd(work_dir)
mm_rds_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/gsea_merged/adata_obs_ModuleScore.up_immune_response_ModuleScore.Q99_threshold.Glioma_active_vs_Glioma_inactive.gseGO.rds"
hs_rds_file <- "/data/users/yangrui/lab_projs/wutingting/hsGliomaAstro_mmHPC_snRNA-seq_v20250916/ana_res/Astro_Glioma/additional_analysis/gsea_merged/adata_with_ModuleScore.up_immune_response_ModuleScore.EN_Active_vs_EN_Inactive.DE.gseGO.rds"
mm_rds <- readRDS(mm_rds_file)
hs_rds <- readRDS(hs_rds_file)
up_ids <- c(8, 11)
down_ids <- c(26, 46)
p <- gseaplot2(
mm_rds,
geneSetID = c(up_ids, down_ids),
subplots = 1:2
)
p[[1]] <- p[[1]] + theme(
text = element_text(family = "Arial", size = 10),
axis.line = element_line(colour = "black", linewidth = gg_pt(1)),
axis.ticks.y = element_line(color = "black", linewidth = gg_pt(1)),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
p[[2]] <- p[[2]] + theme(
text = element_text(family = "Arial", size = 10),
axis.line = element_line(colour = "black", linewidth = gg_pt(1)),
axis.ticks.x = element_line(color = "black", linewidth = gg_pt(1)),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
ugd_save_inline(
{
print(p)
},
file = "mm.select_gsea_terms.lineplot.pdf",
width = 350,
height = 200
)
up_ids <- c(2, 63)
down_ids <- c(14, 18, 29, 55)
p <- gseaplot2(
hs_rds,
geneSetID = c(up_ids, down_ids),
subplots = 1:2
)
p[[1]] <- p[[1]] + theme(
text = element_text(family = "Arial", size = 10),
axis.line = element_line(colour = "black", linewidth = gg_pt(1)),
axis.ticks.y = element_line(color = "black", linewidth = gg_pt(1)),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
p[[2]] <- p[[2]] + theme(
text = element_text(family = "Arial", size = 10),
axis.line = element_line(colour = "black", linewidth = gg_pt(1)),
axis.ticks.x = element_line(color = "black", linewidth = gg_pt(1)),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
ugd_save_inline(
{
print(p)
},
file = "hs.select_gsea_terms.lineplot.pdf",
width = 350,
height = 200
)