# %%
# Log-Normalize scheme
# with all necessary variables regressed out.
# i.e., sequencing depth, mitochondrial content, ribosomal content, etc.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.h5ad"
scheme = "log-normalize.full-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %%
print(adata.var_names[adata.var_names.str.startswith("mt-")])
print(adata.var_names[adata.var_names.str.startswith(("Rps", "Rpl"))])
print(adata.var_names[adata.var_names.str.contains("^Hb[^(p)]")])
# %%
adata.var["MT"] = adata.var_names.str.contains("^mt-", case=False, na=False)
adata.var["RIBO"] = adata.var_names.str.contains("^(Rps|Rpl)", case=False, na=False)
adata.var["HB"] = adata.var_names.str.contains("^Hb[^(p)]", case=False, na=False)
# %%
sc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"], inplace=True)
# %%
sc.set_figure_params(figsize=(6, 6))
sc.pl.scatter(adata, x="total_counts", y="pct_counts_MT")
sc.pl.scatter(adata, x="total_counts", y="n_genes_by_counts")
# %%
sc.set_figure_params(figsize=(4, 4))
sc.pl.violin(adata, "n_genes_by_counts", jitter=0.4)
sc.pl.violin(adata, "total_counts", jitter=0.4)
sc.pl.violin(adata, "pct_counts_MT", jitter=0.4)
sc.pl.violin(adata, "pct_counts_RIBO", jitter=0.4)
sc.pl.violin(adata, "pct_counts_HB", jitter=0.4)
# %%
sc.set_figure_params(figsize=(4, 4))
sc.pl.violin(adata, "n_genes_by_counts", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "total_counts", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "pct_counts_MT", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "pct_counts_RIBO", groupby="scDblFinder.class", jitter=0.4)
sc.pl.violin(adata, "pct_counts_HB", groupby="scDblFinder.class", jitter=0.4)
# %%
adata = adata[:, adata.var["n_cells_by_counts"] >= 10]
adata.shape
# %%
sc.pp.calculate_qc_metrics(adata, qc_vars=["MT", "RIBO", "HB"], inplace=True)
# %%
adata_out_file = "data/neuron_counts.qc_filtered.h5ad"
adata.write(adata_out_file)
# %%
adata.layers["counts"] = adata.X.copy()
# %%
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# %%
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
nhvgs = 2000
sc.pp.highly_variable_genes(
adata, n_top_genes=nhvgs, flavor="seurat_v3", layer="counts"
)
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %%
sc.pl.highly_variable_genes(adata, log=True, show=False)
fig = plt.gcf()
fig.suptitle(
f"Highly Variable Genes - {sum(adata.var['highly_variable'])}/{nhvgs}",
fontsize=20,
y=1,
)
fig.savefig(f"figures/highly_variable_genes.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
rsc.pp.regress_out(adata, keys=["total_counts", "pct_counts_MT", "pct_counts_RIBO"])
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()1 Introduction
For generated results, see here.
2 Log-Normalize scheme with regressing out sequencing depth
3 Log-Normalize scheme without regressing out sequencing depth
# %%
# Log-Normalize scheme
# without sequencing depth regressed out.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad"
scheme = "log-normalize.no-depth-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %%
adata.layers["counts"] = adata.X.copy()
# %%
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# %%
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
nhvgs = 2000
sc.pp.highly_variable_genes(
adata, n_top_genes=nhvgs, flavor="seurat_v3", layer="counts"
)
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %%
sc.pl.highly_variable_genes(adata, log=True, show=False)
fig = plt.gcf()
fig.suptitle(
f"Highly Variable Genes - {sum(adata.var['highly_variable'])}/{nhvgs}",
fontsize=20,
y=1,
)
fig.savefig(f"figures/highly_variable_genes.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
rsc.pp.regress_out(adata, keys=["pct_counts_MT", "pct_counts_RIBO"])
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()4 Log-Normalize-Scale scheme without regressing out sequencing depth
# %%
# Log-Normalize-Scale scheme
# without sequencing depth regressed out.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad"
scheme = "log-normalize-scale.no-depth-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %%
adata.layers["counts"] = adata.X.copy()
# %%
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# %%
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
nhvgs = 2000
sc.pp.highly_variable_genes(
adata, n_top_genes=nhvgs, flavor="seurat_v3", layer="counts"
)
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %%
sc.pl.highly_variable_genes(adata, log=True, show=False)
fig = plt.gcf()
fig.suptitle(
f"Highly Variable Genes - {sum(adata.var['highly_variable'])}/{nhvgs}",
fontsize=20,
y=1,
)
fig.savefig(f"figures/highly_variable_genes.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
rsc.pp.regress_out(adata, keys=["pct_counts_MT", "pct_counts_RIBO"])
# %%
rsc.pp.scale(adata, max_value=10)
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()5 Log-Normalize-Scale scheme with regressing out sequencing depth
# %%
# Log-Normalize-Scale scheme
# with all necessary variables regressed out.
# i.e., sequencing depth, mitochondrial content, ribosomal content, etc.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad"
scheme = "log-normalize-scale.full-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %%
adata.layers["counts"] = adata.X.copy()
# %%
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# %%
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
nhvgs = 2000
sc.pp.highly_variable_genes(
adata, n_top_genes=nhvgs, flavor="seurat_v3", layer="counts"
)
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %%
sc.pl.highly_variable_genes(adata, log=True, show=False)
fig = plt.gcf()
fig.suptitle(
f"Highly Variable Genes - {sum(adata.var['highly_variable'])}/{nhvgs}",
fontsize=20,
y=1,
)
fig.savefig(f"figures/highly_variable_genes.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
rsc.pp.regress_out(adata, keys=["total_counts", "pct_counts_MT", "pct_counts_RIBO"])
# %%
rsc.pp.scale(adata, max_value=10)
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()6 Seurat-SCTransform scheme without regressing out sequencing depth
# perform Seurat SCTransform
library(anndataR)
library(Seurat)
library(tidyverse)
library(YRUtils)
library(vroom)
h5ad_file <- "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad"
# set continuous variables to regress
# regressing out sequencing depth again is not recommended
# as it can lead to overfitting
# SCTransform is designed to handle sequencing depth
vars_to_regress <- c("pct_counts_MT", "pct_counts_RIBO")
n_hvgs <- 2000
scheme <- "seurat-sctransform.no-depth-regression"
seu <- read_h5ad(h5ad_file, as = "Seurat")
# save raw counts in `X` layer to `counts` layer
LayerData(seu, assay = "RNA", layer = "counts") <- LayerData(seu, assay = "RNA", layer = "X")
# set `counts` as default layer
DefaultLayer(seu[["RNA"]]) <- "counts"
# remove `X` layer
LayerData(seu, assay = "RNA", layer = "X") <- NULL
seu <- SCTransform(
seu,
vars.to.regress = vars_to_regress,
variable.features.n = n_hvgs,
do.correct.umi = TRUE,
do.scale = FALSE,
do.center = TRUE,
vst.flavor = "v2",
return.only.var.genes = FALSE
)
DefaultAssay(seu) <- "RNA"
# save all layers in `SCT` assay to `RNA` assay
LayerData(seu, assay = "RNA", layer = "sct.corrected.counts") <- LayerData(seu, assay = "SCT", layer = "counts")
LayerData(seu, assay = "RNA", layer = "sct.corrected.data") <- LayerData(seu, assay = "SCT", layer = "data")
LayerData(seu, assay = "RNA", layer = "sct.corrected.scale.data") <- LayerData(seu, assay = "SCT", layer = "scale.data")
# mark highly variable features
seu[["RNA"]]@meta.data[["highly_variable"]] <- seu[["RNA"]]@meta.data[["gene_name"]] %in% seu[["SCT"]]@var.features
adata <- as_AnnData(
seu,
assay_name = "RNA",
x_mapping = "corrected.scale.data",
layers_mapping = names(seu[["RNA"]]@layers),
obs_mapping = names(seu@meta.data),
var_mapping = names(seu[["RNA"]]@meta.data)
)
adata$uns$highly_variable_features <- seu[["SCT"]]@var.features
adata$write_h5ad(paste0(h5ad_file, ".", scheme, ".h5ad"))
noise_genes <- seu[["SCT"]]@var.features[grepl(seu[["SCT"]]@var.features, pattern = "^(mt-|Rps|Rpl)", ignore.case = TRUE)]
# plot highly variable feature distribution
# using mean-variance relationship
DefaultAssay(seu) <- "SCT"
p <- VariableFeaturePlot(seu, log = TRUE) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey") +
scale_y_log10() +
labs(
x = "log10(Geometric Mean of Expression)",
y = "log10(Residual Variance)",
title = paste0("Highly Variable Genes - ", n_hvgs - length(noise_genes), "/", n_hvgs)
)
ppreview(
plot = p,
file = file.path(dirname(dirname(h5ad_file)), "figures", paste0("highly_variable_genes.", scheme, ".pdf")),
width = 600,
height = 300
)
df <- p$data
df$gene_name <- row.names(df)
vroom_write(
df,
file = file.path(dirname(dirname(h5ad_file)), "figures", paste0("highly_variable_genes.", scheme, ".tsv")),
delim = "\t",
col_names = TRUE,
append = FALSE
)
# %%
# Seurat-SCTransform scheme
# without sequencing depth regressed out.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad.seurat-sctransform.no-depth-regression.h5ad"
scheme = "seurat-sctransform.no-depth-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %%
adata.X = adata.layers["counts"].copy()
# %%
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# %% use log-normalized data for visualization
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
adata.X = adata.layers["sct.corrected.scale.data"].copy()
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()7 Seurat-SCTransform scheme with regressing out sequencing depth
# perform Seurat SCTransform
library(anndataR)
library(Seurat)
library(tidyverse)
library(YRUtils)
library(vroom)
h5ad_file <- "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad"
# set continuous variables to regress
# regressing out sequencing depth again is not recommended
# as it can lead to overfitting
# SCTransform is designed to handle sequencing depth
vars_to_regress <- c("total_counts", "pct_counts_MT", "pct_counts_RIBO")
n_hvgs <- 2000
scheme <- "seurat-sctransform.full-regression"
seu <- read_h5ad(h5ad_file, as = "Seurat")
# save raw counts in `X` layer to `counts` layer
LayerData(seu, assay = "RNA", layer = "counts") <- LayerData(seu, assay = "RNA", layer = "X")
# set `counts` as default layer
DefaultLayer(seu[["RNA"]]) <- "counts"
# remove `X` layer
LayerData(seu, assay = "RNA", layer = "X") <- NULL
seu <- SCTransform(
seu,
vars.to.regress = vars_to_regress,
variable.features.n = n_hvgs,
do.correct.umi = TRUE,
do.scale = FALSE,
do.center = TRUE,
vst.flavor = "v2",
return.only.var.genes = FALSE
)
DefaultAssay(seu) <- "RNA"
# save all layers in `SCT` assay to `RNA` assay
LayerData(seu, assay = "RNA", layer = "sct.corrected.counts") <- LayerData(seu, assay = "SCT", layer = "counts")
LayerData(seu, assay = "RNA", layer = "sct.corrected.data") <- LayerData(seu, assay = "SCT", layer = "data")
LayerData(seu, assay = "RNA", layer = "sct.corrected.scale.data") <- LayerData(seu, assay = "SCT", layer = "scale.data")
# mark highly variable features
seu[["RNA"]]@meta.data[["highly_variable"]] <- seu[["RNA"]]@meta.data[["gene_name"]] %in% seu[["SCT"]]@var.features
adata <- as_AnnData(
seu,
assay_name = "RNA",
x_mapping = "corrected.scale.data",
layers_mapping = names(seu[["RNA"]]@layers),
obs_mapping = names(seu@meta.data),
var_mapping = names(seu[["RNA"]]@meta.data)
)
adata$uns$highly_variable_features <- seu[["SCT"]]@var.features
adata$write_h5ad(paste0(h5ad_file, ".", scheme, ".h5ad"))
noise_genes <- seu[["SCT"]]@var.features[grepl(seu[["SCT"]]@var.features, pattern = "^(mt-|Rps|Rpl)", ignore.case = TRUE)]
# plot highly variable feature distribution
# using mean-variance relationship
DefaultAssay(seu) <- "SCT"
p <- VariableFeaturePlot(seu, log = TRUE) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey") +
scale_y_log10() +
labs(
x = "log10(Geometric Mean of Expression)",
y = "log10(Residual Variance)",
title = paste0("Highly Variable Genes - ", n_hvgs - length(noise_genes), "/", n_hvgs)
)
ppreview(
plot = p,
file = file.path(dirname(dirname(h5ad_file)), "figures", paste0("highly_variable_genes.", scheme, ".pdf")),
width = 600,
height = 300
)
df <- p$data
df$gene_name <- row.names(df)
vroom_write(
df,
file = file.path(dirname(dirname(h5ad_file)), "figures", paste0("highly_variable_genes.", scheme, ".tsv")),
delim = "\t",
col_names = TRUE,
append = FALSE
)
# %%
# Seurat-SCTransform scheme
# with all necessary variables regressed out.
# i.e., sequencing depth, mitochondrial content, ribosomal content, etc.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad.seurat-sctransform.full-regression.h5ad"
scheme = "seurat-sctransform.full-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %%
adata.X = adata.layers["counts"].copy()
# %%
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# %% use log-normalized data for visualization
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
adata.X = adata.layers["sct.corrected.scale.data"].copy()
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()8 Pearson-Residuals scheme without regressing out sequencing depth
# %%
# Pearson-Residuals scheme
# without sequencing depth regressed out.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad"
scheme = "pearson-residuals.no-depth-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
adata.layers["counts"] = adata.X.copy()
# %%
rsc.pp.normalize_total(adata, target_sum=1e4)
rsc.pp.log1p(adata)
# %%
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
import math
nhvgs = 2000
adata.X = adata.layers["counts"]
# use Seurat clipping threshold (sqrt(n/30))
# not default (sqrt(n))
rsc.pp.highly_variable_genes(
adata,
n_top_genes=nhvgs,
flavor="pearson_residuals",
clip=math.sqrt(adata.shape[0] / 30),
)
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %% visualize HVGs based on mean expression and residual variance
from plotnine import *
var_df = adata.var.copy()
var_df["highly_variable"] = var_df["highly_variable"].astype(str)
p = (
ggplot(
var_df, aes(x="mean_counts", y="residual_variances", color="highly_variable")
)
+ geom_point(size=0.5, alpha=0.25)
+ geom_hline(yintercept=1, linetype="dashed", alpha=0.5)
+ scale_x_log10()
+ scale_y_log10()
+ scale_color_manual(
values={"False": "#bdbdbd", "True": "#e41a1c"},
name="HVG Status",
labels={"False": "Non-HVG", "True": "Selected HVG"},
)
+ labs(
x="Mean Expression (log10)",
y="Residual Variance (log10)",
title="SCTransform: Residual Variance vs Mean Expression",
)
+ theme_classic()
+ theme(
figure_size=(6, 4),
legend_position="right",
plot_title=element_text(hjust=0.5),
panel_grid=element_blank(),
)
)
p.save(f"figures/highly_variable_genes.{scheme}.pdf", dpi=600)
p
# %%
import math
adata.X = adata.layers["counts"]
# use Seurat clipping threshold (sqrt(n/30))
# not default (sqrt(n))
rsc.pp.normalize_pearson_residuals(adata, clip=math.sqrt(adata.shape[0] / 30))
adata.X
# %%
rsc.pp.regress_out(adata, keys=["pct_counts_MT", "pct_counts_RIBO"])
# %%
# rsc.pp.scale(adata, max_value=10)
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()9 Pearson-Residuals scheme with regressing out sequencing depth
# %%
# Pearson-Residuals scheme
# with all necessary variables regressed out.
# i.e., sequencing depth, mitochondrial content, ribosomal content, etc.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import gc
from pathlib import Path
import scanpy as sc
import rapids_singlecell as rsc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# set memory management type
import rmm
import cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator
rmm.reinitialize(
managed_memory=False,
pool_allocator=True,
devices=[0],
initial_pool_size=12 * 1024**3,
)
cp.cuda.set_allocator(rmm_cupy_allocator)
# %%
sc.set_figure_params(dpi=600, facecolor="white")
# %%
adata_file = "/data/users/yangrui/lab_projs/test/data/neuron_counts.qc_filtered.h5ad"
scheme = "pearson-residuals.full-regression"
# %%
work_dir = Path(adata_file).parent.parent
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
# %%
adata = sc.read_h5ad(adata_file)
# %% load data into GPU memory
rsc.get.anndata_to_GPU(adata)
# %%
adata.layers["counts"] = adata.X.copy()
# %%
rsc.pp.normalize_total(adata, target_sum=1e4)
rsc.pp.log1p(adata)
# %%
adata.layers["data"] = adata.X.copy()
# %%
adata.raw = adata
# %%
import math
nhvgs = 2000
adata.X = adata.layers["counts"]
# use Seurat clipping threshold (sqrt(n/30))
# not default (sqrt(n))
rsc.pp.highly_variable_genes(
adata,
n_top_genes=nhvgs,
flavor="pearson_residuals",
clip=math.sqrt(adata.shape[0] / 30),
)
# %%
import re
noise_genes = [
g for g in adata.var_names if re.match(r"^(mt-|Rps|Rpl)", g, re.IGNORECASE)
]
display(adata.var.loc[adata.var.index.isin(noise_genes) & adata.var["highly_variable"]])
# %%
adata.var.loc[adata.var.index.isin(noise_genes), "highly_variable"] = False
print(
f"# highly variable genes after removing MT/RIBO/HB: {sum(adata.var['highly_variable'])}"
)
# %% visualize HVGs based on mean expression and residual variance
from plotnine import *
var_df = adata.var.copy()
var_df["highly_variable"] = var_df["highly_variable"].astype(str)
p = (
ggplot(
var_df, aes(x="mean_counts", y="residual_variances", color="highly_variable")
)
+ geom_point(size=0.5, alpha=0.25)
+ geom_hline(yintercept=1, linetype="dashed", alpha=0.5)
+ scale_x_log10()
+ scale_y_log10()
+ scale_color_manual(
values={"False": "#bdbdbd", "True": "#e41a1c"},
name="HVG Status",
labels={"False": "Non-HVG", "True": "Selected HVG"},
)
+ labs(
x="Mean Expression (log10)",
y="Residual Variance (log10)",
title="SCTransform: Residual Variance vs Mean Expression",
)
+ theme_classic()
+ theme(
figure_size=(6, 4),
legend_position="right",
plot_title=element_text(hjust=0.5),
panel_grid=element_blank(),
)
)
p.save(f"figures/highly_variable_genes.{scheme}.pdf", dpi=600)
p
# %%
import math
adata.X = adata.layers["counts"]
# use Seurat clipping threshold (sqrt(n/30))
# not default (sqrt(n))
rsc.pp.normalize_pearson_residuals(adata, clip=math.sqrt(adata.shape[0] / 30))
adata.X
# %%
rsc.pp.regress_out(adata, keys=["total_counts", "pct_counts_MT", "pct_counts_RIBO"])
# %%
# rsc.pp.scale(adata, max_value=10)
# %%
rsc.tl.pca(adata, n_comps=100, use_highly_variable=True)
# %% elbow plot
sc.set_figure_params(figsize=(22, 8))
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=100)
# %% customized elbow plot
import pandas as pd
import numpy as np
from plotnine import *
npcs = 100
var_ratio = adata.uns["pca"]["variance_ratio"][:npcs]
df_elbow = pd.DataFrame(
{"PC": np.arange(1, len(var_ratio) + 1), "Variance_Ratio": var_ratio}
)
p = (
ggplot(df_elbow, aes(x="PC", y="Variance_Ratio"))
+ geom_line(color="#e41a1c", size=1)
+ geom_point(color="#e41a1c", size=1.5)
+ scale_y_log10()
+ scale_x_continuous(breaks=np.arange(0, 101, 10))
+ geom_vline(xintercept=np.arange(20, 51, 10), linetype="dashed", color="blue")
+ labs(
x="Principal Component",
y="Variance Ratio (log10)",
title="PCA Elbow Plot - Variance Explained",
)
+ theme_classic()
+ theme(
figure_size=(10, 5),
plot_title=element_text(hjust=0.5),
axis_text=element_text(color="black"),
)
)
p.save(f"figures/pca_variance_ratio.{scheme}.pdf", dpi=600)
p
# %%
rsc.get.anndata_to_CPU(adata)
# %% check spearman correlations between PCs and continuous technical metrics
import scanpy as sc
import pandas as pd
from scipy.stats import spearmanr
from plotnine import *
npcs = 50
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
pcs = adata.obsm["X_pca"][:, :npcs]
obs_data = adata.obs[eval_metrics]
corr_list = []
for i in range(npcs):
pc_name = f"PC{i+1}"
for eval_metric in eval_metrics:
r, _ = spearmanr(pcs[:, i], obs_data[eval_metric])
corr_list.append(
{"PC": pc_name, "Metric": eval_metric, "Corr": r, "AbsCorr": abs(r)}
)
corr_df = pd.DataFrame(corr_list)
pc_order = [f"PC{i+1}" for i in range(npcs)]
corr_df["PC"] = pd.Categorical(corr_df["PC"], categories=pc_order[::-1], ordered=True)
corr_df["Metric"] = pd.Categorical(
corr_df["Metric"],
categories=eval_metrics,
ordered=True,
)
p = (
ggplot(corr_df, aes(x="Metric", y="PC", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=8)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="Technical Metrics",
y="Principal Components",
fill="Spearman's Rho",
)
+ theme(
figure_size=(4, 10),
axis_text_x=element_text(angle=45, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/pc_tech_correlations.{scheme}.pdf", dpi=600)
p
# %% pairwise spearman correlations between continuous metrics
import pandas as pd
from plotnine import *
eval_metrics = [
"total_counts",
"n_genes_by_counts",
"pct_counts_MT",
"pct_counts_RIBO",
]
obs_data = adata.obs[eval_metrics]
corr_matrix = obs_data.corr(method="spearman")
corr_df = corr_matrix.reset_index().melt(id_vars="index")
corr_df.columns = ["Metric_Y", "Metric_X", "Corr"]
corr_df["Metric_Y"] = pd.Categorical(
corr_df["Metric_Y"], categories=eval_metrics[::-1], ordered=True
)
corr_df["Metric_X"] = pd.Categorical(
corr_df["Metric_X"], categories=eval_metrics, ordered=True
)
p = (
ggplot(corr_df, aes(x="Metric_X", y="Metric_Y", fill="Corr"))
+ geom_tile(color="white")
+ geom_text(aes(label="Corr.round(2)"), size=10)
+ scale_fill_gradient2(
low="#313695", mid="#ffffbf", high="#a50026", midpoint=0, limits=[-1, 1]
)
+ theme_minimal()
+ labs(
x="",
y="",
fill="Spearman's Rho",
title="Pairwise Correlations",
)
+ theme(
aspect_ratio=1,
figure_size=(5, 5),
axis_text_x=element_text(angle=90, hjust=1),
panel_grid_major=element_blank(),
)
)
p.save(f"figures/tech_correlations.{scheme}.pdf", dpi=600)
p
# %%
npcs = 50
# 关键参数:
# n_neighbors:全局/局部结构平衡
# use_rep/n_pcs
# algorithm:近邻搜索算法-速度与精度平衡
# method:计算连通性权重,如 'gauss' 计算的权重随距离增加呈指数级衰减
# metric:距离度量
rsc.pp.neighbors(
adata,
n_neighbors=15,
n_pcs=npcs,
algorithm="brute",
use_rep="X_pca",
key_added="neighbors",
method="umap",
)
# %%
# key parameters:
# min_dist: The effective minimum distance between embedded points.
# Smaller values will result in a more clustered/clumped embedding
# where nearby points on the manifold are drawn closer together,
# while larger values will result on a more even dispersal of points.
# 让同类细胞聚集更紧或更松。
# spread: The effective scale of embedded points.
# 整体缩小或放大尺度。
rsc.tl.umap(
adata, min_dist=0.1, spread=1, neighbors_key="neighbors", key_added="X_umap"
)
# %%
rsc.tl.leiden(adata, resolution=1, key_added="leiden_res1", neighbors_key="neighbors")
# %%
sc.set_figure_params(figsize=(8, 8))
target_col = "leiden_res1"
sc.pl.umap(
adata,
color=target_col,
legend_loc="on data",
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/{target_col}.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()
# %%
group_keys = [
"leiden_res1",
"n_genes_by_counts",
"total_counts",
"pct_counts_MT",
"pct_counts_RIBO",
"scDblFinder.class",
]
sc.set_figure_params(figsize=(8, 8))
sc.pl.umap(
adata,
color=group_keys,
ncols=2,
wspace=0.25,
hspace=0.25,
palette="tab20",
show=False,
)
fig = plt.gcf()
fig.savefig(f"figures/qc_check.umap.{scheme}.pdf", dpi=600, bbox_inches="tight")
plt.show()