kpmp_snrnaseq_prep

[1]:
import numpy as np
import pandas as pd
import scanpy as sc

from collections import Counter

[2]:
adata = sc.read_h5ad("kpmp.h5ad")
[3]:
adata.obs.columns
[3]:
Index(['nCount_RNA', 'nFeature_RNA', 'mypatients', 'percent.er', 'percent.mt',
       'degen.score', 'aEpi.score', 'aStr.score', 'cyc.score',
       'matrisome.score', 'collagen.score', 'glycoprotein.score',
       'proteoglycan.score', 'S.Score', 'G2M.Score', 'experiment', 'specimen',
       'condition.long', 'condition.l1', 'myconditions', 'donor_id',
       'region.l1', 'region.l2', 'percent.cortex', 'percent.medulla',
       'sample_tissue_type', 'id', 'pagoda_k100_infomap_coembed',
       'subclass.full', 'subclass.l3', 'subclass.l2', 'subclass.l1',
       'state.l2', 'state', 'class', 'structure', 'disease_ontology_term_id',
       'sex_ontology_term_id', 'development_stage_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'eGFR', 'BMI',
       'diabetes_history', 'hypertension', 'tissue_ontology_term_id',
       'organism_ontology_term_id', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'is_primary_data', 'suspension_type',
       'tissue_type', 'mycelltypes', 'assay', 'disease', 'organism', 'sex',
       'tissue', 'self_reported_ethnicity', 'development_stage',
       'observation_joinid', 'mysubtypes'],
      dtype='object')
[4]:
celltype_col="subclass.l1"
condition_col = "myconditions"
[5]:
outname = "ctcond"
[6]:
sc.pl.umap(adata, color=["subclass.l1"])
_images/kpmp_snrnaseq_prep_5_0.png
[4]:
set(adata.obs[celltype_col])
[4]:
{'ATL',
 'CNT',
 'DCT',
 'DTL',
 'EC',
 'FIB',
 'IC',
 'IMM',
 'NEU',
 'PC',
 'PEC',
 'POD',
 'PT',
 'PapE',
 'TAL',
 'VSM/P'}
[6]:
adata.obs["ct_condition"] = adata.obs[celltype_col].astype(str) + "_" + adata.obs[condition_col].astype(str)
adata.obs["ct_condition"] = pd.Categorical(adata.obs["ct_condition"])
[7]:
set(adata.obs["ct_condition"])
[7]:
{'ATL_AKI',
 'ATL_COV_AKI',
 'ATL_DKD',
 'ATL_H_CKD',
 'ATL_Ref',
 'CNT_AKI',
 'CNT_COV_AKI',
 'CNT_DKD',
 'CNT_H_CKD',
 'CNT_Ref',
 'DCT_AKI',
 'DCT_COV_AKI',
 'DCT_DKD',
 'DCT_H_CKD',
 'DCT_Ref',
 'DTL_AKI',
 'DTL_COV_AKI',
 'DTL_DKD',
 'DTL_H_CKD',
 'DTL_Ref',
 'EC_AKI',
 'EC_COV_AKI',
 'EC_DKD',
 'EC_H_CKD',
 'EC_Ref',
 'FIB_AKI',
 'FIB_COV_AKI',
 'FIB_DKD',
 'FIB_H_CKD',
 'FIB_Ref',
 'IC_AKI',
 'IC_COV_AKI',
 'IC_DKD',
 'IC_H_CKD',
 'IC_Ref',
 'IMM_AKI',
 'IMM_COV_AKI',
 'IMM_DKD',
 'IMM_H_CKD',
 'IMM_Ref',
 'NEU_AKI',
 'NEU_COV_AKI',
 'NEU_DKD',
 'NEU_H_CKD',
 'NEU_Ref',
 'PC_AKI',
 'PC_COV_AKI',
 'PC_DKD',
 'PC_H_CKD',
 'PC_Ref',
 'PEC_AKI',
 'PEC_COV_AKI',
 'PEC_DKD',
 'PEC_H_CKD',
 'PEC_Ref',
 'POD_AKI',
 'POD_COV_AKI',
 'POD_DKD',
 'POD_H_CKD',
 'POD_Ref',
 'PT_AKI',
 'PT_COV_AKI',
 'PT_DKD',
 'PT_H_CKD',
 'PT_Ref',
 'PapE_AKI',
 'PapE_COV_AKI',
 'PapE_DKD',
 'PapE_H_CKD',
 'PapE_Ref',
 'TAL_AKI',
 'TAL_COV_AKI',
 'TAL_DKD',
 'TAL_H_CKD',
 'TAL_Ref',
 'VSM/P_AKI',
 'VSM/P_COV_AKI',
 'VSM/P_DKD',
 'VSM/P_H_CKD',
 'VSM/P_Ref'}
[8]:
sc.pl.umap(adata, color=["ct_condition"])
/mnt/biosoft/software/python/3.11/lib/python3.11/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
_images/kpmp_snrnaseq_prep_9_1.png
[12]:
import matplotlib.pyplot as plt
fig, axs = plt.subplots(4, 4, figsize=(14, 14))
faxs = axs.flatten()

df_labels = adata.obs.loc[:,(celltype_col, condition_col)]
df_labels["count"] = 1

for gi, group in enumerate(set(df_labels[celltype_col])):
    gdf = df_labels[df_labels[celltype_col] == group]

    gdf.groupby([condition_col]).sum().plot(
    kind='pie', y='count', autopct='%1.0f%%', startangle=60, ax=faxs[gi])
    faxs[gi].set_title(group)
    faxs[gi].get_legend().remove()

#faxs[gi+1].set_visible(False)

/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
_images/kpmp_snrnaseq_prep_10_1.png
[13]:
index2cellname = {i: x for i,x in enumerate(adata.obs.index)}
col2genename = {i: x for i,x in enumerate(adata.var.index)}
cellname2group = {cn: cg for cn, cg in zip(adata.obs.index, adata.obs["ct_condition"])}

group2cellcount = Counter()
for cn in cellname2group:
    group2cellcount[cellname2group[cn]] += 1
[14]:
group2cellcount
[14]:
Counter({'PT_Ref': 19433,
         'TAL_H_CKD': 15470,
         'TAL_Ref': 13258,
         'PT_AKI': 10294,
         'PC_Ref': 10030,
         'PT_DKD': 9934,
         'EC_Ref': 8876,
         'FIB_Ref': 8260,
         'TAL_COV_AKI': 6747,
         'PT_H_CKD': 6238,
         'PT_COV_AKI': 5773,
         'IC_Ref': 4806,
         'TAL_DKD': 4716,
         'IMM_Ref': 4512,
         'DTL_Ref': 4216,
         'DCT_Ref': 3812,
         'FIB_AKI': 3517,
         'CNT_Ref': 3415,
         'TAL_AKI': 3348,
         'ATL_Ref': 2865,
         'PC_H_CKD': 2863,
         'EC_H_CKD': 2826,
         'EC_AKI': 2716,
         'IMM_AKI': 2608,
         'FIB_DKD': 2233,
         'EC_DKD': 2049,
         'DTL_H_CKD': 2037,
         'DCT_AKI': 1979,
         'VSM/P_Ref': 1962,
         'FIB_H_CKD': 1920,
         'IMM_DKD': 1822,
         'CNT_AKI': 1810,
         'IC_AKI': 1576,
         'PC_DKD': 1461,
         'IMM_H_CKD': 1316,
         'DCT_COV_AKI': 1284,
         'IC_H_CKD': 1253,
         'IC_DKD': 1185,
         'POD_Ref': 1138,
         'IC_COV_AKI': 1092,
         'EC_COV_AKI': 1028,
         'IMM_COV_AKI': 1015,
         'PC_AKI': 929,
         'DCT_DKD': 815,
         'CNT_COV_AKI': 814,
         'VSM/P_H_CKD': 723,
         'DTL_DKD': 717,
         'PC_COV_AKI': 705,
         'CNT_DKD': 693,
         'DTL_AKI': 627,
         'VSM/P_AKI': 594,
         'PEC_Ref': 527,
         'VSM/P_DKD': 526,
         'POD_DKD': 494,
         'POD_AKI': 480,
         'DTL_COV_AKI': 371,
         'PEC_AKI': 357,
         'PapE_Ref': 322,
         'DCT_H_CKD': 270,
         'FIB_COV_AKI': 251,
         'PEC_DKD': 247,
         'CNT_H_CKD': 232,
         'POD_H_CKD': 232,
         'PEC_COV_AKI': 217,
         'POD_COV_AKI': 211,
         'PEC_H_CKD': 77,
         'VSM/P_COV_AKI': 43,
         'ATL_H_CKD': 37,
         'NEU_Ref': 35,
         'ATL_AKI': 24,
         'NEU_DKD': 16,
         'ATL_COV_AKI': 16,
         'ATL_DKD': 15,
         'NEU_H_CKD': 9,
         'NEU_AKI': 7,
         'NEU_COV_AKI': 5,
         'PapE_COV_AKI': 3,
         'PapE_DKD': 2,
         'PapE_H_CKD': 1,
         'PapE_AKI': 1})
[16]:

def percentile(n): def percentile_(x): return np.percentile(x, n) percentile_.__name__ = 'percentile_%s' % n return percentile_ def sd(): def sd_(x): return np.std(x) sd_.__name__ = "sd" return sd_ def enum(): def enum_(x): return x.size enum_.__name__ = 'num' return enum_

[17]:
coo = adata.X.tocoo(copy=False)
genedf = pd.DataFrame({'rowidx': coo.row, 'col': coo.col, 'data': coo.data})[['rowidx', 'col', 'data']]
[18]:
genedf["cellname"] = genedf.rowidx.map(index2cellname)
genedf["gene"] = genedf.col.map(col2genename)
genedf["group"] = genedf.cellname.map(cellname2group)
genedf
[18]:
rowidx col data cellname gene group
0 0 0 1.0 KB1_AAACCCAAGCCGATAG A1BG PT_DKD
1 0 20 3.0 KB1_AAACCCAAGCCGATAG AAK1 PT_DKD
2 0 21 1.0 KB1_AAACCCAAGCCGATAG AAMDC PT_DKD
3 0 37 2.0 KB1_AAACCCAAGCCGATAG ABCA1 PT_DKD
4 0 40 1.0 KB1_AAACCCAAGCCGATAG ABCA13 PT_DKD
... ... ... ... ... ... ...
379664051 200337 29009 1.0 KBCVD4_TTTGTTGAGACCCGCT-1 ZNF529 EC_COV_AKI
379664052 200337 29150 1.0 KBCVD4_TTTGTTGAGACCCGCT-1 ZNF706 EC_COV_AKI
379664053 200337 29189 1.0 KBCVD4_TTTGTTGAGACCCGCT-1 ZNF77 EC_COV_AKI
379664054 200337 29199 1.0 KBCVD4_TTTGTTGAGACCCGCT-1 ZNF780A EC_COV_AKI
379664055 200337 29258 1.0 KBCVD4_TTTGTTGAGACCCGCT-1 ZNFX1 EC_COV_AKI

379664056 rows × 6 columns

[19]:
genedf2 = genedf[genedf.gene == "ALB"]
[20]:
genedf2
[20]:
rowidx col data cellname gene group
7185 3 8155 1.0 KB1_AAACCCAGTAATGCGG ALB TAL_DKD
104481 51 8155 1.0 KB1_AAAGTCCCACGAGAAC ALB PT_DKD
156583 74 8155 2.0 KB1_AACAACCAGCAGGCAT ALB PT_DKD
670056 325 8155 1.0 KB1_AATGCCACAGAGTTGG ALB PT_DKD
705591 345 8155 1.0 KB1_AATGGCTTCTATCACT ALB EC_DKD
... ... ... ... ... ... ...
378887860 200029 8155 1.0 KBCVD4_TGCTCCAAGAGGCTGT-1 ALB TAL_COV_AKI
379078996 200106 8155 1.0 KBCVD4_TGTCCACCAAGAGCTG-1 ALB TAL_COV_AKI
379131114 200126 8155 1.0 KBCVD4_TGTGTGACAGTCTACA-1 ALB TAL_COV_AKI
379489805 200265 8155 1.0 KBCVD4_TTGGGCGTCTCGCCTA-1 ALB IMM_COV_AKI
379646947 200331 8155 1.0 KBCVD4_TTTGGAGTCGAGTACT-1 ALB TAL_COV_AKI

2380 rows × 6 columns

[21]:
aggdf = genedf.groupby(["group", "gene"])["data"].aggregate(["min", percentile(25), np.median, percentile(75), "max", "mean", enum(), sd()])
aggdf

[21]:
min percentile_25 median percentile_75 max mean num sd
group gene
ATL_AKI A1BG 1.0 1.00 1.0 1.00 1.0 1.000000 1 0.000000
A1BG-AS1 1.0 1.00 1.0 1.00 1.0 1.000000 2 0.000000
A2M 1.0 1.00 1.0 1.00 1.0 1.000000 1 0.000000
A2ML1-AS1 1.0 1.00 1.0 1.00 1.0 1.000000 1 0.000000
A4GALT 1.0 1.25 1.5 1.75 2.0 1.500000 2 0.500000
... ... ... ... ... ... ... ... ... ...
VSM/P_Ref ZYG11A 1.0 1.00 1.0 1.00 1.0 1.000000 21 0.000000
ZYG11B 1.0 1.00 1.0 1.00 4.0 1.275773 388 0.603880
ZYX 1.0 1.00 1.0 1.00 3.0 1.053571 112 0.261837
ZZEF1 1.0 1.00 1.0 2.00 6.0 1.369478 498 0.717000
ZZZ3 1.0 1.00 1.0 2.00 8.0 1.798799 666 1.188685

1614358 rows × 8 columns

[22]:
aggdf["group_cells"] = aggdf.index.get_level_values(0).map(group2cellcount)
aggdf["perc_expr"] = aggdf.num / aggdf.group_cells
aggdf

[22]:
min percentile_25 median percentile_75 max mean num sd group_cells perc_expr
group gene
ATL_AKI A1BG 1.0 1.00 1.0 1.00 1.0 1.000000 1 0.000000 24 0.041667
A1BG-AS1 1.0 1.00 1.0 1.00 1.0 1.000000 2 0.000000 24 0.083333
A2M 1.0 1.00 1.0 1.00 1.0 1.000000 1 0.000000 24 0.041667
A2ML1-AS1 1.0 1.00 1.0 1.00 1.0 1.000000 1 0.000000 24 0.041667
A4GALT 1.0 1.25 1.5 1.75 2.0 1.500000 2 0.500000 24 0.083333
... ... ... ... ... ... ... ... ... ... ... ...
VSM/P_Ref ZYG11A 1.0 1.00 1.0 1.00 1.0 1.000000 21 0.000000 1962 0.010703
ZYG11B 1.0 1.00 1.0 1.00 4.0 1.275773 388 0.603880 1962 0.197757
ZYX 1.0 1.00 1.0 1.00 3.0 1.053571 112 0.261837 1962 0.057085
ZZEF1 1.0 1.00 1.0 2.00 6.0 1.369478 498 0.717000 1962 0.253823
ZZZ3 1.0 1.00 1.0 2.00 8.0 1.798799 666 1.188685 1962 0.339450

1614358 rows × 10 columns

[23]:
aggdf.to_csv("expression_{}_mean_df.tsv".format(outname), sep="\t")
[ ]: