kpmp_snrnaseq_prep

[1]:

import numpy as np
import pandas as pd
import scanpy as sc

from collections import Counter

[2]:

adata = sc.read_h5ad("kpmp.h5ad")

[3]:

adata.obs.columns

[3]:

Index(['nCount_RNA', 'nFeature_RNA', 'mypatients', 'percent.er', 'percent.mt',
       'degen.score', 'aEpi.score', 'aStr.score', 'cyc.score',
       'matrisome.score', 'collagen.score', 'glycoprotein.score',
       'proteoglycan.score', 'S.Score', 'G2M.Score', 'experiment', 'specimen',
       'condition.long', 'condition.l1', 'myconditions', 'donor_id',
       'region.l1', 'region.l2', 'percent.cortex', 'percent.medulla',
       'sample_tissue_type', 'id', 'pagoda_k100_infomap_coembed',
       'subclass.full', 'subclass.l3', 'subclass.l2', 'subclass.l1',
       'state.l2', 'state', 'class', 'structure', 'disease_ontology_term_id',
       'sex_ontology_term_id', 'development_stage_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id', 'eGFR', 'BMI',
       'diabetes_history', 'hypertension', 'tissue_ontology_term_id',
       'organism_ontology_term_id', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'is_primary_data', 'suspension_type',
       'tissue_type', 'mycelltypes', 'assay', 'disease', 'organism', 'sex',
       'tissue', 'self_reported_ethnicity', 'development_stage',
       'observation_joinid', 'mysubtypes'],
      dtype='object')

[4]:

celltype_col="subclass.l1"
condition_col = "myconditions"

[5]:

outname = "ctcond"

[6]:

sc.pl.umap(adata, color=["subclass.l1"])

[4]:

set(adata.obs[celltype_col])

[4]:

{'ATL',
 'CNT',
 'DCT',
 'DTL',
 'EC',
 'FIB',
 'IC',
 'IMM',
 'NEU',
 'PC',
 'PEC',
 'POD',
 'PT',
 'PapE',
 'TAL',
 'VSM/P'}

[6]:

adata.obs["ct_condition"] = adata.obs[celltype_col].astype(str) + "_" + adata.obs[condition_col].astype(str)
adata.obs["ct_condition"] = pd.Categorical(adata.obs["ct_condition"])

[7]:

set(adata.obs["ct_condition"])

[7]:

{'ATL_AKI',
 'ATL_COV_AKI',
 'ATL_DKD',
 'ATL_H_CKD',
 'ATL_Ref',
 'CNT_AKI',
 'CNT_COV_AKI',
 'CNT_DKD',
 'CNT_H_CKD',
 'CNT_Ref',
 'DCT_AKI',
 'DCT_COV_AKI',
 'DCT_DKD',
 'DCT_H_CKD',
 'DCT_Ref',
 'DTL_AKI',
 'DTL_COV_AKI',
 'DTL_DKD',
 'DTL_H_CKD',
 'DTL_Ref',
 'EC_AKI',
 'EC_COV_AKI',
 'EC_DKD',
 'EC_H_CKD',
 'EC_Ref',
 'FIB_AKI',
 'FIB_COV_AKI',
 'FIB_DKD',
 'FIB_H_CKD',
 'FIB_Ref',
 'IC_AKI',
 'IC_COV_AKI',
 'IC_DKD',
 'IC_H_CKD',
 'IC_Ref',
 'IMM_AKI',
 'IMM_COV_AKI',
 'IMM_DKD',
 'IMM_H_CKD',
 'IMM_Ref',
 'NEU_AKI',
 'NEU_COV_AKI',
 'NEU_DKD',
 'NEU_H_CKD',
 'NEU_Ref',
 'PC_AKI',
 'PC_COV_AKI',
 'PC_DKD',
 'PC_H_CKD',
 'PC_Ref',
 'PEC_AKI',
 'PEC_COV_AKI',
 'PEC_DKD',
 'PEC_H_CKD',
 'PEC_Ref',
 'POD_AKI',
 'POD_COV_AKI',
 'POD_DKD',
 'POD_H_CKD',
 'POD_Ref',
 'PT_AKI',
 'PT_COV_AKI',
 'PT_DKD',
 'PT_H_CKD',
 'PT_Ref',
 'PapE_AKI',
 'PapE_COV_AKI',
 'PapE_DKD',
 'PapE_H_CKD',
 'PapE_Ref',
 'TAL_AKI',
 'TAL_COV_AKI',
 'TAL_DKD',
 'TAL_H_CKD',
 'TAL_Ref',
 'VSM/P_AKI',
 'VSM/P_COV_AKI',
 'VSM/P_DKD',
 'VSM/P_H_CKD',
 'VSM/P_Ref'}

[8]:

sc.pl.umap(adata, color=["ct_condition"])

/mnt/biosoft/software/python/3.11/lib/python3.11/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

[12]:

import matplotlib.pyplot as plt
fig, axs = plt.subplots(4, 4, figsize=(14, 14))
faxs = axs.flatten()

df_labels = adata.obs.loc[:,(celltype_col, condition_col)]
df_labels["count"] = 1

for gi, group in enumerate(set(df_labels[celltype_col])):
    gdf = df_labels[df_labels[celltype_col] == group]

    gdf.groupby([condition_col]).sum().plot(
    kind='pie', y='count', autopct='%1.0f%%', startangle=60, ax=faxs[gi])
    faxs[gi].set_title(group)
    faxs[gi].get_legend().remove()

#faxs[gi+1].set_visible(False)

/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(
/home/j/joppich/tmp/ipykernel_11122/822748533.py:11: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  gdf.groupby(['myconditions']).sum().plot(

[13]:

index2cellname = {i: x for i,x in enumerate(adata.obs.index)}
col2genename = {i: x for i,x in enumerate(adata.var.index)}
cellname2group = {cn: cg for cn, cg in zip(adata.obs.index, adata.obs["ct_condition"])}

group2cellcount = Counter()
for cn in cellname2group:
    group2cellcount[cellname2group[cn]] += 1

[14]:

group2cellcount

[14]:

Counter({'PT_Ref': 19433,
         'TAL_H_CKD': 15470,
         'TAL_Ref': 13258,
         'PT_AKI': 10294,
         'PC_Ref': 10030,
         'PT_DKD': 9934,
         'EC_Ref': 8876,
         'FIB_Ref': 8260,
         'TAL_COV_AKI': 6747,
         'PT_H_CKD': 6238,
         'PT_COV_AKI': 5773,
         'IC_Ref': 4806,
         'TAL_DKD': 4716,
         'IMM_Ref': 4512,
         'DTL_Ref': 4216,
         'DCT_Ref': 3812,
         'FIB_AKI': 3517,
         'CNT_Ref': 3415,
         'TAL_AKI': 3348,
         'ATL_Ref': 2865,
         'PC_H_CKD': 2863,
         'EC_H_CKD': 2826,
         'EC_AKI': 2716,
         'IMM_AKI': 2608,
         'FIB_DKD': 2233,
         'EC_DKD': 2049,
         'DTL_H_CKD': 2037,
         'DCT_AKI': 1979,
         'VSM/P_Ref': 1962,
         'FIB_H_CKD': 1920,
         'IMM_DKD': 1822,
         'CNT_AKI': 1810,
         'IC_AKI': 1576,
         'PC_DKD': 1461,
         'IMM_H_CKD': 1316,
         'DCT_COV_AKI': 1284,
         'IC_H_CKD': 1253,
         'IC_DKD': 1185,
         'POD_Ref': 1138,
         'IC_COV_AKI': 1092,
         'EC_COV_AKI': 1028,
         'IMM_COV_AKI': 1015,
         'PC_AKI': 929,
         'DCT_DKD': 815,
         'CNT_COV_AKI': 814,
         'VSM/P_H_CKD': 723,
         'DTL_DKD': 717,
         'PC_COV_AKI': 705,
         'CNT_DKD': 693,
         'DTL_AKI': 627,
         'VSM/P_AKI': 594,
         'PEC_Ref': 527,
         'VSM/P_DKD': 526,
         'POD_DKD': 494,
         'POD_AKI': 480,
         'DTL_COV_AKI': 371,
         'PEC_AKI': 357,
         'PapE_Ref': 322,
         'DCT_H_CKD': 270,
         'FIB_COV_AKI': 251,
         'PEC_DKD': 247,
         'CNT_H_CKD': 232,
         'POD_H_CKD': 232,
         'PEC_COV_AKI': 217,
         'POD_COV_AKI': 211,
         'PEC_H_CKD': 77,
         'VSM/P_COV_AKI': 43,
         'ATL_H_CKD': 37,
         'NEU_Ref': 35,
         'ATL_AKI': 24,
         'NEU_DKD': 16,
         'ATL_COV_AKI': 16,
         'ATL_DKD': 15,
         'NEU_H_CKD': 9,
         'NEU_AKI': 7,
         'NEU_COV_AKI': 5,
         'PapE_COV_AKI': 3,
         'PapE_DKD': 2,
         'PapE_H_CKD': 1,
         'PapE_AKI': 1})

[16]:

def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

def sd():
    def sd_(x):
        return np.std(x)
    sd_.__name__ = "sd"
    return sd_

def enum():
    def enum_(x):
        return x.size
    enum_.__name__ = 'num'
    return enum_

[17]:

coo = adata.X.tocoo(copy=False)
genedf = pd.DataFrame({'rowidx': coo.row, 'col': coo.col, 'data': coo.data})[['rowidx', 'col', 'data']]

[18]:

genedf["cellname"] = genedf.rowidx.map(index2cellname)
genedf["gene"] = genedf.col.map(col2genename)
genedf["group"] = genedf.cellname.map(cellname2group)
genedf

[18]:

	rowidx	col	data	cellname	gene	group
0	0	0	1.0	KB1_AAACCCAAGCCGATAG	A1BG	PT_DKD
1	0	20	3.0	KB1_AAACCCAAGCCGATAG	AAK1	PT_DKD
2	0	21	1.0	KB1_AAACCCAAGCCGATAG	AAMDC	PT_DKD
3	0	37	2.0	KB1_AAACCCAAGCCGATAG	ABCA1	PT_DKD
4	0	40	1.0	KB1_AAACCCAAGCCGATAG	ABCA13	PT_DKD
...	...	...	...	...	...	...
379664051	200337	29009	1.0	KBCVD4_TTTGTTGAGACCCGCT-1	ZNF529	EC_COV_AKI
379664052	200337	29150	1.0	KBCVD4_TTTGTTGAGACCCGCT-1	ZNF706	EC_COV_AKI
379664053	200337	29189	1.0	KBCVD4_TTTGTTGAGACCCGCT-1	ZNF77	EC_COV_AKI
379664054	200337	29199	1.0	KBCVD4_TTTGTTGAGACCCGCT-1	ZNF780A	EC_COV_AKI
379664055	200337	29258	1.0	KBCVD4_TTTGTTGAGACCCGCT-1	ZNFX1	EC_COV_AKI

379664056 rows × 6 columns

[19]:

genedf2 = genedf[genedf.gene == "ALB"]

[20]:

genedf2

[20]:

	rowidx	col	data	cellname	gene	group
7185	3	8155	1.0	KB1_AAACCCAGTAATGCGG	ALB	TAL_DKD
104481	51	8155	1.0	KB1_AAAGTCCCACGAGAAC	ALB	PT_DKD
156583	74	8155	2.0	KB1_AACAACCAGCAGGCAT	ALB	PT_DKD
670056	325	8155	1.0	KB1_AATGCCACAGAGTTGG	ALB	PT_DKD
705591	345	8155	1.0	KB1_AATGGCTTCTATCACT	ALB	EC_DKD
...	...	...	...	...	...	...
378887860	200029	8155	1.0	KBCVD4_TGCTCCAAGAGGCTGT-1	ALB	TAL_COV_AKI
379078996	200106	8155	1.0	KBCVD4_TGTCCACCAAGAGCTG-1	ALB	TAL_COV_AKI
379131114	200126	8155	1.0	KBCVD4_TGTGTGACAGTCTACA-1	ALB	TAL_COV_AKI
379489805	200265	8155	1.0	KBCVD4_TTGGGCGTCTCGCCTA-1	ALB	IMM_COV_AKI
379646947	200331	8155	1.0	KBCVD4_TTTGGAGTCGAGTACT-1	ALB	TAL_COV_AKI

2380 rows × 6 columns

[21]:

aggdf = genedf.groupby(["group", "gene"])["data"].aggregate(["min", percentile(25), np.median, percentile(75), "max", "mean", enum(), sd()])
aggdf

[21]:

		min	percentile_25	median	percentile_75	max	mean	num	sd
group	gene
ATL_AKI	A1BG	1.0	1.00	1.0	1.00	1.0	1.000000	1	0.000000
	A1BG-AS1	1.0	1.00	1.0	1.00	1.0	1.000000	2	0.000000
	A2M	1.0	1.00	1.0	1.00	1.0	1.000000	1	0.000000
	A2ML1-AS1	1.0	1.00	1.0	1.00	1.0	1.000000	1	0.000000
	A4GALT	1.0	1.25	1.5	1.75	2.0	1.500000	2	0.500000
...	...	...	...	...	...	...	...	...	...
VSM/P_Ref	ZYG11A	1.0	1.00	1.0	1.00	1.0	1.000000	21	0.000000
	ZYG11B	1.0	1.00	1.0	1.00	4.0	1.275773	388	0.603880
	ZYX	1.0	1.00	1.0	1.00	3.0	1.053571	112	0.261837
	ZZEF1	1.0	1.00	1.0	2.00	6.0	1.369478	498	0.717000
	ZZZ3	1.0	1.00	1.0	2.00	8.0	1.798799	666	1.188685

1614358 rows × 8 columns

[22]:

aggdf["group_cells"] = aggdf.index.get_level_values(0).map(group2cellcount)
aggdf["perc_expr"] = aggdf.num / aggdf.group_cells
aggdf

[22]:

		min	percentile_25	median	percentile_75	max	mean	num	sd	group_cells	perc_expr
group	gene
ATL_AKI	A1BG	1.0	1.00	1.0	1.00	1.0	1.000000	1	0.000000	24	0.041667
	A1BG-AS1	1.0	1.00	1.0	1.00	1.0	1.000000	2	0.000000	24	0.083333
	A2M	1.0	1.00	1.0	1.00	1.0	1.000000	1	0.000000	24	0.041667
	A2ML1-AS1	1.0	1.00	1.0	1.00	1.0	1.000000	1	0.000000	24	0.041667
	A4GALT	1.0	1.25	1.5	1.75	2.0	1.500000	2	0.500000	24	0.083333
...	...	...	...	...	...	...	...	...	...	...	...
VSM/P_Ref	ZYG11A	1.0	1.00	1.0	1.00	1.0	1.000000	21	0.000000	1962	0.010703
	ZYG11B	1.0	1.00	1.0	1.00	4.0	1.275773	388	0.603880	1962	0.197757
	ZYX	1.0	1.00	1.0	1.00	3.0	1.053571	112	0.261837	1962	0.057085
	ZZEF1	1.0	1.00	1.0	2.00	6.0	1.369478	498	0.717000	1962	0.253823
	ZZZ3	1.0	1.00	1.0	2.00	8.0	1.798799	666	1.188685	1962	0.339450

1614358 rows × 10 columns

[23]:

aggdf.to_csv("expression_{}_mean_df.tsv".format(outname), sep="\t")

[ ]: