Tutorial notebook - TTN

[1]:

# This is code to parse a h5ad file into a single delimited cell type
# for generation of a Brooklyn plot. The output files are a new delimited
# h5ad file of a single cell type and a csv file of genes with their
# chromosome locations and mean expression from that cell type.
# Code written by Arun Patil and edited by Marc Halushka
# CC BY 2022

[2]:

import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os, fnmatch
import requests
import io
import seaborn as sns
import scipy
import pybiomart
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi = 120, color_map = 'RdBu_r')
from scipy import stats

Import the original .h5ad file here. Chose the appropriate file location.

[3]:

orig_h5ad = sc.read_h5ad('subset_Cells_DCM_normalized_humanLV_112122.h5ad')

Obtain the observations (obs) list of the h5ad file.

[4]:

orig_h5ad

[4]:

AnnData object with n_obs × n_vars = 69150 × 33234
    obs: 'Sample', 'Patient', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes', 'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z', 'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned', 'ethnicity_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'sex_ontology_term_id', 'assay_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
    uns: 'X_normalization', 'cell_states_colors', 'cell_type_ontology_term_id_colors', 'layer_descriptions', 'schema_version', 'title'
    obsm: 'X_pca', 'X_umap'

[5]:

orig_h5ad.obs

[5]:

	Sample	Patient	Region_x	Primary.Genetic.Diagnosis	n_genes	n_counts	percent_mito	percent_ribo	scrublet_score_z	scrublet_score_log	...	tissue_ontology_term_id	development_stage_ontology_term_id	cell_type	assay	disease	organism	sex	tissue	ethnicity	development_stage
3350	ED_DT4_LV0_premrna	DT4	LV	TTN	2536	2920.292236	0.000278	0.001113	0.066042	0.020795	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	male	heart left ventricle	unknown	seventh decade human stage
3351	ED_DT4_LV0_premrna	DT4	LV	TTN	887	1920.013428	0.001704	0.000000	0.183444	0.001330	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	male	heart left ventricle	unknown	seventh decade human stage
3352	ED_DT4_LV0_premrna	DT4	LV	TTN	2568	3276.649414	0.003481	0.001266	0.090403	0.083051	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	male	heart left ventricle	unknown	seventh decade human stage
3353	ED_DT4_LV0_premrna	DT4	LV	TTN	1643	2521.693604	0.000748	0.000499	0.067423	0.004540	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	male	heart left ventricle	unknown	seventh decade human stage
3354	ED_DT4_LV0_premrna	DT4	LV	TTN	1832	3079.880615	0.000304	0.000304	0.094364	0.007266	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	male	heart left ventricle	unknown	seventh decade human stage
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
310662	IC_H04_LV0_premrna	IC_H04	LV	PVneg	1894	2627.427490	0.003090	0.000813	0.009396	0.004142	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	female	heart left ventricle	unknown	seventh decade human stage
310663	IC_H04_LV0_premrna	IC_H04	LV	PVneg	2941	3399.208252	0.001667	0.001334	0.028765	0.005743	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	female	heart left ventricle	unknown	seventh decade human stage
310664	IC_H04_LV0_premrna	IC_H04	LV	PVneg	2479	3142.647461	0.000858	0.000572	0.020787	0.004737	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	female	heart left ventricle	unknown	seventh decade human stage
310665	IC_H04_LV0_premrna	IC_H04	LV	PVneg	2645	3106.231201	0.001000	0.002000	0.020787	0.003347	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	female	heart left ventricle	unknown	seventh decade human stage
310666	IC_H04_LV0_premrna	IC_H04	LV	PVneg	474	1415.674561	0.000000	0.001773	0.063112	0.013699	...	UBERON:0002084	HsapDv:0000241	cardiac muscle cell	10x 3' v3	dilated cardiomyopathy	Homo sapiens	female	heart left ventricle	unknown	seventh decade human stage

69150 rows × 30 columns

[6]:

orig_h5ad.obs.columns

[6]:

Index(['Sample', 'Patient', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes',
       'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z',
       'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned',
       'ethnicity_ontology_term_id', 'disease_ontology_term_id',
       'cell_type_ontology_term_id', 'sex_ontology_term_id',
       'assay_ontology_term_id', 'organism_ontology_term_id',
       'is_primary_data', 'tissue_ontology_term_id',
       'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease',
       'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'],
      dtype='object')

[7]:

orig_h5ad.var

[7]:

	vst.mean	vst.variance	vst.variance.expected	vst.variance.standardized	vst.variable	feature_biotype	feature_is_filtered	feature_name	feature_reference
gene_ids
ENSG00000243485	0.000071	0.000071	0.000074	0.958714	0	gene	False	MIR1302-2HG	NCBITaxon:9606
ENSG00000237613	0.000003	0.000003	0.000003	0.997981	0	gene	False	FAM138A	NCBITaxon:9606
ENSG00000186092	0.000000	0.000000	0.000000	0.000000	0	gene	False	OR4F5	NCBITaxon:9606
ENSG00000238009	0.031093	0.032638	0.035845	0.910519	0	gene	False	RP11-34P13.7	NCBITaxon:9606
ENSG00000239945	0.000000	0.000000	0.000000	0.000000	0	gene	False	RP11-34P13.8	NCBITaxon:9606
...	...	...	...	...	...	...	...	...	...
ENSG00000277856	0.000000	0.000000	0.000000	0.000000	0	gene	False	ENSG00000277856	NCBITaxon:9606
ENSG00000275063	0.000000	0.000000	0.000000	0.000000	0	gene	False	ENSG00000275063	NCBITaxon:9606
ENSG00000271254	0.005992	0.006174	0.006542	0.943884	0	gene	False	ENSG00000271254	NCBITaxon:9606
ENSG00000277475	0.000000	0.000000	0.000000	0.000000	0	gene	False	ENSG00000277475	NCBITaxon:9606
ENSG00000268674	0.000003	0.000003	0.000003	0.997981	0	gene	False	ENSG00000268674	NCBITaxon:9606

33234 rows × 9 columns

Determine all of the cell types available in the .h5ad file. Determining the options in any obs can be done as similar as for cell_type.

[8]:

orig_h5ad.obs["cell_type"]

[8]:

3350      cardiac muscle cell
3351      cardiac muscle cell
3352      cardiac muscle cell
3353      cardiac muscle cell
3354      cardiac muscle cell
                 ...
310662    cardiac muscle cell
310663    cardiac muscle cell
310664    cardiac muscle cell
310665    cardiac muscle cell
310666    cardiac muscle cell
Name: cell_type, Length: 69150, dtype: category
Categories (1, object): ['cardiac muscle cell']

Obtain a single cell type from the h5ad file from a single ‘obs’. Or use the option below to add a single cell type bases on multiple delimiters.

[9]:

orig_h5ad.obs["Primary.Genetic.Diagnosis"]

[9]:

3350        TTN
3351        TTN
3352        TTN
3353        TTN
3354        TTN
          ...
310662    PVneg
310663    PVneg
310664    PVneg
310665    PVneg
310666    PVneg
Name: Primary.Genetic.Diagnosis, Length: 69150, dtype: category
Categories (11, object): ['DES', 'DSP', 'FKTN', 'FLNC', ..., 'RBM20', 'TNNC1', 'TNNT2', 'TTN']

[11]:

orig_h5ad.obs["disease"]

[11]:

3350      dilated cardiomyopathy
3351      dilated cardiomyopathy
3352      dilated cardiomyopathy
3353      dilated cardiomyopathy
3354      dilated cardiomyopathy
                   ...
310662    dilated cardiomyopathy
310663    dilated cardiomyopathy
310664    dilated cardiomyopathy
310665    dilated cardiomyopathy
310666    dilated cardiomyopathy
Name: disease, Length: 69150, dtype: category
Categories (1, object): ['dilated cardiomyopathy']

[11]:

onecell_h5ad = orig_h5ad[orig_h5ad.obs['cell_type'] == 'cardiac muscle cell']

[12]:

# If you have multiple delimiters for your cell type of interest, remove the '#' below and use this code
# to futher subset based on different obs codes of orig_h5ad above.  Check the column names of obs to use
# this correctly.  Column names will vary between h5ad sets so please edit accordingly.
# example1 - onecell_h5ad = orig_h5ad[(orig_h5ad.obs['disease'] == 'dilated cardiomyopathy') & (orig_h5ad.obs['Region_x'] == 'LV')]
# example2 - onecell_h5ad = orig_h5ad[(orig_h5ad.obs['disease'] == 'dilated cardiomyopathy') & (orig_h5ad.obs['Region_x'] == 'LV') & (orig_h5ad.obs['Primary.Genetic.Diagnosis'] == 'PVneg')]

onecell_h5ad = orig_h5ad[(orig_h5ad.obs['disease'] == 'dilated cardiomyopathy') & (orig_h5ad.obs['Region_x'] == 'LV') & (orig_h5ad.obs['Primary.Genetic.Diagnosis'] == 'TTN')]

Establish the shape of the subset matrix and compare it to the original matrix. The onecell_h5ad have fewer rows.

[13]:

orig_h5ad.shape

[13]:

(69150, 33234)

[14]:

onecell_h5ad.shape

[14]:

(17965, 33234)

[15]:

onecell_h5ad.var

[15]:

	vst.mean	vst.variance	vst.variance.expected	vst.variance.standardized	vst.variable	feature_biotype	feature_is_filtered	feature_name	feature_reference
gene_ids
ENSG00000243485	0.000071	0.000071	0.000074	0.958714	0	gene	False	MIR1302-2HG	NCBITaxon:9606
ENSG00000237613	0.000003	0.000003	0.000003	0.997981	0	gene	False	FAM138A	NCBITaxon:9606
ENSG00000186092	0.000000	0.000000	0.000000	0.000000	0	gene	False	OR4F5	NCBITaxon:9606
ENSG00000238009	0.031093	0.032638	0.035845	0.910519	0	gene	False	RP11-34P13.7	NCBITaxon:9606
ENSG00000239945	0.000000	0.000000	0.000000	0.000000	0	gene	False	RP11-34P13.8	NCBITaxon:9606
...	...	...	...	...	...	...	...	...	...
ENSG00000277856	0.000000	0.000000	0.000000	0.000000	0	gene	False	ENSG00000277856	NCBITaxon:9606
ENSG00000275063	0.000000	0.000000	0.000000	0.000000	0	gene	False	ENSG00000275063	NCBITaxon:9606
ENSG00000271254	0.005992	0.006174	0.006542	0.943884	0	gene	False	ENSG00000271254	NCBITaxon:9606
ENSG00000277475	0.000000	0.000000	0.000000	0.000000	0	gene	False	ENSG00000277475	NCBITaxon:9606
ENSG00000268674	0.000003	0.000003	0.000003	0.997981	0	gene	False	ENSG00000268674	NCBITaxon:9606

33234 rows × 9 columns

Create the subsetted h5ad file with the specific cell type - delimited by cell type and any other qualifiers needed.

[16]:

onecell_h5ad.write_h5ad("subset_seidman_TTN.h5ad")

Dataset is a variable obtained from the biomart annotation needed for the Brooklyn plot.

[17]:

dataset = sc.queries.biomart_annotations(
        "hsapiens",
        ["ensembl_gene_id", "start_position", "end_position", "chromosome_name", "hgnc_symbol", "band"],
    ).set_index("ensembl_gene_id")

[18]:

dataset

[18]:

	start_position	end_position	chromosome_name	hgnc_symbol	band
ensembl_gene_id
ENSG00000210049	577	647	MT	MT-TF	NaN
ENSG00000211459	648	1601	MT	MT-RNR1	NaN
ENSG00000210077	1602	1670	MT	MT-TV	NaN
ENSG00000210082	1671	3229	MT	MT-RNR2	NaN
ENSG00000209082	3230	3304	MT	MT-TL1	NaN
...	...	...	...	...	...
ENSG00000162437	64745075	64833232	1	RAVER2	p31.3
ENSG00000122432	84506300	84567379	1	SPATA1	p22.3
ENSG00000284882	84574114	84583620	1	NaN	p22.3
ENSG00000289881	84614068	84621061	1	NaN	p22.3
ENSG00000285325	84785427	84786714	1	NaN	p22.3

69305 rows × 5 columns

Converting raw expression count data and converting to a numpy array.

[19]:

data = onecell_h5ad.X.toarray()

This demonstrates what the array values look like.

[20]:

data[:5]

[20]:

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

This indicates how many genes are in the array (length), which can be important for generating the Brooklyn plot. Ideally, the original h5ad array does not limit the number of genes from the original sequencing.

[21]:

onecell_h5ad.var_names

[21]:

Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
       'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906',
       'ENSG00000241599', 'ENSG00000236601', 'ENSG00000284733',
       'ENSG00000235146',
       ...
       'ENSG00000277196', 'ENSG00000277630', 'ENSG00000278384',
       'ENSG00000278633', 'ENSG00000276345', 'ENSG00000277856',
       'ENSG00000275063', 'ENSG00000271254', 'ENSG00000277475',
       'ENSG00000268674'],
      dtype='object', name='gene_ids', length=33234)

[22]:

orig_h5ad.X

[22]:

<69150x33234 sparse matrix of type '<class 'numpy.float32'>'
        with 161117875 stored elements in Compressed Sparse Row format>

This code generates mean values for each gene (ENSG ID), from raw expression data, and appends this to gene annotations from biomart.

[23]:

xmen = onecell_h5ad.raw.X.mean(0)
type(xmen)
xmendf = pd.DataFrame(xmen.T, columns = ['xMean'], index = onecell_h5ad.var_names)
xmendf
df_cd = pd.merge(xmendf, dataset, left_index=True, right_index=True)
df_cd

[23]:

	xMean	start_position	end_position	chromosome_name	hgnc_symbol	band
ENSG00000000003	0.004342	100627108	100639991	X	TSPAN6	q22.1
ENSG00000000005	0.001058	100584936	100599885	X	TNMD	q22.1
ENSG00000000419	0.360610	50934867	50959140	20	DPM1	q13.13
ENSG00000000457	0.165547	169849631	169894267	1	SCYL3	q24.2
ENSG00000000460	0.155693	169662007	169854080	1	C1orf112	q24.2
...	...	...	...	...	...	...
ENSG00000285492	0.002394	159051674	159121506	6	NaN	q25.3
ENSG00000285505	0.054773	41956879	41994232	19	NaN	q13.2
ENSG00000285508	0.000000	10413520	10431922	20	NaN	p12.2
ENSG00000285509	0.012970	121024125	121113108	11	TBCEL-TECTA	q23.3
ENSG00000285513	0.000000	116820645	116821541	11	NaN	q23.3

33147 rows × 6 columns

This resets the index and adds a name for the first column (ENSG ID).

[24]:

df_cd_export = df_cd.reset_index().rename(columns={'index': 'gene_ids'})

[25]:

df_cd_export

[25]:

	gene_ids	xMean	start_position	end_position	chromosome_name	hgnc_symbol	band
0	ENSG00000000003	0.004342	100627108	100639991	X	TSPAN6	q22.1
1	ENSG00000000005	0.001058	100584936	100599885	X	TNMD	q22.1
2	ENSG00000000419	0.360610	50934867	50959140	20	DPM1	q13.13
3	ENSG00000000457	0.165547	169849631	169894267	1	SCYL3	q24.2
4	ENSG00000000460	0.155693	169662007	169854080	1	C1orf112	q24.2
...	...	...	...	...	...	...	...
33142	ENSG00000285492	0.002394	159051674	159121506	6	NaN	q25.3
33143	ENSG00000285505	0.054773	41956879	41994232	19	NaN	q13.2
33144	ENSG00000285508	0.000000	10413520	10431922	20	NaN	p12.2
33145	ENSG00000285509	0.012970	121024125	121113108	11	TBCEL-TECTA	q23.3
33146	ENSG00000285513	0.000000	116820645	116821541	11	NaN	q23.3

33147 rows × 7 columns

This command exports a csv file that can be used to pick (automated or manually), genes spread across the whole genome for Brooklyn plots.

[26]:

df_cd_export.to_csv("seidmanttn_var_biomart.csv", index=False)

This part of the code removes the top 3,500 genes by raw xMean, then sorts by chromosome position and generates gene lists of all 3,500 genes and 350 interspersed genes to generate the Brooklyn plot needed for the next python step.

[27]:

df_cd_export2 = df_cd_export.sort_values(by = ['xMean'],ascending = False).reset_index(drop = True)

[28]:

df_cd_export_top = df_cd_export2.iloc[:3500,:]

[29]:

df_cd_export_top

[29]:

	gene_ids	xMean	start_position	end_position	chromosome_name	hgnc_symbol	band
0	ENSG00000251562	1618.217773	65497688	65506516	11	MALAT1	q13.1
1	ENSG00000198626	146.410553	237042184	237833988	1	RYR2	q43
2	ENSG00000155657	100.666641	178525989	178830802	2	TTN	q31.2
3	ENSG00000245532	39.976116	65422774	65445540	11	NEAT1	q13.1
4	ENSG00000183023	38.942490	40097270	40611053	2	SLC8A1	p22.1
...	...	...	...	...	...	...	...
3495	ENSG00000231672	0.338896	217284019	217756593	2	DIRC3	q35
3496	ENSG00000059758	0.338786	96278261	96400480	12	CDK17	q23.1
3497	ENSG00000134375	0.338454	201955503	201970664	1	TIMM17A	q32.1
3498	ENSG00000070018	0.338398	12116025	12267044	12	LRP6	p13.2
3499	ENSG00000110811	0.338174	6828407	6839847	12	P3H3	p13.31

3500 rows × 7 columns

[30]:

df_cd_export_genelist = df_cd_export_top.sort_values(by = ['chromosome_name','end_position']).reset_index(drop = True)

[31]:

df_cd_export_genelist

[31]:

	gene_ids	xMean	start_position	end_position	chromosome_name	hgnc_symbol	band
0	ENSG00000187642	0.430749	975198	982117	1	PERM1	p36.33
1	ENSG00000221978	0.356993	1385711	1399335	1	CCNL2	p36.33
2	ENSG00000189409	0.413938	1632163	1635263	1	MMP23B	p36.33
3	ENSG00000078369	0.475395	1785285	1892292	1	GNB1	p36.33
4	ENSG00000142611	0.866854	3069168	3438621	1	PRDM16	p36.32
...	...	...	...	...	...	...	...
3495	ENSG00000114374	0.857443	12537650	12860839	Y	USP9Y	q11.221
3496	ENSG00000183878	1.383058	13234577	13480673	Y	UTY	q11.221
3497	ENSG00000176728	3.638888	18772706	19077416	Y	TTTY14	q11.222
3498	ENSG00000229236	0.921071	20464916	20575519	Y	TTTY10	q11.223
3499	ENSG00000198692	0.371463	20575776	20593154	Y	EIF1AY	q11.223

3500 rows × 7 columns

[32]:

df_cd_export_againstlist = df_cd_export_genelist.iloc[:3500,:1]

[33]:

df_cd_export_againstlist

[33]:

	gene_ids
0	ENSG00000187642
1	ENSG00000221978
2	ENSG00000189409
3	ENSG00000078369
4	ENSG00000142611
...	...
3495	ENSG00000114374
3496	ENSG00000183878
3497	ENSG00000176728
3498	ENSG00000229236
3499	ENSG00000198692

3500 rows × 1 columns

[34]:

df_cd_export_againstlist.to_csv("againstlist.csv", index=False)

[35]:

df_cd_export_genelist_parse = df_cd_export_genelist.iloc[1::10, :]

[36]:

df_cd_export_towrite = df_cd_export_genelist_parse.iloc[:350,:1]

[37]:

df_cd_export_towrite

[37]:

	gene_ids
1	ENSG00000221978
11	ENSG00000180758
21	ENSG00000175206
31	ENSG00000037637
41	ENSG00000169641
...	...
3451	ENSG00000232593
3461	ENSG00000229807
3471	ENSG00000166432
3481	ENSG00000101972
3491	ENSG00000102125

350 rows × 1 columns

[38]:

df_cd_export_towrite.to_csv("genelist.csv", index=False)

[ ]: