Tutorial notebook - TTN

[1]:
# This is code to parse a h5ad file into a single delimited cell type
# for generation of a Brooklyn plot. The output files are a new delimited
# h5ad file of a single cell type and a csv file of genes with their
# chromosome locations and mean expression from that cell type.
# Code written by Arun Patil and edited by Marc Halushka
# CC BY 2022
[2]:
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os, fnmatch
import requests
import io
import seaborn as sns
import scipy
import pybiomart
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi = 120, color_map = 'RdBu_r')
from scipy import stats

Import the original .h5ad file here. Chose the appropriate file location.

[3]:
orig_h5ad = sc.read_h5ad('subset_Cells_DCM_normalized_humanLV_112122.h5ad')

Obtain the observations (obs) list of the h5ad file.

[4]:
orig_h5ad
[4]:
AnnData object with n_obs × n_vars = 69150 × 33234
    obs: 'Sample', 'Patient', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes', 'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z', 'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned', 'ethnicity_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'sex_ontology_term_id', 'assay_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
    uns: 'X_normalization', 'cell_states_colors', 'cell_type_ontology_term_id_colors', 'layer_descriptions', 'schema_version', 'title'
    obsm: 'X_pca', 'X_umap'
[5]:
orig_h5ad.obs
[5]:
Sample Patient Region_x Primary.Genetic.Diagnosis n_genes n_counts percent_mito percent_ribo scrublet_score_z scrublet_score_log ... tissue_ontology_term_id development_stage_ontology_term_id cell_type assay disease organism sex tissue ethnicity development_stage
3350 ED_DT4_LV0_premrna DT4 LV TTN 2536 2920.292236 0.000278 0.001113 0.066042 0.020795 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens male heart left ventricle unknown seventh decade human stage
3351 ED_DT4_LV0_premrna DT4 LV TTN 887 1920.013428 0.001704 0.000000 0.183444 0.001330 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens male heart left ventricle unknown seventh decade human stage
3352 ED_DT4_LV0_premrna DT4 LV TTN 2568 3276.649414 0.003481 0.001266 0.090403 0.083051 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens male heart left ventricle unknown seventh decade human stage
3353 ED_DT4_LV0_premrna DT4 LV TTN 1643 2521.693604 0.000748 0.000499 0.067423 0.004540 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens male heart left ventricle unknown seventh decade human stage
3354 ED_DT4_LV0_premrna DT4 LV TTN 1832 3079.880615 0.000304 0.000304 0.094364 0.007266 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens male heart left ventricle unknown seventh decade human stage
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
310662 IC_H04_LV0_premrna IC_H04 LV PVneg 1894 2627.427490 0.003090 0.000813 0.009396 0.004142 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens female heart left ventricle unknown seventh decade human stage
310663 IC_H04_LV0_premrna IC_H04 LV PVneg 2941 3399.208252 0.001667 0.001334 0.028765 0.005743 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens female heart left ventricle unknown seventh decade human stage
310664 IC_H04_LV0_premrna IC_H04 LV PVneg 2479 3142.647461 0.000858 0.000572 0.020787 0.004737 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens female heart left ventricle unknown seventh decade human stage
310665 IC_H04_LV0_premrna IC_H04 LV PVneg 2645 3106.231201 0.001000 0.002000 0.020787 0.003347 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens female heart left ventricle unknown seventh decade human stage
310666 IC_H04_LV0_premrna IC_H04 LV PVneg 474 1415.674561 0.000000 0.001773 0.063112 0.013699 ... UBERON:0002084 HsapDv:0000241 cardiac muscle cell 10x 3' v3 dilated cardiomyopathy Homo sapiens female heart left ventricle unknown seventh decade human stage

69150 rows × 30 columns

[6]:
orig_h5ad.obs.columns
[6]:
Index(['Sample', 'Patient', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes',
       'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z',
       'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned',
       'ethnicity_ontology_term_id', 'disease_ontology_term_id',
       'cell_type_ontology_term_id', 'sex_ontology_term_id',
       'assay_ontology_term_id', 'organism_ontology_term_id',
       'is_primary_data', 'tissue_ontology_term_id',
       'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease',
       'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'],
      dtype='object')
[7]:
orig_h5ad.var
[7]:
vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable feature_biotype feature_is_filtered feature_name feature_reference
gene_ids
ENSG00000243485 0.000071 0.000071 0.000074 0.958714 0 gene False MIR1302-2HG NCBITaxon:9606
ENSG00000237613 0.000003 0.000003 0.000003 0.997981 0 gene False FAM138A NCBITaxon:9606
ENSG00000186092 0.000000 0.000000 0.000000 0.000000 0 gene False OR4F5 NCBITaxon:9606
ENSG00000238009 0.031093 0.032638 0.035845 0.910519 0 gene False RP11-34P13.7 NCBITaxon:9606
ENSG00000239945 0.000000 0.000000 0.000000 0.000000 0 gene False RP11-34P13.8 NCBITaxon:9606
... ... ... ... ... ... ... ... ... ...
ENSG00000277856 0.000000 0.000000 0.000000 0.000000 0 gene False ENSG00000277856 NCBITaxon:9606
ENSG00000275063 0.000000 0.000000 0.000000 0.000000 0 gene False ENSG00000275063 NCBITaxon:9606
ENSG00000271254 0.005992 0.006174 0.006542 0.943884 0 gene False ENSG00000271254 NCBITaxon:9606
ENSG00000277475 0.000000 0.000000 0.000000 0.000000 0 gene False ENSG00000277475 NCBITaxon:9606
ENSG00000268674 0.000003 0.000003 0.000003 0.997981 0 gene False ENSG00000268674 NCBITaxon:9606

33234 rows × 9 columns

Determine all of the cell types available in the .h5ad file. Determining the options in any obs can be done as similar as for cell_type.

[8]:
orig_h5ad.obs["cell_type"]
[8]:
3350      cardiac muscle cell
3351      cardiac muscle cell
3352      cardiac muscle cell
3353      cardiac muscle cell
3354      cardiac muscle cell
                 ...
310662    cardiac muscle cell
310663    cardiac muscle cell
310664    cardiac muscle cell
310665    cardiac muscle cell
310666    cardiac muscle cell
Name: cell_type, Length: 69150, dtype: category
Categories (1, object): ['cardiac muscle cell']

Obtain a single cell type from the h5ad file from a single ‘obs’. Or use the option below to add a single cell type bases on multiple delimiters.

[9]:
orig_h5ad.obs["Primary.Genetic.Diagnosis"]
[9]:
3350        TTN
3351        TTN
3352        TTN
3353        TTN
3354        TTN
          ...
310662    PVneg
310663    PVneg
310664    PVneg
310665    PVneg
310666    PVneg
Name: Primary.Genetic.Diagnosis, Length: 69150, dtype: category
Categories (11, object): ['DES', 'DSP', 'FKTN', 'FLNC', ..., 'RBM20', 'TNNC1', 'TNNT2', 'TTN']
[11]:
orig_h5ad.obs["disease"]
[11]:
3350      dilated cardiomyopathy
3351      dilated cardiomyopathy
3352      dilated cardiomyopathy
3353      dilated cardiomyopathy
3354      dilated cardiomyopathy
                   ...
310662    dilated cardiomyopathy
310663    dilated cardiomyopathy
310664    dilated cardiomyopathy
310665    dilated cardiomyopathy
310666    dilated cardiomyopathy
Name: disease, Length: 69150, dtype: category
Categories (1, object): ['dilated cardiomyopathy']
[11]:
onecell_h5ad = orig_h5ad[orig_h5ad.obs['cell_type'] == 'cardiac muscle cell']
[12]:
# If you have multiple delimiters for your cell type of interest, remove the '#' below and use this code
# to futher subset based on different obs codes of orig_h5ad above.  Check the column names of obs to use
# this correctly.  Column names will vary between h5ad sets so please edit accordingly.
# example1 - onecell_h5ad = orig_h5ad[(orig_h5ad.obs['disease'] == 'dilated cardiomyopathy') & (orig_h5ad.obs['Region_x'] == 'LV')]
# example2 - onecell_h5ad = orig_h5ad[(orig_h5ad.obs['disease'] == 'dilated cardiomyopathy') & (orig_h5ad.obs['Region_x'] == 'LV') & (orig_h5ad.obs['Primary.Genetic.Diagnosis'] == 'PVneg')]

onecell_h5ad = orig_h5ad[(orig_h5ad.obs['disease'] == 'dilated cardiomyopathy') & (orig_h5ad.obs['Region_x'] == 'LV') & (orig_h5ad.obs['Primary.Genetic.Diagnosis'] == 'TTN')]

Establish the shape of the subset matrix and compare it to the original matrix. The onecell_h5ad have fewer rows.

[13]:
orig_h5ad.shape
[13]:
(69150, 33234)
[14]:
onecell_h5ad.shape
[14]:
(17965, 33234)
[15]:
onecell_h5ad.var
[15]:
vst.mean vst.variance vst.variance.expected vst.variance.standardized vst.variable feature_biotype feature_is_filtered feature_name feature_reference
gene_ids
ENSG00000243485 0.000071 0.000071 0.000074 0.958714 0 gene False MIR1302-2HG NCBITaxon:9606
ENSG00000237613 0.000003 0.000003 0.000003 0.997981 0 gene False FAM138A NCBITaxon:9606
ENSG00000186092 0.000000 0.000000 0.000000 0.000000 0 gene False OR4F5 NCBITaxon:9606
ENSG00000238009 0.031093 0.032638 0.035845 0.910519 0 gene False RP11-34P13.7 NCBITaxon:9606
ENSG00000239945 0.000000 0.000000 0.000000 0.000000 0 gene False RP11-34P13.8 NCBITaxon:9606
... ... ... ... ... ... ... ... ... ...
ENSG00000277856 0.000000 0.000000 0.000000 0.000000 0 gene False ENSG00000277856 NCBITaxon:9606
ENSG00000275063 0.000000 0.000000 0.000000 0.000000 0 gene False ENSG00000275063 NCBITaxon:9606
ENSG00000271254 0.005992 0.006174 0.006542 0.943884 0 gene False ENSG00000271254 NCBITaxon:9606
ENSG00000277475 0.000000 0.000000 0.000000 0.000000 0 gene False ENSG00000277475 NCBITaxon:9606
ENSG00000268674 0.000003 0.000003 0.000003 0.997981 0 gene False ENSG00000268674 NCBITaxon:9606

33234 rows × 9 columns

Create the subsetted h5ad file with the specific cell type - delimited by cell type and any other qualifiers needed.

[16]:
onecell_h5ad.write_h5ad("subset_seidman_TTN.h5ad")

Dataset is a variable obtained from the biomart annotation needed for the Brooklyn plot.

[17]:
dataset = sc.queries.biomart_annotations(
        "hsapiens",
        ["ensembl_gene_id", "start_position", "end_position", "chromosome_name", "hgnc_symbol", "band"],
    ).set_index("ensembl_gene_id")

[18]:
dataset
[18]:
start_position end_position chromosome_name hgnc_symbol band
ensembl_gene_id
ENSG00000210049 577 647 MT MT-TF NaN
ENSG00000211459 648 1601 MT MT-RNR1 NaN
ENSG00000210077 1602 1670 MT MT-TV NaN
ENSG00000210082 1671 3229 MT MT-RNR2 NaN
ENSG00000209082 3230 3304 MT MT-TL1 NaN
... ... ... ... ... ...
ENSG00000162437 64745075 64833232 1 RAVER2 p31.3
ENSG00000122432 84506300 84567379 1 SPATA1 p22.3
ENSG00000284882 84574114 84583620 1 NaN p22.3
ENSG00000289881 84614068 84621061 1 NaN p22.3
ENSG00000285325 84785427 84786714 1 NaN p22.3

69305 rows × 5 columns

Converting raw expression count data and converting to a numpy array.

[19]:
data = onecell_h5ad.X.toarray()

This demonstrates what the array values look like.

[20]:
data[:5]
[20]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

This indicates how many genes are in the array (length), which can be important for generating the Brooklyn plot. Ideally, the original h5ad array does not limit the number of genes from the original sequencing.

[21]:
onecell_h5ad.var_names
[21]:
Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
       'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906',
       'ENSG00000241599', 'ENSG00000236601', 'ENSG00000284733',
       'ENSG00000235146',
       ...
       'ENSG00000277196', 'ENSG00000277630', 'ENSG00000278384',
       'ENSG00000278633', 'ENSG00000276345', 'ENSG00000277856',
       'ENSG00000275063', 'ENSG00000271254', 'ENSG00000277475',
       'ENSG00000268674'],
      dtype='object', name='gene_ids', length=33234)
[22]:
orig_h5ad.X
[22]:
<69150x33234 sparse matrix of type '<class 'numpy.float32'>'
        with 161117875 stored elements in Compressed Sparse Row format>

This code generates mean values for each gene (ENSG ID), from raw expression data, and appends this to gene annotations from biomart.

[23]:
xmen = onecell_h5ad.raw.X.mean(0)
type(xmen)
xmendf = pd.DataFrame(xmen.T, columns = ['xMean'], index = onecell_h5ad.var_names)
xmendf
df_cd = pd.merge(xmendf, dataset, left_index=True, right_index=True)
df_cd
[23]:
xMean start_position end_position chromosome_name hgnc_symbol band
ENSG00000000003 0.004342 100627108 100639991 X TSPAN6 q22.1
ENSG00000000005 0.001058 100584936 100599885 X TNMD q22.1
ENSG00000000419 0.360610 50934867 50959140 20 DPM1 q13.13
ENSG00000000457 0.165547 169849631 169894267 1 SCYL3 q24.2
ENSG00000000460 0.155693 169662007 169854080 1 C1orf112 q24.2
... ... ... ... ... ... ...
ENSG00000285492 0.002394 159051674 159121506 6 NaN q25.3
ENSG00000285505 0.054773 41956879 41994232 19 NaN q13.2
ENSG00000285508 0.000000 10413520 10431922 20 NaN p12.2
ENSG00000285509 0.012970 121024125 121113108 11 TBCEL-TECTA q23.3
ENSG00000285513 0.000000 116820645 116821541 11 NaN q23.3

33147 rows × 6 columns

This resets the index and adds a name for the first column (ENSG ID).

[24]:
df_cd_export = df_cd.reset_index().rename(columns={'index': 'gene_ids'})
[25]:
df_cd_export
[25]:
gene_ids xMean start_position end_position chromosome_name hgnc_symbol band
0 ENSG00000000003 0.004342 100627108 100639991 X TSPAN6 q22.1
1 ENSG00000000005 0.001058 100584936 100599885 X TNMD q22.1
2 ENSG00000000419 0.360610 50934867 50959140 20 DPM1 q13.13
3 ENSG00000000457 0.165547 169849631 169894267 1 SCYL3 q24.2
4 ENSG00000000460 0.155693 169662007 169854080 1 C1orf112 q24.2
... ... ... ... ... ... ... ...
33142 ENSG00000285492 0.002394 159051674 159121506 6 NaN q25.3
33143 ENSG00000285505 0.054773 41956879 41994232 19 NaN q13.2
33144 ENSG00000285508 0.000000 10413520 10431922 20 NaN p12.2
33145 ENSG00000285509 0.012970 121024125 121113108 11 TBCEL-TECTA q23.3
33146 ENSG00000285513 0.000000 116820645 116821541 11 NaN q23.3

33147 rows × 7 columns

This command exports a csv file that can be used to pick (automated or manually), genes spread across the whole genome for Brooklyn plots.

[26]:
df_cd_export.to_csv("seidmanttn_var_biomart.csv", index=False)

This part of the code removes the top 3,500 genes by raw xMean, then sorts by chromosome position and generates gene lists of all 3,500 genes and 350 interspersed genes to generate the Brooklyn plot needed for the next python step.

[27]:
df_cd_export2 = df_cd_export.sort_values(by = ['xMean'],ascending = False).reset_index(drop = True)
[28]:
df_cd_export_top = df_cd_export2.iloc[:3500,:]
[29]:
df_cd_export_top
[29]:
gene_ids xMean start_position end_position chromosome_name hgnc_symbol band
0 ENSG00000251562 1618.217773 65497688 65506516 11 MALAT1 q13.1
1 ENSG00000198626 146.410553 237042184 237833988 1 RYR2 q43
2 ENSG00000155657 100.666641 178525989 178830802 2 TTN q31.2
3 ENSG00000245532 39.976116 65422774 65445540 11 NEAT1 q13.1
4 ENSG00000183023 38.942490 40097270 40611053 2 SLC8A1 p22.1
... ... ... ... ... ... ... ...
3495 ENSG00000231672 0.338896 217284019 217756593 2 DIRC3 q35
3496 ENSG00000059758 0.338786 96278261 96400480 12 CDK17 q23.1
3497 ENSG00000134375 0.338454 201955503 201970664 1 TIMM17A q32.1
3498 ENSG00000070018 0.338398 12116025 12267044 12 LRP6 p13.2
3499 ENSG00000110811 0.338174 6828407 6839847 12 P3H3 p13.31

3500 rows × 7 columns

[30]:
df_cd_export_genelist = df_cd_export_top.sort_values(by = ['chromosome_name','end_position']).reset_index(drop = True)
[31]:
df_cd_export_genelist
[31]:
gene_ids xMean start_position end_position chromosome_name hgnc_symbol band
0 ENSG00000187642 0.430749 975198 982117 1 PERM1 p36.33
1 ENSG00000221978 0.356993 1385711 1399335 1 CCNL2 p36.33
2 ENSG00000189409 0.413938 1632163 1635263 1 MMP23B p36.33
3 ENSG00000078369 0.475395 1785285 1892292 1 GNB1 p36.33
4 ENSG00000142611 0.866854 3069168 3438621 1 PRDM16 p36.32
... ... ... ... ... ... ... ...
3495 ENSG00000114374 0.857443 12537650 12860839 Y USP9Y q11.221
3496 ENSG00000183878 1.383058 13234577 13480673 Y UTY q11.221
3497 ENSG00000176728 3.638888 18772706 19077416 Y TTTY14 q11.222
3498 ENSG00000229236 0.921071 20464916 20575519 Y TTTY10 q11.223
3499 ENSG00000198692 0.371463 20575776 20593154 Y EIF1AY q11.223

3500 rows × 7 columns

[32]:
df_cd_export_againstlist = df_cd_export_genelist.iloc[:3500,:1]
[33]:
df_cd_export_againstlist
[33]:
gene_ids
0 ENSG00000187642
1 ENSG00000221978
2 ENSG00000189409
3 ENSG00000078369
4 ENSG00000142611
... ...
3495 ENSG00000114374
3496 ENSG00000183878
3497 ENSG00000176728
3498 ENSG00000229236
3499 ENSG00000198692

3500 rows × 1 columns

[34]:
df_cd_export_againstlist.to_csv("againstlist.csv", index=False)
[35]:
df_cd_export_genelist_parse = df_cd_export_genelist.iloc[1::10, :]
[36]:
df_cd_export_towrite = df_cd_export_genelist_parse.iloc[:350,:1]
[37]:
df_cd_export_towrite
[37]:
gene_ids
1 ENSG00000221978
11 ENSG00000180758
21 ENSG00000175206
31 ENSG00000037637
41 ENSG00000169641
... ...
3451 ENSG00000232593
3461 ENSG00000229807
3471 ENSG00000166432
3481 ENSG00000101972
3491 ENSG00000102125

350 rows × 1 columns

[38]:
df_cd_export_towrite.to_csv("genelist.csv", index=False)
[ ]: