Tutorial notebook - TTN
[1]:
# This is code to parse a h5ad file into a single delimited cell type
# for generation of a Brooklyn plot. The output files are a new delimited
# h5ad file of a single cell type and a csv file of genes with their
# chromosome locations and mean expression from that cell type.
# Code written by Arun Patil and edited by Marc Halushka
# CC BY 2022
[2]:
import numpy as np
import scipy as sp
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os, fnmatch
import requests
import io
import seaborn as sns
import scipy
import pybiomart
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi = 120, color_map = 'RdBu_r')
from scipy import stats
Import the original .h5ad file here. Choose the appropriate file location.
[3]:
# Load the source .h5ad dataset as `orig_h5ad`; edit the path so it points at
# your own file.
orig_h5ad = sc.read_h5ad('subset_Cells_DCM_normalized_humanLV_112122.h5ad')
Obtain the observations (obs) list of the h5ad file.
[4]:
orig_h5ad
[4]:
AnnData object with n_obs × n_vars = 69150 × 33234
obs: 'Sample', 'Patient', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes', 'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z', 'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned', 'ethnicity_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'sex_ontology_term_id', 'assay_ontology_term_id', 'organism_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'
var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
uns: 'X_normalization', 'cell_states_colors', 'cell_type_ontology_term_id_colors', 'layer_descriptions', 'schema_version', 'title'
obsm: 'X_pca', 'X_umap'
[5]:
orig_h5ad.obs
[5]:
Sample | Patient | Region_x | Primary.Genetic.Diagnosis | n_genes | n_counts | percent_mito | percent_ribo | scrublet_score_z | scrublet_score_log | ... | tissue_ontology_term_id | development_stage_ontology_term_id | cell_type | assay | disease | organism | sex | tissue | ethnicity | development_stage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3350 | ED_DT4_LV0_premrna | DT4 | LV | TTN | 2536 | 2920.292236 | 0.000278 | 0.001113 | 0.066042 | 0.020795 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | male | heart left ventricle | unknown | seventh decade human stage |
3351 | ED_DT4_LV0_premrna | DT4 | LV | TTN | 887 | 1920.013428 | 0.001704 | 0.000000 | 0.183444 | 0.001330 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | male | heart left ventricle | unknown | seventh decade human stage |
3352 | ED_DT4_LV0_premrna | DT4 | LV | TTN | 2568 | 3276.649414 | 0.003481 | 0.001266 | 0.090403 | 0.083051 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | male | heart left ventricle | unknown | seventh decade human stage |
3353 | ED_DT4_LV0_premrna | DT4 | LV | TTN | 1643 | 2521.693604 | 0.000748 | 0.000499 | 0.067423 | 0.004540 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | male | heart left ventricle | unknown | seventh decade human stage |
3354 | ED_DT4_LV0_premrna | DT4 | LV | TTN | 1832 | 3079.880615 | 0.000304 | 0.000304 | 0.094364 | 0.007266 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | male | heart left ventricle | unknown | seventh decade human stage |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
310662 | IC_H04_LV0_premrna | IC_H04 | LV | PVneg | 1894 | 2627.427490 | 0.003090 | 0.000813 | 0.009396 | 0.004142 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | female | heart left ventricle | unknown | seventh decade human stage |
310663 | IC_H04_LV0_premrna | IC_H04 | LV | PVneg | 2941 | 3399.208252 | 0.001667 | 0.001334 | 0.028765 | 0.005743 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | female | heart left ventricle | unknown | seventh decade human stage |
310664 | IC_H04_LV0_premrna | IC_H04 | LV | PVneg | 2479 | 3142.647461 | 0.000858 | 0.000572 | 0.020787 | 0.004737 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | female | heart left ventricle | unknown | seventh decade human stage |
310665 | IC_H04_LV0_premrna | IC_H04 | LV | PVneg | 2645 | 3106.231201 | 0.001000 | 0.002000 | 0.020787 | 0.003347 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | female | heart left ventricle | unknown | seventh decade human stage |
310666 | IC_H04_LV0_premrna | IC_H04 | LV | PVneg | 474 | 1415.674561 | 0.000000 | 0.001773 | 0.063112 | 0.013699 | ... | UBERON:0002084 | HsapDv:0000241 | cardiac muscle cell | 10x 3' v3 | dilated cardiomyopathy | Homo sapiens | female | heart left ventricle | unknown | seventh decade human stage |
69150 rows × 30 columns
[6]:
orig_h5ad.obs.columns
[6]:
Index(['Sample', 'Patient', 'Region_x', 'Primary.Genetic.Diagnosis', 'n_genes',
'n_counts', 'percent_mito', 'percent_ribo', 'scrublet_score_z',
'scrublet_score_log', 'solo_score', 'cell_states', 'Assigned',
'ethnicity_ontology_term_id', 'disease_ontology_term_id',
'cell_type_ontology_term_id', 'sex_ontology_term_id',
'assay_ontology_term_id', 'organism_ontology_term_id',
'is_primary_data', 'tissue_ontology_term_id',
'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease',
'organism', 'sex', 'tissue', 'ethnicity', 'development_stage'],
dtype='object')
[7]:
orig_h5ad.var
[7]:
vst.mean | vst.variance | vst.variance.expected | vst.variance.standardized | vst.variable | feature_biotype | feature_is_filtered | feature_name | feature_reference | |
---|---|---|---|---|---|---|---|---|---|
gene_ids | |||||||||
ENSG00000243485 | 0.000071 | 0.000071 | 0.000074 | 0.958714 | 0 | gene | False | MIR1302-2HG | NCBITaxon:9606 |
ENSG00000237613 | 0.000003 | 0.000003 | 0.000003 | 0.997981 | 0 | gene | False | FAM138A | NCBITaxon:9606 |
ENSG00000186092 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | OR4F5 | NCBITaxon:9606 |
ENSG00000238009 | 0.031093 | 0.032638 | 0.035845 | 0.910519 | 0 | gene | False | RP11-34P13.7 | NCBITaxon:9606 |
ENSG00000239945 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | RP11-34P13.8 | NCBITaxon:9606 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ENSG00000277856 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | ENSG00000277856 | NCBITaxon:9606 |
ENSG00000275063 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | ENSG00000275063 | NCBITaxon:9606 |
ENSG00000271254 | 0.005992 | 0.006174 | 0.006542 | 0.943884 | 0 | gene | False | ENSG00000271254 | NCBITaxon:9606 |
ENSG00000277475 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | ENSG00000277475 | NCBITaxon:9606 |
ENSG00000268674 | 0.000003 | 0.000003 | 0.000003 | 0.997981 | 0 | gene | False | ENSG00000268674 | NCBITaxon:9606 |
33234 rows × 9 columns
Determine all of the cell types available in the .h5ad file. The options available in any other obs column can be listed in the same way as shown here for cell_type.
[8]:
orig_h5ad.obs["cell_type"]
[8]:
3350 cardiac muscle cell
3351 cardiac muscle cell
3352 cardiac muscle cell
3353 cardiac muscle cell
3354 cardiac muscle cell
...
310662 cardiac muscle cell
310663 cardiac muscle cell
310664 cardiac muscle cell
310665 cardiac muscle cell
310666 cardiac muscle cell
Name: cell_type, Length: 69150, dtype: category
Categories (1, object): ['cardiac muscle cell']
Obtain a single cell type from the h5ad file using a single ‘obs’ column, or use the option below to select a single cell type based on multiple delimiters.
[9]:
orig_h5ad.obs["Primary.Genetic.Diagnosis"]
[9]:
3350 TTN
3351 TTN
3352 TTN
3353 TTN
3354 TTN
...
310662 PVneg
310663 PVneg
310664 PVneg
310665 PVneg
310666 PVneg
Name: Primary.Genetic.Diagnosis, Length: 69150, dtype: category
Categories (11, object): ['DES', 'DSP', 'FKTN', 'FLNC', ..., 'RBM20', 'TNNC1', 'TNNT2', 'TTN']
[11]:
orig_h5ad.obs["disease"]
[11]:
3350 dilated cardiomyopathy
3351 dilated cardiomyopathy
3352 dilated cardiomyopathy
3353 dilated cardiomyopathy
3354 dilated cardiomyopathy
...
310662 dilated cardiomyopathy
310663 dilated cardiomyopathy
310664 dilated cardiomyopathy
310665 dilated cardiomyopathy
310666 dilated cardiomyopathy
Name: disease, Length: 69150, dtype: category
Categories (1, object): ['dilated cardiomyopathy']
[11]:
# Keep only the rows (cells) of orig_h5ad whose obs['cell_type'] equals
# 'cardiac muscle cell'. Note that the multi-criteria cell that follows
# reassigns onecell_h5ad, so run only the variant you need.
onecell_h5ad = orig_h5ad[orig_h5ad.obs['cell_type'] == 'cardiac muscle cell']
[12]:
# If you have multiple delimiters for your cell type of interest, edit the mask
# below to subset on several obs columns of orig_h5ad at once. The cell type is
# part of the mask on purpose: the mask is built from orig_h5ad, so leaving the
# cell type out would silently discard the single-cell-type filter.
# Check the column names of obs to use this correctly. Column names and category
# labels will vary between h5ad sets so please edit accordingly.
# example1 - disease and region only:
#   (cell_obs['disease'] == 'dilated cardiomyopathy') & (cell_obs['Region_x'] == 'LV')
# example2 - disease, region and a different genetic diagnosis:
#   (cell_obs['disease'] == 'dilated cardiomyopathy') & (cell_obs['Region_x'] == 'LV')
#   & (cell_obs['Primary.Genetic.Diagnosis'] == 'PVneg')
cell_obs = orig_h5ad.obs
subset_mask = (
    (cell_obs['cell_type'] == 'cardiac muscle cell')
    & (cell_obs['disease'] == 'dilated cardiomyopathy')
    & (cell_obs['Region_x'] == 'LV')
    & (cell_obs['Primary.Genetic.Diagnosis'] == 'TTN')
)
onecell_h5ad = orig_h5ad[subset_mask]
Establish the shape of the subset matrix and compare it to the original matrix. The onecell_h5ad object should have fewer rows (cells) than the original, while the number of columns (genes) stays the same.
[13]:
orig_h5ad.shape
[13]:
(69150, 33234)
[14]:
onecell_h5ad.shape
[14]:
(17965, 33234)
[15]:
onecell_h5ad.var
[15]:
vst.mean | vst.variance | vst.variance.expected | vst.variance.standardized | vst.variable | feature_biotype | feature_is_filtered | feature_name | feature_reference | |
---|---|---|---|---|---|---|---|---|---|
gene_ids | |||||||||
ENSG00000243485 | 0.000071 | 0.000071 | 0.000074 | 0.958714 | 0 | gene | False | MIR1302-2HG | NCBITaxon:9606 |
ENSG00000237613 | 0.000003 | 0.000003 | 0.000003 | 0.997981 | 0 | gene | False | FAM138A | NCBITaxon:9606 |
ENSG00000186092 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | OR4F5 | NCBITaxon:9606 |
ENSG00000238009 | 0.031093 | 0.032638 | 0.035845 | 0.910519 | 0 | gene | False | RP11-34P13.7 | NCBITaxon:9606 |
ENSG00000239945 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | RP11-34P13.8 | NCBITaxon:9606 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
ENSG00000277856 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | ENSG00000277856 | NCBITaxon:9606 |
ENSG00000275063 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | ENSG00000275063 | NCBITaxon:9606 |
ENSG00000271254 | 0.005992 | 0.006174 | 0.006542 | 0.943884 | 0 | gene | False | ENSG00000271254 | NCBITaxon:9606 |
ENSG00000277475 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0 | gene | False | ENSG00000277475 | NCBITaxon:9606 |
ENSG00000268674 | 0.000003 | 0.000003 | 0.000003 | 0.997981 | 0 | gene | False | ENSG00000268674 | NCBITaxon:9606 |
33234 rows × 9 columns
Create the subsetted h5ad file with the specific cell type - delimited by cell type and any other qualifiers needed.
[16]:
# Save the subset as a new .h5ad file (path is relative to the working
# directory); rename the file to describe your own subset.
onecell_h5ad.write_h5ad("subset_seidman_TTN.h5ad")
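The `dataset` variable holds the gene annotations (genomic position, chromosome, gene symbol and band, indexed by Ensembl gene ID) retrieved from biomart; they are needed for the Brooklyn plot.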
[17]:
# Attributes requested from biomart for every gene; "ensembl_gene_id" is
# needed because it becomes the index used to join with the expression means.
biomart_attributes = [
    "ensembl_gene_id",
    "start_position",
    "end_position",
    "chromosome_name",
    "hgnc_symbol",
    "band",
]
gene_annotations = sc.queries.biomart_annotations("hsapiens", biomart_attributes)
dataset = gene_annotations.set_index("ensembl_gene_id")
[18]:
dataset
[18]:
start_position | end_position | chromosome_name | hgnc_symbol | band | |
---|---|---|---|---|---|
ensembl_gene_id | |||||
ENSG00000210049 | 577 | 647 | MT | MT-TF | NaN |
ENSG00000211459 | 648 | 1601 | MT | MT-RNR1 | NaN |
ENSG00000210077 | 1602 | 1670 | MT | MT-TV | NaN |
ENSG00000210082 | 1671 | 3229 | MT | MT-RNR2 | NaN |
ENSG00000209082 | 3230 | 3304 | MT | MT-TL1 | NaN |
... | ... | ... | ... | ... | ... |
ENSG00000162437 | 64745075 | 64833232 | 1 | RAVER2 | p31.3 |
ENSG00000122432 | 84506300 | 84567379 | 1 | SPATA1 | p22.3 |
ENSG00000284882 | 84574114 | 84583620 | 1 | NaN | p22.3 |
ENSG00000289881 | 84614068 | 84621061 | 1 | NaN | p22.3 |
ENSG00000285325 | 84785427 | 84786714 | 1 | NaN | p22.3 |
69305 rows × 5 columns
Convert the expression matrix (`.X`) of the subset into a dense NumPy array.
[19]:
# Expand the subset's expression matrix (.X) into a dense NumPy array.
# NOTE(review): this materialises every cell x gene value. For the
# 17965 x 33234 float32 subset reported above that is roughly 2.4 GB, and in
# this notebook `data` is only previewed in the following cell - consider
# slicing the rows you need before calling toarray() if memory is tight.
data = onecell_h5ad.X.toarray()
This demonstrates what the array values look like.
[20]:
data[:5]
[20]:
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)
This indicates how many genes are in the array (length), which can be important for generating the Brooklyn plot. Ideally, the original h5ad array does not limit the number of genes from the original sequencing.
[21]:
onecell_h5ad.var_names
[21]:
Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906',
'ENSG00000241599', 'ENSG00000236601', 'ENSG00000284733',
'ENSG00000235146',
...
'ENSG00000277196', 'ENSG00000277630', 'ENSG00000278384',
'ENSG00000278633', 'ENSG00000276345', 'ENSG00000277856',
'ENSG00000275063', 'ENSG00000271254', 'ENSG00000277475',
'ENSG00000268674'],
dtype='object', name='gene_ids', length=33234)
[22]:
orig_h5ad.X
[22]:
<69150x33234 sparse matrix of type '<class 'numpy.float32'>'
with 161117875 stored elements in Compressed Sparse Row format>
This code generates mean values for each gene (ENSG ID), from raw expression data, and appends this to gene annotations from biomart.
[23]:
# Mean of the raw expression values for every gene, taken across all cells of
# the subset (axis 0 runs over cells).
xmen = onecell_h5ad.raw.X.mean(0)

# Label each mean with the gene ID stored in .raw itself. .raw keeps its own
# variable table, which can differ from onecell_h5ad.var when genes were
# filtered after .raw was set, so using .raw.var_names guarantees that every
# label matches its column of raw.X.
# np.asarray(...).ravel() flattens the result whether the mean came back as a
# 1 x n_genes matrix (sparse input) or as a 1-D array (dense input).
xmendf = pd.DataFrame(
    np.asarray(xmen).ravel(),
    columns=['xMean'],
    index=onecell_h5ad.raw.var_names,
)

# Inner join on the Ensembl gene ID index: genes that have no biomart
# annotation are dropped, so df_cd can have fewer rows than the gene count.
df_cd = pd.merge(xmendf, dataset, left_index=True, right_index=True)
df_cd
[23]:
xMean | start_position | end_position | chromosome_name | hgnc_symbol | band | |
---|---|---|---|---|---|---|
ENSG00000000003 | 0.004342 | 100627108 | 100639991 | X | TSPAN6 | q22.1 |
ENSG00000000005 | 0.001058 | 100584936 | 100599885 | X | TNMD | q22.1 |
ENSG00000000419 | 0.360610 | 50934867 | 50959140 | 20 | DPM1 | q13.13 |
ENSG00000000457 | 0.165547 | 169849631 | 169894267 | 1 | SCYL3 | q24.2 |
ENSG00000000460 | 0.155693 | 169662007 | 169854080 | 1 | C1orf112 | q24.2 |
... | ... | ... | ... | ... | ... | ... |
ENSG00000285492 | 0.002394 | 159051674 | 159121506 | 6 | NaN | q25.3 |
ENSG00000285505 | 0.054773 | 41956879 | 41994232 | 19 | NaN | q13.2 |
ENSG00000285508 | 0.000000 | 10413520 | 10431922 | 20 | NaN | p12.2 |
ENSG00000285509 | 0.012970 | 121024125 | 121113108 | 11 | TBCEL-TECTA | q23.3 |
ENSG00000285513 | 0.000000 | 116820645 | 116821541 | 11 | NaN | q23.3 |
33147 rows × 6 columns
This resets the index and adds a name for the first column (ENSG ID).
[24]:
# Move the Ensembl gene IDs out of the index into a first column called
# 'gene_ids'. The index is named before it is reset because reset_index() only
# produces a column called 'index' when the index is unnamed; renaming that
# column afterwards would silently do nothing if the merged index carried a
# name.
df_cd_export = df_cd.rename_axis('gene_ids').reset_index()
[25]:
df_cd_export
[25]:
gene_ids | xMean | start_position | end_position | chromosome_name | hgnc_symbol | band | |
---|---|---|---|---|---|---|---|
0 | ENSG00000000003 | 0.004342 | 100627108 | 100639991 | X | TSPAN6 | q22.1 |
1 | ENSG00000000005 | 0.001058 | 100584936 | 100599885 | X | TNMD | q22.1 |
2 | ENSG00000000419 | 0.360610 | 50934867 | 50959140 | 20 | DPM1 | q13.13 |
3 | ENSG00000000457 | 0.165547 | 169849631 | 169894267 | 1 | SCYL3 | q24.2 |
4 | ENSG00000000460 | 0.155693 | 169662007 | 169854080 | 1 | C1orf112 | q24.2 |
... | ... | ... | ... | ... | ... | ... | ... |
33142 | ENSG00000285492 | 0.002394 | 159051674 | 159121506 | 6 | NaN | q25.3 |
33143 | ENSG00000285505 | 0.054773 | 41956879 | 41994232 | 19 | NaN | q13.2 |
33144 | ENSG00000285508 | 0.000000 | 10413520 | 10431922 | 20 | NaN | p12.2 |
33145 | ENSG00000285509 | 0.012970 | 121024125 | 121113108 | 11 | TBCEL-TECTA | q23.3 |
33146 | ENSG00000285513 | 0.000000 | 116820645 | 116821541 | 11 | NaN | q23.3 |
33147 rows × 7 columns
This command exports a csv file that can be used to pick (automated or manually), genes spread across the whole genome for Brooklyn plots.
[26]:
# Write the per-gene table (gene ID, mean expression, biomart annotations) to
# CSV; index=False leaves out the 0..n-1 row numbers.
df_cd_export.to_csv("seidmanttn_var_biomart.csv", index=False)
This part of the code selects the top 3,500 genes by raw xMean, then sorts them by chromosome position and generates two gene lists: all 3,500 genes, and 350 interspersed genes (every tenth gene). These lists are needed to generate the Brooklyn plot in the next Python step.
[27]:
# Rank genes from highest to lowest mean expression and renumber the rows
# 0..n-1 so that row position equals expression rank.
df_cd_export2 = (
    df_cd_export
    .sort_values(by='xMean', ascending=False)
    .reset_index(drop=True)
)
[28]:
# Keep the 3,500 most highly expressed genes (the first 3,500 rows of the
# expression-ranked table), with all columns.
df_cd_export_top = df_cd_export2.head(3500)
[29]:
df_cd_export_top
[29]:
gene_ids | xMean | start_position | end_position | chromosome_name | hgnc_symbol | band | |
---|---|---|---|---|---|---|---|
0 | ENSG00000251562 | 1618.217773 | 65497688 | 65506516 | 11 | MALAT1 | q13.1 |
1 | ENSG00000198626 | 146.410553 | 237042184 | 237833988 | 1 | RYR2 | q43 |
2 | ENSG00000155657 | 100.666641 | 178525989 | 178830802 | 2 | TTN | q31.2 |
3 | ENSG00000245532 | 39.976116 | 65422774 | 65445540 | 11 | NEAT1 | q13.1 |
4 | ENSG00000183023 | 38.942490 | 40097270 | 40611053 | 2 | SLC8A1 | p22.1 |
... | ... | ... | ... | ... | ... | ... | ... |
3495 | ENSG00000231672 | 0.338896 | 217284019 | 217756593 | 2 | DIRC3 | q35 |
3496 | ENSG00000059758 | 0.338786 | 96278261 | 96400480 | 12 | CDK17 | q23.1 |
3497 | ENSG00000134375 | 0.338454 | 201955503 | 201970664 | 1 | TIMM17A | q32.1 |
3498 | ENSG00000070018 | 0.338398 | 12116025 | 12267044 | 12 | LRP6 | p13.2 |
3499 | ENSG00000110811 | 0.338174 | 6828407 | 6839847 | 12 | P3H3 | p13.31 |
3500 rows × 7 columns
[30]:
# Order the selected genes by chromosome and, within a chromosome, by gene end
# position, then renumber the rows 0..n-1.
# NOTE(review): the displayed chromosome names include 'X', 'Y' and 'MT', so
# the column presumably holds strings and sorts as text ('10' before '2') -
# confirm this ordering is acceptable for the plot.
df_cd_export_genelist = (
    df_cd_export_top
    .sort_values(by=['chromosome_name', 'end_position'])
    .reset_index(drop=True)
)
[31]:
df_cd_export_genelist
[31]:
gene_ids | xMean | start_position | end_position | chromosome_name | hgnc_symbol | band | |
---|---|---|---|---|---|---|---|
0 | ENSG00000187642 | 0.430749 | 975198 | 982117 | 1 | PERM1 | p36.33 |
1 | ENSG00000221978 | 0.356993 | 1385711 | 1399335 | 1 | CCNL2 | p36.33 |
2 | ENSG00000189409 | 0.413938 | 1632163 | 1635263 | 1 | MMP23B | p36.33 |
3 | ENSG00000078369 | 0.475395 | 1785285 | 1892292 | 1 | GNB1 | p36.33 |
4 | ENSG00000142611 | 0.866854 | 3069168 | 3438621 | 1 | PRDM16 | p36.32 |
... | ... | ... | ... | ... | ... | ... | ... |
3495 | ENSG00000114374 | 0.857443 | 12537650 | 12860839 | Y | USP9Y | q11.221 |
3496 | ENSG00000183878 | 1.383058 | 13234577 | 13480673 | Y | UTY | q11.221 |
3497 | ENSG00000176728 | 3.638888 | 18772706 | 19077416 | Y | TTTY14 | q11.222 |
3498 | ENSG00000229236 | 0.921071 | 20464916 | 20575519 | Y | TTTY10 | q11.223 |
3499 | ENSG00000198692 | 0.371463 | 20575776 | 20593154 | Y | EIF1AY | q11.223 |
3500 rows × 7 columns
[32]:
# Full comparison list: only the first column (the gene IDs) of the
# position-sorted table, limited to its first 3,500 rows.
df_cd_export_againstlist = df_cd_export_genelist.iloc[:, :1].head(3500)
[33]:
df_cd_export_againstlist
[33]:
gene_ids | |
---|---|
0 | ENSG00000187642 |
1 | ENSG00000221978 |
2 | ENSG00000189409 |
3 | ENSG00000078369 |
4 | ENSG00000142611 |
... | ... |
3495 | ENSG00000114374 |
3496 | ENSG00000183878 |
3497 | ENSG00000176728 |
3498 | ENSG00000229236 |
3499 | ENSG00000198692 |
3500 rows × 1 columns
[34]:
# Write the gene IDs of the full comparison list to againstlist.csv, without
# the row numbers.
df_cd_export_againstlist.to_csv("againstlist.csv", index=False)
[35]:
# Thin the position-sorted table to every tenth gene, starting at row 1
# (rows 1, 11, 21, ...), keeping all columns.
every_tenth_row = slice(1, None, 10)
df_cd_export_genelist_parse = df_cd_export_genelist.iloc[every_tenth_row]
[36]:
# Interspersed gene list: only the first column (the gene IDs) of the thinned
# table, limited to its first 350 rows.
df_cd_export_towrite = df_cd_export_genelist_parse.iloc[:, :1].head(350)
[37]:
df_cd_export_towrite
[37]:
gene_ids | |
---|---|
1 | ENSG00000221978 |
11 | ENSG00000180758 |
21 | ENSG00000175206 |
31 | ENSG00000037637 |
41 | ENSG00000169641 |
... | ... |
3451 | ENSG00000232593 |
3461 | ENSG00000229807 |
3471 | ENSG00000166432 |
3481 | ENSG00000101972 |
3491 | ENSG00000102125 |
350 rows × 1 columns
[38]:
# Write the gene IDs of the interspersed list to genelist.csv, without the row
# numbers.
df_cd_export_towrite.to_csv("genelist.csv", index=False)
[ ]: