import celltypist
from celltypist import models
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import re
import h5py
import scipy.sparse as scs
import concurrent.futures
import scanpy.external as sce
import gc
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor
import copy
def run_leiden(adata, resolution, key_added):
# Make a copy of adata for thread safety
adata_copy = copy.deepcopy(adata)
adata_clustering = sc.tl.leiden(adata_copy, resolution=resolution, key_added=key_added, copy=True)
return adata_clustering.obs
def run_leiden_parallel(adata, tasks):
with ProcessPoolExecutor(max_workers=5) as executor:
# Make deep copies of adata for each task to ensure thread safety
futures = [executor.submit(run_leiden, copy.deepcopy(adata), resolution, key_added) for resolution, key_added in tasks]
results = [future.result() for future in futures]
# Assign the results back to the original AnnData object
for result, (_, key_added) in zip(results, tasks):
adata.obs[key_added] = result[key_added]
return adata
adata=sc.read_h5ad('adata_processed.h5ad')
Myeloid_cell_population=["0","12","14","15","16","18","21","23","24"]
adata_subset=adata[adata.obs['leiden'].isin(Myeloid_cell_population)]
del adata
gc.collect()
12
adata_subset=adata_subset.raw.to_adata()
adata_subset.write_h5ad('Myeloid/Myeloidcells_raw.h5ad')
adata_subset=sc.read_h5ad('Myeloid/Myeloidcells_raw.h5ad')
adata_subset.raw= adata_subset
sc.pp.normalize_total(adata_subset, target_sum=1e4)
sc.pp.log1p(adata_subset)
sc.pp.highly_variable_genes(adata_subset)
adata_subset = adata_subset[:, adata_subset.var_names[adata_subset.var['highly_variable']]]
WARNING: adata.X seems to be already log-transformed.
sc.pp.scale(adata_subset)
sc.tl.pca(adata_subset, svd_solver='arpack')
sce.pp.harmony_integrate(adata_subset, 'cohort.cohortGuid',max_iter_harmony = 30)
2023-11-27 15:32:55,968 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... Computing initial centroids with sklearn.KMeans... 2023-11-27 15:35:45,952 - harmonypy - INFO - sklearn.KMeans initialization complete. sklearn.KMeans initialization complete. 2023-11-27 15:35:47,803 - harmonypy - INFO - Iteration 1 of 30 Iteration 1 of 30 2023-11-27 15:39:17,228 - harmonypy - INFO - Iteration 2 of 30 Iteration 2 of 30 2023-11-27 15:42:49,699 - harmonypy - INFO - Converged after 2 iterations Converged after 2 iterations
sc.pp.neighbors(adata_subset, n_neighbors=50,use_rep='X_pca_harmony', n_pcs=30)
sc.tl.umap(adata_subset,min_dist=0.05)
IOStream.flush timed out IOStream.flush timed out IOStream.flush timed out
adata_subset.write_h5ad('Myeloid/Myeloidcells_processed_pre_leiden.h5ad')
tasks = [(1, "leiden_resolution_1"),(1.5, "leiden_resolution_1.5"),(2, "leiden_resolution_2"),(2.5, "leiden_resolution_2.5"),(3, "leiden_resolution_3")]
adata_subset = run_leiden_parallel(adata_subset, tasks)
adata_subset.write_h5ad('Myeloid/Myeloidcells_processed.h5ad')
adata_subset=adata_subset.raw.to_adata()
sc.pp.normalize_total(adata_subset, target_sum=1e4)
sc.pp.log1p(adata_subset)
sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_1', method='wilcoxon')
df_resolution_1=sc.get.rank_genes_groups_df(adata_subset,group=None)
df_resolution_1.to_csv('Myeloid/Myeloid_res1.csv')
sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_1.5', method='wilcoxon')
df_resolution_1_5=sc.get.rank_genes_groups_df(adata_subset,group=None)
df_resolution_1_5.to_csv('Myeloid/Myeloid_res1.5.csv')
sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_2', method='wilcoxon')
df_resolution_2=sc.get.rank_genes_groups_df(adata_subset,group=None)
df_resolution_2.to_csv('Myeloid/Myeloid_res2.csv')
sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_2.5', method='wilcoxon')
df_resolution_2_5=sc.get.rank_genes_groups_df(adata_subset,group=None)
df_resolution_2_5.to_csv('Myeloid/Myeloid_res2.5.csv')
sc.tl.rank_genes_groups(adata_subset, 'leiden_resolution_3', method='wilcoxon')
df_resolution_3=sc.get.rank_genes_groups_df(adata_subset,group=None)
df_resolution_3.to_csv('Myeloid/Myeloid_res3.csv')
adata_subset
AnnData object with n_obs × n_vars = 397526 × 1109 obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'seurat_pbmc_type', 'seurat_pbmc_type_score', 'umap_1', 'umap_2', 'well_id', 'subject.biologicalSex', 'subject.ethnicity', 'subject.partnerCode', 'subject.race', 'subject.subjectGuid', 'cohort.cohortGuid', 'sample.visitName', 'sample.visitDetails', 'subject.birthYear', 'CMV.IgG.Serology.Result.Interpretation', 'BMI', 'predicted.celltype.l1.score', 'predicted.celltype.l1', 'predicted.celltype.l2.score', 'predicted.celltype.l2', 'predicted.celltype.l3.score', 'predicted.celltype.l3', 'predicted.celltype.l2.5.score', 'predicted.celltype.l2.5', 'predicted_labels_celltypist', 'majority_voting_celltypist', 'predicted_doublet', 'doublet_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden' var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std' uns: 'hvg', 'leiden', 'log1p', 'neighbors', 'pca', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap' varm: 'PCs' obsp: 'connectivities', 'distances'
adata_subset=sc.read_h5ad('Myeloid/Myeloidcells_processed.h5ad')
sc.tl.umap(adata_subset,min_dist=0.05)
adata_subset.write_h5ad('Myeloid/Myeloidcells_processed_20231107.h5ad')
sc.pl.umap(adata_subset, color=['doublet_score'], size=2,show=False,ncols=1 ,frameon=False)
sc.pl.umap(adata_subset, color=['predicted.celltype.l2.5'], size=2,show=False,ncols=1 ,frameon=False)
sc.pl.umap(adata_subset, color=['predicted_labels_celltypist'], size=2,show=False,ncols=1 ,frameon=False)
sc.pl.umap(adata_subset, color=['majority_voting_celltypist'], size=2,show=False,ncols=1 ,frameon=False)
sc.pl.umap(adata_subset, color=['leiden'],
size=0.5,show=False,ncols=1 ,frameon=False)
sc.pl.umap(adata_subset, color=['leiden_resolution_1','leiden_resolution_1.5','leiden_resolution_2'],
size=0.5,show=False,ncols=1 ,frameon=False)
sc.pl.umap(adata_subset, color=['CMV.IgG.Serology.Result.Interpretation'], size=2,show=False,ncols=1 ,frameon=False)
df=pd.read_csv('Myeloid/Myeloid_res2.csv')
df=df.groupby('group').head(20).reset_index(drop=True)
import matplotlib.pyplot as plt
groups = df.groupby('group')
fig, axs = plt.subplots(9, 6, figsize=(10, 30), squeeze=False)
# Loop through each group and create a scatter plot in the corresponding subplot
for i, (name, group) in enumerate(groups):
row, col = i // 6, i % 6
axs[row, col].scatter(group['scores'], group['names'])
axs[row, col].invert_yaxis()
axs[row, col].set_title(str(name)+" vs Rest")
axs[row, col].set_xlabel('U scores')
axs[row, col].set_ylabel('Gene')
fig.tight_layout()
plt.savefig('scatter_plot.png')
adata_subset=sc.read_h5ad('Myeloid/Myeloidcells_processed_20231107.h5ad')
adata_subset=adata_subset.raw.to_adata()
sc.pp.normalize_total(adata_subset, target_sum=1e4)
sc.pp.log1p(adata_subset)
sc.pl.umap(adata_subset, color=['ITGAX','MS4A1','CD22'],use_raw=False, size=2,show=False,ncols=1 ,frameon=False)