#!/usr/bin/env python
# coding: utf-8
"""Merge per-sample HDF5 count matrices into a single AnnData file.

The script reads the sample table ``hise_meta_data_2023-11-19.csv``, loads
every HDF5 file listed in its ``file.path`` column as an AnnData object,
concatenates them and writes the result to ``adata_all_raw.h5ad``.
"""

# In[1]:

import concurrent.futures
import re

import anndata
import celltypist
import h5py
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse as scs
from celltypist import models


# In[2]:

def read_mat(h5_con):
    """Build the sparse count matrix stored under ``h5_con['matrix']``.

    Parameters
    ----------
    h5_con
        Open HDF5 handle (or any nested mapping) exposing the datasets
        ``data``, ``indices``, ``indptr`` and ``shape`` under ``'matrix'``.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix assembled from the compressed-sparse-column triplet, with the
        dimensions given by the stored ``shape`` dataset.
    """
    group = h5_con['matrix']
    return scs.csc_matrix(
        (
            group['data'][:],     # count values
            group['indices'][:],  # row indices
            group['indptr'][:],   # pointers for column positions
        ),
        shape=tuple(group['shape'][:]),  # matrix dimensions
    )


def read_obs(h5con):
    """Read the per-cell observations (metadata) into a DataFrame.

    Parameters
    ----------
    h5con
        Open HDF5 handle exposing ``matrix/barcodes`` and the group
        ``matrix/observations`` (one dataset per metadata column).

    Returns
    -------
    pandas.DataFrame
        A ``barcodes`` column followed by one column per dataset found in
        ``matrix/observations``. Byte strings are decoded as UTF-8.
    """
    group = h5con['matrix']

    # Initialise the DataFrame with the decoded cell barcodes.
    obs_df = pd.DataFrame({
        'barcodes': [bc.decode('UTF-8') for bc in group['barcodes'][:]],
    })

    observations = group['observations']
    for col in observations.keys():
        values = observations[col][:]
        # Decode byte strings. The length guard keeps an empty column (a file
        # without cells) from raising IndexError on ``values[0]``.
        if len(values) > 0 and isinstance(values[0], (bytes, bytearray)):
            values = [v.decode('UTF-8') for v in values]
        obs_df[col] = values

    return obs_df


def read_h5_anndata(h5_file):
    """Construct an AnnData object from one HDF5 file.

    Parameters
    ----------
    h5_file
        Path of the HDF5 file to read.

    Returns
    -------
    anndata.AnnData
        The transposed count matrix with the observation table as ``obs``
        and the (uniquified) feature names as ``var_names``.
    """
    # The context manager closes the file handle even if reading fails; the
    # original left one handle open per input file. Every dataset is copied
    # into memory with ``[:]``, so nothing refers to the file after it closes.
    with h5py.File(h5_file, mode='r') as h5_con:
        mat = read_mat(h5_con)
        genes = [g.decode('UTF-8') for g in h5_con['matrix']['features']['name'][:]]
        obs_df = read_obs(h5_con)

    # Transpose so that rows line up with the barcode table passed as ``obs``.
    adata = anndata.AnnData(mat.T, obs=obs_df)

    # Make sure the gene names are aligned and unique.
    adata.var_names = genes
    adata.var_names_make_unique()

    return adata


# In[3]:

meta_data = pd.read_csv("hise_meta_data_2023-11-19.csv")


# In[4]:

# NOTE(review): each per-file ``obs`` table carries a default integer index,
# so the concatenated observation names presumably repeat across files;
# consider ``index_unique`` in ``anndata.concat`` if unique names are needed.
results = [read_h5_anndata(file_name) for file_name in meta_data['file.path']]
adata = anndata.concat(results)


# In[5]:

adata.write_h5ad('adata_all_raw.h5ad')


# In[ ]: