from profile_binr import ProfileBin
import pandas as pd
# Download the Nestorowa 2016 example dataset (gzipped TSV) from the STREAM
# repository into the current working directory.
# curl flags: -f fail on HTTP errors, -O keep the remote file name,
# -L follow redirects.
!curl -fOL https://github.com/pinellolab/STREAM/raw/master/stream/tests/datasets/Nestorowa_2016/data_Nestorowa.tsv.gz
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 183 100 183 0 0 516 0 --:--:-- --:--:-- --:--:-- 515 100 29.7M 100 29.7M 0 0 11.2M 0 0:00:02 0:00:02 --:--:-- 27.4M0:24 0:00:02 0:00:22 5475k
Important: columns should be genes, and rows should be cells.
# Load the tab-separated expression matrix, using the file's first column as
# the index, then transpose it (.T) so that rows are cells and columns are
# genes.
data = pd.read_csv("data_Nestorowa.tsv.gz", compression="gzip", sep="\t", index_col=0).T
# Preview the first rows to check the orientation.
data.head()
Clec1b | Kdm3a | Coro2b | 8430408G22Rik | Clec9a | Phf6 | Usp14 | Tmem167b | Kbtbd7 | Rag2 | ... | Zfp438 | Rab18 | Mzb1 | B4galt6 | Rnf125 | Impact | Taf4b | Zfp521 | Hrh4 | Psma8 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
HSPC_025 | 0.0 | 4.891604 | 1.426148 | 0.0 | 0.0 | 2.599758 | 2.954035 | 6.357369 | 2.129140 | 1.426148 | ... | 1.426148 | 9.660368 | 1.426148 | 1.426148 | 2.12914 | 8.177546 | 1.426148 | 1.426148 | 0.0 | 7.869409 |
HSPC_031 | 0.0 | 6.877725 | 0.000000 | 0.0 | 0.0 | 2.423483 | 1.804914 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.699126 | 0.000000 | 6.562672 | 0.00000 | 5.439604 | 0.699126 | 0.000000 | 0.0 | 0.000000 |
HSPC_037 | 0.0 | 0.000000 | 6.913384 | 0.0 | 0.0 | 2.051659 | 8.265465 | 0.000000 | 1.363402 | 0.000000 | ... | 1.363402 | 8.885311 | 0.000000 | 1.363402 | 0.00000 | 8.068215 | 0.000000 | 2.051659 | 0.0 | 1.363402 |
LT-HSC_001 | 0.0 | 0.000000 | 8.178374 | 0.0 | 0.0 | 6.419817 | 3.453502 | 2.579528 | 2.579528 | 0.000000 | ... | 2.579528 | 6.501342 | 4.947883 | 0.000000 | 0.00000 | 0.000000 | 2.579528 | 8.178374 | 0.0 | 2.579528 |
HSPC_001 | 0.0 | 0.000000 | 9.475577 | 0.0 | 0.0 | 7.733370 | 1.478900 | 0.000000 | 10.045601 | 0.532906 | ... | 0.000000 | 1.693409 | 7.975432 | 8.561045 | 0.00000 | 6.539920 | 0.532906 | 0.000000 | 0.0 | 0.532906 |
5 rows × 4768 columns
# Wrap the cells x genes expression frame in a ProfileBin object.
probin = ProfileBin(data)
# Fit the object; %time is an IPython magic that reports CPU and wall time.
# NOTE(review): what fit() computes is not visible here -- presumably the
# per-gene criteria inspected below; confirm against the profile_binr docs.
%time probin.fit()
CPU times: user 24.3 s, sys: 182 ms, total: 24.5 s Wall time: 58.4 s
# Show the first rows of the criteria table (one row per gene).
probin.criteria.head()
Dip | BI | Kurtosis | DropOutRate | MeanNZ | DenPeak | Amplitude | Category | |
---|---|---|---|---|---|---|---|---|
Clec1b | 0.358107 | 1.635698 | 54.017736 | 0.876208 | 1.520978 | -0.007249 | 8.852181 | ZeroInf |
Kbtbd7 | 0.000000 | 2.137107 | 3.577214 | 0.571256 | 2.928988 | -0.000556 | 10.910051 | ZeroInf |
Mns1 | 0.000000 | 2.066577 | -1.260230 | 0.100242 | 4.743786 | 1.884848 | 10.862714 | Bimodal |
Nek1 | 0.000000 | 2.292238 | -0.196326 | 0.304348 | 3.339581 | 0.218894 | 10.153184 | Bimodal |
Zfp758 | 0.000000 | 2.656696 | 0.424673 | 0.410024 | 3.392367 | 0.032378 | 10.114596 | Bimodal |
# Binarize the expression data and time the call with the %time magic.
# In the preview below the result holds 0.0, 1.0 and NaN entries.
%time bindata = probin.binarize()
CPU times: user 35.8 s, sys: 89.8 ms, total: 35.9 s Wall time: 35.9 s
# Preview the first rows of the binarized frame.
bindata.head()
Clec1b | Kdm3a | Coro2b | 8430408G22Rik | Clec9a | Phf6 | Usp14 | Tmem167b | Kbtbd7 | Rag2 | ... | Zfp438 | Rab18 | Mzb1 | B4galt6 | Rnf125 | Impact | Taf4b | Zfp521 | Hrh4 | Psma8 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
HSPC_025 | NaN | 1.0 | NaN | NaN | NaN | 0.0 | 0.0 | 1.0 | NaN | NaN | ... | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | 1.0 |
HSPC_031 | NaN | 1.0 | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | NaN | NaN | ... | NaN | 0.0 | NaN | 1.0 | 0.0 | NaN | NaN | NaN | NaN | NaN |
HSPC_037 | NaN | 0.0 | 1.0 | NaN | NaN | 0.0 | 1.0 | 0.0 | NaN | NaN | ... | 1.0 | 1.0 | NaN | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN |
LT-HSC_001 | NaN | 0.0 | 1.0 | NaN | NaN | 1.0 | 0.0 | 0.0 | NaN | NaN | ... | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | NaN | 1.0 | NaN | NaN |
HSPC_001 | NaN | 0.0 | 1.0 | NaN | NaN | 1.0 | 0.0 | 0.0 | 1.0 | NaN | ... | NaN | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | NaN | NaN | NaN | NaN |
5 rows × 4768 columns