In this tutorial, we will cover
#! pip install perturb-tools==0.1.5
import pandas as pd
import perturb_tools as pt
We will download public CRISPR/Cas9 Knock-out dataset: TKO HeLa data.
# ! wget http://tko.ccbr.utoronto.ca/Data/readcount-HeLa-lib1.gz
# ! wget http://tko.ccbr.utoronto.ca/Data/readcount-HeLa-lib2.gz
# ! gunzip readcount-HeLa-lib1.gz -y
# ! gunzip readcount-HeLa-lib2.gz -y
# !cut -f 1,2 readcount-HeLa-lib1 > guide_info-HeLa-lib1.tsv
# !cut -f 1,3- readcount-HeLa-lib1 > guide_count-HeLa-lib1.tsv
# !cut -f 1,2 readcount-HeLa-lib2 > guide_info-HeLa-lib2.tsv
# !cut -f 1,3- readcount-HeLa-lib2 > guide_count-HeLa-lib2.tsv
Basic structure of Screen object contains 3 types of information:
Screen.X
: guide count matrix (numpy array with shape (n_guides, n_samples)Screen.guides
: guide RNA information ex) sequence, target element (pandas DataFrame with length n_guides)Screen.samples
: sample information that gave rise to the guide counts (pandas DataFrame with length n_samples)You can construct Screen object using any number of these three elements.
screen = pt.read_csv(X_path="guide_count-HeLa-lib1.tsv", guides_path="guide_info-HeLa-lib1.tsv", samples_path=None, sep="\t")
/data/pinello/SHARED_SOFTWARE/anaconda_latest/envs/jy_anbe_py38/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index. warnings.warn("Transforming to str index.", ImplicitModificationWarning)
screen.X
array([[310, 226, 338, ..., 49, 296, 469], [ 46, 1, 0, ..., 1, 52, 213], [239, 216, 285, ..., 250, 269, 363], ..., [508, 479, 248, ..., 331, 386, 566], [ 97, 50, 10, ..., 28, 23, 3], [ 91, 115, 130, ..., 76, 120, 390]])
Alternatively, you can manually read the file and initialize Screen object.
tbl = pd.read_csv("readcount-HeLa-lib1", sep = "\t")
tbl2 = pd.read_csv("readcount-HeLa-lib2", sep = "\t")
tbl
GENE_CLONE | GENE | T08A | T08B | T08C | T12A | T12B | T12C | T15A | T15B | T15C | T18A | T18B | T18C | T0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A1BG_CACCTTCGAGCTGCTGCGCG | A1BG | 310 | 226 | 338 | 356 | 249 | 224 | 186 | 60 | 296 | 125 | 49 | 296 | 469 |
1 | A1BG_AAGAGCGCCTCGGTCCCAGC | A1BG | 46 | 1 | 0 | 7 | 22 | 142 | 0 | 1 | 52 | 0 | 1 | 52 | 213 |
2 | A1BG_TGGACTTCCAGCTACGGCGC | A1BG | 239 | 216 | 285 | 117 | 244 | 116 | 172 | 298 | 269 | 119 | 250 | 269 | 363 |
3 | A1BG_CACTGGCGCCATCGAGAGCC | A1BG | 289 | 83 | 166 | 164 | 111 | 14 | 184 | 160 | 214 | 122 | 137 | 214 | 678 |
4 | A1BG_GCTCGGGCTTGTCCACAGGA | A1BG | 205 | 34 | 217 | 205 | 148 | 355 | 326 | 100 | 432 | 212 | 85 | 432 | 559 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
91315 | luciferase_CCTCTAGAGGATGGAACCGC | luciferase | 153 | 101 | 264 | 121 | 155 | 317 | 245 | 103 | 490 | 165 | 88 | 490 | 200 |
91316 | luciferase_ACAACTTTACCGACCGCGCC | luciferase | 159 | 101 | 152 | 363 | 232 | 412 | 216 | 441 | 248 | 159 | 378 | 248 | 170 |
91317 | luciferase_CTTGTCGTATCCCTGGAAGA | luciferase | 508 | 479 | 248 | 206 | 294 | 631 | 421 | 394 | 386 | 270 | 331 | 386 | 566 |
91318 | luciferase_GGCTATGAAGAGATACGCCC | luciferase | 97 | 50 | 10 | 0 | 14 | 61 | 29 | 33 | 23 | 19 | 28 | 23 | 3 |
91319 | luciferase_GGCATGCGAGAATCTGACGC | luciferase | 91 | 115 | 130 | 218 | 60 | 54 | 5 | 88 | 120 | 4 | 76 | 120 | 390 |
91320 rows × 15 columns
pd.DataFrame(tbl.columns[2:]).rename(columns={0:"index"}).set_index("index")
index |
---|
T08A |
T08B |
T08C |
T12A |
T12B |
T12C |
T15A |
T15B |
T15C |
T18A |
T18B |
T18C |
T0 |
sample_df = pd.DataFrame(tbl.columns[2:]).rename(columns={0:"index"}).set_index("index")
sample_df.index.str[-1]
Index(['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', '0'], dtype='object', name='index')
sample_df['time'] = sample_df.index.to_series().str[1:-1].map(lambda s: int(s) if s else -1)
sample_df
time | |
---|---|
index | |
T08A | 8 |
T08B | 8 |
T08C | 8 |
T12A | 12 |
T12B | 12 |
T12C | 12 |
T15A | 15 |
T15B | 15 |
T15C | 15 |
T18A | 18 |
T18B | 18 |
T18C | 18 |
T0 | -1 |
def make_screen(tbl, guide_name_col = "GENE_CLONE"):
tbl = tbl.rename(columns={guide_name_col:"name"}).set_index("name")
sample_df = pd.DataFrame(tbl.columns[1:]).rename(columns={0:"index"}).set_index("index")
sample_df["replicate"] = sample_df.index.to_series().str[-1]
sample_df["time"] = sample_df.index.to_series().str[1:-1].map(lambda s: int(s) if s else -1)
print(sample_df)
return pt.Screen(X=tbl.values[:,1:].astype(int), guides=tbl.iloc[:,:2],
samples=sample_df,
)
adata = make_screen(tbl)
bdata = make_screen(tbl2)
replicate time index T08A A 8 T08B B 8 T08C C 8 T12A A 12 T12B B 12 T12C C 12 T15A A 15 T15B B 15 T15C C 15 T18A A 18 T18B B 18 T18C C 18 T0 0 -1 replicate time index T08A A 8 T08B B 8 T08C C 8 T12A A 12 T12B B 12 T12C C 12 T15A A 15 T15B B 15 T15C C 15 T18A A 18 T18B B 18 T18C C 18 T0 0 -1
adata.guides
GENE | T08A | |
---|---|---|
name | ||
A1BG_CACCTTCGAGCTGCTGCGCG | A1BG | 310 |
A1BG_AAGAGCGCCTCGGTCCCAGC | A1BG | 46 |
A1BG_TGGACTTCCAGCTACGGCGC | A1BG | 239 |
A1BG_CACTGGCGCCATCGAGAGCC | A1BG | 289 |
A1BG_GCTCGGGCTTGTCCACAGGA | A1BG | 205 |
... | ... | ... |
luciferase_CCTCTAGAGGATGGAACCGC | luciferase | 153 |
luciferase_ACAACTTTACCGACCGCGCC | luciferase | 159 |
luciferase_CTTGTCGTATCCCTGGAAGA | luciferase | 508 |
luciferase_GGCTATGAAGAGATACGCCC | luciferase | 97 |
luciferase_GGCATGCGAGAATCTGACGC | luciferase | 91 |
91320 rows × 2 columns
adata.samples
replicate | time | |
---|---|---|
index | ||
T08A | A | 8 |
T08B | B | 8 |
T08C | C | 8 |
T12A | A | 12 |
T12B | B | 12 |
T12C | C | 12 |
T15A | A | 15 |
T15B | B | 15 |
T15C | C | 15 |
T18A | A | 18 |
T18B | B | 18 |
T18C | C | 18 |
T0 | 0 | -1 |
adata_cut = adata[adata.guides.GENE == "A1BG", :]
adata_cut
Genome Editing Screen comprised of n_guides x n_conditions = 6 x 13 guides: 'GENE', 'T08A' samples: 'replicate', 'time' samples_m: samples_p: layers: uns:
adata_t8 = adata[:, adata.samples.time == 8]
adata_t8
Genome Editing Screen comprised of n_guides x n_conditions = 91320 x 3 guides: 'GENE', 'T08A' samples: 'replicate', 'time' samples_m: samples_p: layers: uns:
adata.X
array([[310, 226, 338, ..., 49, 296, 469], [ 46, 1, 0, ..., 1, 52, 213], [239, 216, 285, ..., 250, 269, 363], ..., [508, 479, 248, ..., 331, 386, 566], [ 97, 50, 10, ..., 28, 23, 3], [ 91, 115, 130, ..., 76, 120, 390]])
adata.write("./HeLa_lib1.h5ad")
# import anndata as ad
# adata_ann = ad.read_h5ad("HeLa_lib1.h5ad")
# adata_pt = pt.Screen.from_adata(adata_ann)
Compatible with .h5ad file output of AnnData.
If the guides and samples index are exactly the same, objects can be added (ex. technical replicates).
adata + adata
Genome Editing Screen comprised of n_guides x n_conditions = 91320 x 13 guides: 'GENE', 'T08A' samples: 'replicate', 'time' samples_m: samples_p: layers: uns:
Biological replicates can be concatenated along 'samples' axis.
import importlib
importlib.reload(pt)
<module 'perturb_tools' from '/data/pinello/PROJECTS/2021_08_ANBE/software/perturb-tools/perturb_tools/__init__.py'>
pt.concat((adata, adata))
/data/pinello/SHARED_SOFTWARE/anaconda_latest/envs/jy_anbe_py38/lib/python3.8/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`. utils.warn_names_duplicates("obs")
Genome Editing Screen comprised of n_guides x n_conditions = 182640 x 13 guides: 'GENE', 'T08A' samples: samples_m: samples_p: layers: uns:
adata.log_norm()
adata
Genome Editing Screen comprised of n_guides x n_conditions = 91320 x 13 guides: 'GENE', 'T08A' samples: 'replicate', 'time' samples_m: samples_p: layers: 'lognorm_counts' uns:
adata.layers['lognorm_counts']
array([[3.83616552, 3.72344557, 4.36367384, ..., 2.09998439, 4.01236793, 3.57526448], [1.57091719, 0.07590494, 0. , ..., 0.09367186, 1.87158158, 2.57512434], [3.4906079 , 3.66320179, 4.13058593, ..., 4.15142457, 3.88332224, 3.24056381], ..., [4.50880773, 4.74827466, 3.94220047, ..., 4.53632282, 4.37437528, 3.82558522], [2.366281 , 1.88795355, 0.65946991, ..., 1.5252234 , 1.1218406 , 0.09740613], [2.29249441, 2.85050474, 3.09314233, ..., 2.60840185, 2.8353031 , 3.33346213]])
Calculating the LFC between T=18 vs T=8
adata.samples
replicate | time | |
---|---|---|
index | ||
T08A | A | 8 |
T08B | B | 8 |
T08C | C | 8 |
T12A | A | 12 |
T12B | B | 12 |
T12C | C | 12 |
T15A | A | 15 |
T15B | B | 15 |
T15C | C | 15 |
T18A | A | 18 |
T18B | B | 18 |
T18C | C | 18 |
T0 | 0 | -1 |
adata.log_fold_change("T18A", "T08A")
adata.guides
GENE | T08A | T18A_T08A.lfc | |
---|---|---|---|
name | |||
A1BG_CACCTTCGAGCTGCTGCGCG | A1BG | 310 | -0.341628 |
A1BG_AAGAGCGCCTCGGTCCCAGC | A1BG | 46 | -1.570917 |
A1BG_TGGACTTCCAGCTACGGCGC | A1BG | 239 | -0.060598 |
A1BG_CACTGGCGCCATCGAGAGCC | A1BG | 289 | -0.279654 |
A1BG_GCTCGGGCTTGTCCACAGGA | A1BG | 205 | 0.912812 |
... | ... | ... | ... |
luciferase_CCTCTAGAGGATGGAACCGC | luciferase | 153 | 0.946178 |
luciferase_ACAACTTTACCGACCGCGCC | luciferase | 159 | 0.848203 |
luciferase_CTTGTCGTATCCCTGGAAGA | luciferase | 508 | 0.026328 |
luciferase_GGCTATGAAGAGATACGCCC | luciferase | 97 | -1.009470 |
luciferase_GGCATGCGAGAATCTGACGC | luciferase | 91 | -1.882511 |
91320 rows × 3 columns
Calculating the T=18 vs T=8 across all replicates
adata_t = adata[:, adata.samples.replicate != "0"]
adata
Genome Editing Screen comprised of n_guides x n_conditions = 91320 x 13 guides: 'GENE', 'T08A', 'T18A_T08A.lfc' samples: 'replicate', 'time' samples_m: samples_p: layers: 'lognorm_counts' uns:
adata_t
Genome Editing Screen comprised of n_guides x n_conditions = 91320 x 12 guides: 'GENE', 'T08A', 'T18A_T08A.lfc' samples: 'replicate', 'time' samples_m: samples_p: layers: 'lognorm_counts' uns:
adata_t.log_fold_change_reps(18, 8, rep_col="replicate", compare_col="time")
A.18_8.lfc | B.18_8.lfc | C.18_8.lfc | |
---|---|---|---|
name | |||
A1BG_CACCTTCGAGCTGCTGCGCG | -0.341628 | -1.623461 | -0.351306 |
A1BG_AAGAGCGCCTCGGTCCCAGC | -1.570917 | 0.017767 | 1.871582 |
A1BG_TGGACTTCCAGCTACGGCGC | -0.060598 | 0.488223 | -0.247264 |
A1BG_CACTGGCGCCATCGAGAGCC | -0.279654 | 0.893945 | 0.169561 |
A1BG_GCTCGGGCTTGTCCACAGGA | 0.912812 | 1.240359 | 0.766479 |
... | ... | ... | ... |
luciferase_CCTCTAGAGGATGGAACCGC | 0.946178 | 0.096590 | 0.677018 |
luciferase_ACAACTTTACCGACCGCGCC | 0.848203 | 2.029437 | 0.480288 |
luciferase_CTTGTCGTATCCCTGGAAGA | 0.026328 | -0.211952 | 0.432175 |
luciferase_GGCTATGAAGAGATACGCCC | -1.009470 | -0.362730 | 0.462371 |
luciferase_GGCATGCGAGAATCTGACGC | -1.882511 | -0.242103 | -0.257839 |
91320 rows × 3 columns
Aggregate the LFCs based on aggregate_fn [median, mean, sd]
.
adata_t.log_fold_change_agg(8, 18, agg_col="replicate", compare_col="time", agg_fn = "median")
adata_t.guides
GENE | T08A | T18A_T08A.lfc | 8_18.lfc.median | |
---|---|---|---|---|
name | ||||
A1BG_CACCTTCGAGCTGCTGCGCG | A1BG | 310 | -0.341628 | 0.351306 |
A1BG_AAGAGCGCCTCGGTCCCAGC | A1BG | 46 | -1.570917 | -0.017767 |
A1BG_TGGACTTCCAGCTACGGCGC | A1BG | 239 | -0.060598 | 0.060598 |
A1BG_CACTGGCGCCATCGAGAGCC | A1BG | 289 | -0.279654 | -0.169561 |
A1BG_GCTCGGGCTTGTCCACAGGA | A1BG | 205 | 0.912812 | -0.912812 |
... | ... | ... | ... | ... |
luciferase_CCTCTAGAGGATGGAACCGC | luciferase | 153 | 0.946178 | -0.677018 |
luciferase_ACAACTTTACCGACCGCGCC | luciferase | 159 | 0.848203 | -0.848203 |
luciferase_CTTGTCGTATCCCTGGAAGA | luciferase | 508 | 0.026328 | -0.026328 |
luciferase_GGCTATGAAGAGATACGCCC | luciferase | 97 | -1.009470 | 0.362730 |
luciferase_GGCATGCGAGAATCTGACGC | luciferase | 91 | -1.882511 | 0.257839 |
91320 rows × 4 columns
adata.to_Excel("Hela_lib1.xlsx")
Writing to: Hela_lib1.xlsx Sheet 1: X Sheet 2: lognorm_counts Sheet 3: guides Sheet 4: samples
adata.to_mageck_input("Hela_mageck_input.txt", target_column="GENE")
! head Hela_mageck_input.txt
sgRNA gene T08A T08B T08C T12A T12B T12C T15A T15B T15C T18A T18B T18C T0 A1BG_CACCTTCGAGCTGCTGCGCG A1BG 310 226 338 356 249 224 186 60 296 125 49 296 469 A1BG_AAGAGCGCCTCGGTCCCAGC A1BG 46 1 0 7 22 142 0 1 52 0 1 52 213 A1BG_TGGACTTCCAGCTACGGCGC A1BG 239 216 285 117 244 116 172 298 269 119 250 269 363 A1BG_CACTGGCGCCATCGAGAGCC A1BG 289 83 166 164 111 14 184 160 214 122 137 214 678 A1BG_GCTCGGGCTTGTCCACAGGA A1BG 205 34 217 205 148 355 326 100 432 212 85 432 559 A1BG_CAAGAGAAAGACCACGAGCA A1BG 389 331 468 1074 364 158 664 286 499 464 235 499 647 A1CF_CGTGGCTATTTGGCATACAC A1CF 452 240 390 630 509 261 471 255 301 322 210 301 898 A1CF_GGTATACTCTCCTTGCAGCA A1CF 71 30 29 119 155 153 131 76 56 94 61 56 199 A1CF_GACATGGTATTGCAGTAGAC A1CF 207 227 223 118 141 173 176 198 42 118 166 42 271