import pandas as pd
import numpy as np
# Read the histone gene table from the CSV file in the working directory;
# the trailing bare name makes the notebook render the resulting DataFrame.
d = pd.read_csv("histone_genes.csv")
d
Histone type | Histone variant | HGNC symbol | NCBI gene ID | Ensembl gene ID | Expr. timing | Expr. pattern | Biotype | Bona fide canonical | PMIDs | |
---|---|---|---|---|---|---|---|---|---|---|
0 | H1 | H1.0 | H1-0 | 3005 | ENSG00000189060 | RI | NaN | COD | NaN | 26689747 |
1 | H1 | H1.1 | H1-1 | 3024 | ENSG00000124610 | RD | NaN | COD | NaN | 26689747 |
2 | H1 | H1.2 | H1-2 | 3006 | ENSG00000187837 | Mixed | NaN | COD | NaN | 26689747 |
3 | H1 | H1.3 | H1-3 | 3007 | ENSG00000124575 | RD | NaN | COD | NaN | 26689747 |
4 | H1 | H1.4 | H1-4 | 3008 | ENSG00000168298 | RD | NaN | COD | NaN | 26689747 |
5 | H1 | H1.5 | H1-5 | 3009 | ENSG00000184357 | RD | NaN | COD | NaN | 26689747 |
6 | H1 | TS H1.6 | H1-6 | 3010 | ENSG00000187475 | RD | TS | COD | NaN | 26689747 |
7 | H1 | TS H1.7 | H1-7 | 341567 | ENSG00000187166 | RI | TS | COD | NaN | 26689747 |
8 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 | RI | OO | COD | NaN | 26689747 |
9 | H1 | TS H1.9(?) | H1-9P | 373861 | ENSG00000188662 | RI | TS | COD | NaN | 12920187 26689747 17852044 |
10 | H1 | H1.10 | H1-10 | 8971 | ENSG00000184897 | RI | NaN | COD | NaN | 26689747 |
11 | H1 | NaN | H1-12P | 387325 | ENSG00000216331 | NaN | NaN | PS | NaN | NaN |
12 | H2A | TS H2A.1 | H2AC1 | 221613 | ENSG00000164508 | Mixed | TS | COD | NaN | 2011515 7068607 24506885 |
13 | H2A | NaN | H2AC2P | 387319 | ENSG00000216436 | NaN | NaN | PS | canonical | 12408966 25731851 |
14 | H2A | NaN | H2AC3P | 85303 | ENSG00000242387 | NaN | NaN | PS | NaN | NaN |
15 | H2A | canonical H2A | H2AC4 | 8335 | ENSG00000278463 | RD | NaN | COD | canonical | 12408966 25731851 |
16 | H2A | NaN | H2AC5P | 10341 | ENSG00000234816 | NaN | NaN | PS | NaN | NaN |
17 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | RD | NaN | COD | canonical | 12408966 25731851 |
18 | H2A | canonical H2A | H2AC7 | 3013 | ENSG00000196866 | RD | NaN | COD | canonical | 12408966 25731851 |
19 | H2A | canonical H2A | H2AC8 | 3012 | ENSG00000277075 | RD | NaN | COD | canonical | 12408966 25731851 |
20 | H2A | NaN | H2AC9P | 387323 | ENSG00000218281 | NaN | NaN | PS | NaN | NaN |
21 | H2A | NaN | H2AC10P | 8333 | ENSG00000218690 | NaN | NaN | PS | NaN | NaN |
22 | H2A | canonical H2A | H2AC11 | 8969 | ENSG00000196787 | RD | NaN | COD | canonical | 12408966 25731851 |
23 | H2A | canonical H2A | H2AC12 | 85235 | ENSG00000274997 | RD | NaN | COD | canonical | 12408966 25731851 |
24 | H2A | canonical H2A | H2AC13 | 8329 | ENSG00000196747 | RD | NaN | COD | canonical | 12408966 25731851 |
25 | H2A | canonical H2A | H2AC14 | 8331 | ENSG00000276368 | RD | NaN | COD | canonical | 12408966 25731851 |
26 | H2A | canonical H2A | H2AC15 | 8330 | ENSG00000275221 | RD | NaN | COD | canonical | 12408966 25731851 |
27 | H2A | canonical H2A | H2AC16 | 8332 | ENSG00000276903 | RD | NaN | COD | canonical | 12408966 25731851 |
28 | H2A | canonical H2A | H2AC17 | 8336 | ENSG00000278677 | RD | NaN | COD | canonical | 12408966 25731851 |
29 | H2A | canonical H2A | H2AC18 | 8337 | ENSG00000203812 | RD | NaN | COD | canonical | 12408966 25731851 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
103 | H3 | canonical H3.2 | H3C14 | 126961 | ENSG00000203811 | RD | NaN | COD | canonical | 12408966 |
104 | H3 | canonical H3.1 | H3C15 | 333932 | ENSG00000203852 | RD | NaN | COD | canonical | 12408966 |
105 | H3 | H3.Y.1 | H3Y1 | 391769 | ENSG00000269466 | RI | NaN | COD | NaN | 20819935 |
106 | H3 | H3.Y.2 | H3Y2 | 340096 | ENSG00000268799 | RI | NaN | COD | NaN | 20819935 |
107 | H3 | canonical H3(?) | H3-2 | 440686 | ENSG00000273213 | RD | NaN | COD | NaN | 12408966 |
108 | H3 | H3.3 | H3-3A | 3020 | ENSG00000163041 | RI | NaN | COD | NaN | 19412883 |
109 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | RI | NaN | COD | NaN | 19412883 |
110 | H3 | TS H3.4 | H3-4 | 8290 | ENSG00000168148 | RI | TS | COD | NaN | 8986613 |
111 | H3 | H3.5 | H3-5 | 440093 | ENSG00000188375 | RI | TS | COD | NaN | 21274551 |
112 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | RI | NaN | COD | NaN | 23324462 |
113 | H3 | NaN | H3P26 | 10338 | ENSG00000224447 | NaN | NaN | PS | NaN | NaN |
114 | H3 | NaN | H3P4 | 106479023 | ENSG00000213244 | NaN | NaN | PS | NaN | NaN |
115 | H3 | NaN | H3P37 | 664611 | ENSG00000270433 | NaN | NaN | PS | NaN | NaN |
116 | H3 | NaN | H3P38 | 654505 | ENSG00000259389 | NaN | NaN | PS | NaN | NaN |
117 | H4 | canonical H4 | H4C1 | 8359 | ENSG00000278637 | RD | NaN | COD | canonical | 12408966 |
118 | H4 | canonical H4 | H4C2 | 8366 | ENSG00000278705 | RD | NaN | COD | canonical | 12408966 |
119 | H4 | canonical H4 | H4C3 | 8364 | ENSG00000197061 | RD | NaN | COD | canonical | 12408966 |
120 | H4 | canonical H4 | H4C4 | 8360 | ENSG00000277157 | RD | NaN | COD | canonical | 12408966 |
121 | H4 | canonical H4 | H4C5 | 8367 | ENSG00000276966 | RD | NaN | COD | canonical | 12408966 |
122 | H4 | canonical H4 | H4C6 | 8361 | ENSG00000274618 | RD | NaN | COD | canonical | 12408966 |
123 | H4 | canonical H4 | H4C7 | 8369 | ENSG00000275663 | RD | NaN | COD | canonical | 12408966 |
124 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | RD | NaN | COD | canonical | 12408966 |
125 | H4 | canonical H4 | H4C9 | 8294 | ENSG00000276180 | RD | NaN | COD | canonical | 12408966 |
126 | H4 | NaN | H4C10P | 10337 | ENSG00000217862 | NaN | NaN | PS | NaN | NaN |
127 | H4 | canonical H4 | H4C11 | 8363 | ENSG00000197238 | RD | NaN | COD | canonical | 12408966 |
128 | H4 | canonical H4 | H4C12 | 8362 | ENSG00000273542 | RD | NaN | COD | canonical | 12408966 |
129 | H4 | canonical H4 | H4C13 | 8368 | ENSG00000275126 | RD | NaN | COD | canonical | 12408966 |
130 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | RD | NaN | COD | canonical | 12408966 |
131 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | RD | NaN | COD | canonical | 12408966 |
132 | H4 | canonical H4 | H4-16 | 121504 | ENSG00000197837 | RD | NaN | COD | canonical | 12408966 |
133 rows × 10 columns
# Keep only the rows whose Biotype is 'COD' and narrow the table down to the
# gene-identifying columns.
# NOTE(review): 'COD' presumably means protein-coding (the other visible
# value, 'PS', appears on symbols ending in 'P') - confirm against the CSV's
# documentation.
hist_genes = d.loc[
    d['Biotype'].isin(['COD']),
    [
        'Histone type',
        'Histone variant',
        'HGNC symbol',
        'NCBI gene ID',
        'Ensembl gene ID',
    ],
]
hist_genes
Histone type | Histone variant | HGNC symbol | NCBI gene ID | Ensembl gene ID | |
---|---|---|---|---|---|
0 | H1 | H1.0 | H1-0 | 3005 | ENSG00000189060 |
1 | H1 | H1.1 | H1-1 | 3024 | ENSG00000124610 |
2 | H1 | H1.2 | H1-2 | 3006 | ENSG00000187837 |
3 | H1 | H1.3 | H1-3 | 3007 | ENSG00000124575 |
4 | H1 | H1.4 | H1-4 | 3008 | ENSG00000168298 |
5 | H1 | H1.5 | H1-5 | 3009 | ENSG00000184357 |
6 | H1 | TS H1.6 | H1-6 | 3010 | ENSG00000187475 |
7 | H1 | TS H1.7 | H1-7 | 341567 | ENSG00000187166 |
8 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 |
9 | H1 | TS H1.9(?) | H1-9P | 373861 | ENSG00000188662 |
10 | H1 | H1.10 | H1-10 | 8971 | ENSG00000184897 |
12 | H2A | TS H2A.1 | H2AC1 | 221613 | ENSG00000164508 |
15 | H2A | canonical H2A | H2AC4 | 8335 | ENSG00000278463 |
17 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 |
18 | H2A | canonical H2A | H2AC7 | 3013 | ENSG00000196866 |
19 | H2A | canonical H2A | H2AC8 | 3012 | ENSG00000277075 |
22 | H2A | canonical H2A | H2AC11 | 8969 | ENSG00000196787 |
23 | H2A | canonical H2A | H2AC12 | 85235 | ENSG00000274997 |
24 | H2A | canonical H2A | H2AC13 | 8329 | ENSG00000196747 |
25 | H2A | canonical H2A | H2AC14 | 8331 | ENSG00000276368 |
26 | H2A | canonical H2A | H2AC15 | 8330 | ENSG00000275221 |
27 | H2A | canonical H2A | H2AC16 | 8332 | ENSG00000276903 |
28 | H2A | canonical H2A | H2AC17 | 8336 | ENSG00000278677 |
29 | H2A | canonical H2A | H2AC18 | 8337 | ENSG00000203812 |
30 | H2A | canonical H2A | H2AC19 | 723790 | ENSG00000272196 |
31 | H2A | canonical H2A | H2AC20 | 8338 | ENSG00000184260 |
32 | H2A | canonical H2A | H2AC21 | 317772 | ENSG00000184270 |
33 | H2A | H2A.J(?) | H2AJ | 55766 | ENSG00000246705 |
34 | H2A | canonical H2A | H2AW | 92815 | ENSG00000181218 |
35 | H2A | H2A.X | H2AX | 3014 | ENSG00000188486 |
... | ... | ... | ... | ... | ... |
97 | H3 | canonical H3.1 | H3C8 | 8355 | ENSG00000273983 |
99 | H3 | canonical H3.1 | H3C10 | 8357 | ENSG00000278828 |
100 | H3 | canonical H3.1 | H3C11 | 8354 | ENSG00000275379 |
101 | H3 | canonical H3.1 | H3C12 | 8356 | ENSG00000197153 |
102 | H3 | canonical H3.1 | H3C13 | 653604 | ENSG00000183598 |
103 | H3 | canonical H3.2 | H3C14 | 126961 | ENSG00000203811 |
104 | H3 | canonical H3.1 | H3C15 | 333932 | ENSG00000203852 |
105 | H3 | H3.Y.1 | H3Y1 | 391769 | ENSG00000269466 |
106 | H3 | H3.Y.2 | H3Y2 | 340096 | ENSG00000268799 |
107 | H3 | canonical H3(?) | H3-2 | 440686 | ENSG00000273213 |
108 | H3 | H3.3 | H3-3A | 3020 | ENSG00000163041 |
109 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 |
110 | H3 | TS H3.4 | H3-4 | 8290 | ENSG00000168148 |
111 | H3 | H3.5 | H3-5 | 440093 | ENSG00000188375 |
112 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 |
117 | H4 | canonical H4 | H4C1 | 8359 | ENSG00000278637 |
118 | H4 | canonical H4 | H4C2 | 8366 | ENSG00000278705 |
119 | H4 | canonical H4 | H4C3 | 8364 | ENSG00000197061 |
120 | H4 | canonical H4 | H4C4 | 8360 | ENSG00000277157 |
121 | H4 | canonical H4 | H4C5 | 8367 | ENSG00000276966 |
122 | H4 | canonical H4 | H4C6 | 8361 | ENSG00000274618 |
123 | H4 | canonical H4 | H4C7 | 8369 | ENSG00000275663 |
124 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 |
125 | H4 | canonical H4 | H4C9 | 8294 | ENSG00000276180 |
127 | H4 | canonical H4 | H4C11 | 8363 | ENSG00000197238 |
128 | H4 | canonical H4 | H4C12 | 8362 | ENSG00000273542 |
129 | H4 | canonical H4 | H4C13 | 8368 | ENSG00000275126 |
130 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 |
131 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 |
132 | H4 | canonical H4 | H4-16 | 121504 | ENSG00000197837 |
96 rows × 5 columns
from pybiomart import Dataset
# Handle on the 'hsapiens_gene_ensembl' BioMart dataset served by
# www.ensembl.org; displayed so the notebook shows its repr.
dataset = Dataset(host='http://www.ensembl.org', name='hsapiens_gene_ensembl')
dataset
<biomart.Dataset name='hsapiens_gene_ensembl', display_name=''>
# Pull gene / transcript / protein / RefSeq identifiers plus the transcript
# biotype from BioMart. only_unique=False is passed through unchanged.
genedata = dataset.query(
    attributes=[
        'ensembl_gene_id',
        'ensembl_transcript_id',
        'ensembl_peptide_id',
        'refseq_mrna',
        'refseq_peptide',
        'transcript_biotype',
    ],
    only_unique=False,
)

# Left-join so every histone gene survives even without a BioMart match
# (such rows get NaN in the BioMart columns). The query result names its
# gene column 'Gene stable ID'; it duplicates 'Ensembl gene ID' after the
# join and is therefore dropped.
histone_proteins = hist_genes.merge(
    genedata,
    how='left',
    left_on='Ensembl gene ID',
    right_on='Gene stable ID',
    suffixes=('', '_y'),
)
histone_proteins = histone_proteins.drop(columns='Gene stable ID')
histone_proteins
Histone type | Histone variant | HGNC symbol | NCBI gene ID | Ensembl gene ID | Transcript stable ID | Protein stable ID | RefSeq mRNA ID | RefSeq peptide ID | Transcript type | |
---|---|---|---|---|---|---|---|---|---|---|
0 | H1 | H1.0 | H1-0 | 3005 | ENSG00000189060 | ENST00000340857 | ENSP00000344504 | NM_005318 | NP_005309 | protein_coding |
1 | H1 | H1.1 | H1-1 | 3024 | ENSG00000124610 | ENST00000244573 | ENSP00000244573 | NM_005325 | NP_005316 | protein_coding |
2 | H1 | H1.2 | H1-2 | 3006 | ENSG00000187837 | ENST00000343677 | ENSP00000339566 | NM_005319 | NP_005310 | protein_coding |
3 | H1 | H1.3 | H1-3 | 3007 | ENSG00000124575 | ENST00000244534 | ENSP00000244534 | NM_005320 | NP_005311 | protein_coding |
4 | H1 | H1.4 | H1-4 | 3008 | ENSG00000168298 | ENST00000304218 | ENSP00000307705 | NM_005321 | NP_005312 | protein_coding |
5 | H1 | H1.5 | H1-5 | 3009 | ENSG00000184357 | ENST00000331442 | ENSP00000330074 | NM_005322 | NP_005313 | protein_coding |
6 | H1 | TS H1.6 | H1-6 | 3010 | ENSG00000187475 | ENST00000338379 | ENSP00000341214 | NM_005323 | NP_005314 | protein_coding |
7 | H1 | TS H1.7 | H1-7 | 341567 | ENSG00000187166 | ENST00000335017 | ENSP00000334805 | NM_181788 | NP_861453 | protein_coding |
8 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 | ENST00000324382 | ENSP00000319799 | NM_153833 | NP_722575 | protein_coding |
9 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 | ENST00000503977 | ENSP00000422964 | NM_001308262 | NP_001295191 | protein_coding |
10 | H1 | TS H1.9(?) | H1-9P | 373861 | ENSG00000188662 | NaN | NaN | NaN | NaN | NaN |
11 | H1 | H1.10 | H1-10 | 8971 | ENSG00000184897 | ENST00000333762 | ENSP00000329662 | NM_006026 | NP_006017 | protein_coding |
12 | H2A | TS H2A.1 | H2AC1 | 221613 | ENSG00000164508 | ENST00000297012 | ENSP00000297012 | NM_170745 | NP_734466 | protein_coding |
13 | H2A | canonical H2A | H2AC4 | 8335 | ENSG00000278463 | ENST00000615868 | ENSP00000483842 | NM_003513 | NP_003504 | protein_coding |
14 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | ENST00000314088 | ENSP00000321389 | NaN | NaN | nonsense_mediated_decay |
15 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | ENST00000602637 | ENSP00000473534 | NaN | NaN | protein_coding |
16 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | ENST00000377791 | ENSP00000367022 | NM_003512 | NP_003503 | protein_coding |
17 | H2A | canonical H2A | H2AC7 | 3013 | ENSG00000196866 | ENST00000341023 | ENSP00000341094 | NM_021065 | NP_066409 | protein_coding |
18 | H2A | canonical H2A | H2AC8 | 3012 | ENSG00000277075 | ENST00000303910 | ENSP00000303373 | NM_021052 | NP_066390 | protein_coding |
19 | H2A | canonical H2A | H2AC11 | 8969 | ENSG00000196787 | ENST00000359193 | ENSP00000352119 | NM_021064 | NP_066408 | protein_coding |
20 | H2A | canonical H2A | H2AC12 | 85235 | ENSG00000274997 | ENST00000377459 | ENSP00000366679 | NM_080596 | NP_542163 | protein_coding |
21 | H2A | canonical H2A | H2AC13 | 8329 | ENSG00000196747 | ENST00000358739 | ENSP00000351589 | NM_003509 | NP_003500 | protein_coding |
22 | H2A | canonical H2A | H2AC14 | 8331 | ENSG00000276368 | ENST00000333151 | ENSP00000328484 | NM_021066 | NP_066544 | protein_coding |
23 | H2A | canonical H2A | H2AC15 | 8330 | ENSG00000275221 | ENST00000618958 | ENSP00000482431 | NM_003510 | NP_003501 | protein_coding |
24 | H2A | canonical H2A | H2AC16 | 8332 | ENSG00000276903 | ENST00000613174 | ENSP00000482538 | NM_003511 | NP_003502 | protein_coding |
25 | H2A | canonical H2A | H2AC17 | 8336 | ENSG00000278677 | ENST00000359611 | ENSP00000352627 | NM_003514 | NP_003505 | protein_coding |
26 | H2A | canonical H2A | H2AC18 | 8337 | ENSG00000203812 | ENST00000369159 | ENSP00000358155 | NM_003516 | NP_003507 | protein_coding |
27 | H2A | canonical H2A | H2AC19 | 723790 | ENSG00000272196 | ENST00000607355 | ENSP00000475814 | NM_001040874 | NP_001035807 | protein_coding |
28 | H2A | canonical H2A | H2AC20 | 8338 | ENSG00000184260 | ENST00000331380 | ENSP00000332194 | NM_003517 | NP_003508 | protein_coding |
29 | H2A | canonical H2A | H2AC21 | 317772 | ENSG00000184270 | ENST00000331128 | ENSP00000332790 | NM_175065 | NP_778235 | protein_coding |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
132 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | ENST00000587171 | ENSP00000468484 | NaN | NaN | protein_coding |
133 | H3 | TS H3.4 | H3-4 | 8290 | ENSG00000168148 | ENST00000366696 | ENSP00000355657 | NM_003493 | NP_003484 | protein_coding |
134 | H3 | H3.5 | H3-5 | 440093 | ENSG00000188375 | ENST00000340398 | ENSP00000339835 | NM_001013699 | NP_001013721 | protein_coding |
135 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000335756 | ENSP00000336868 | NM_001809 | NP_001800 | protein_coding |
136 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000233505 | ENSP00000233505 | NM_001042426 | NP_001035891 | protein_coding |
137 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000419525 | ENSP00000404963 | NaN | NaN | nonsense_mediated_decay |
138 | H4 | canonical H4 | H4C1 | 8359 | ENSG00000278637 | ENST00000617569 | ENSP00000479106 | NM_003538 | NP_003529 | protein_coding |
139 | H4 | canonical H4 | H4C2 | 8366 | ENSG00000278705 | ENST00000377745 | ENSP00000366974 | NM_003544 | NP_003535 | protein_coding |
140 | H4 | canonical H4 | H4C3 | 8364 | ENSG00000197061 | ENST00000377803 | ENSP00000367034 | NM_003542 | NP_003533 | protein_coding |
141 | H4 | canonical H4 | H4C4 | 8360 | ENSG00000277157 | ENST00000614247 | ENSP00000479461 | NM_003539 | NP_003530 | protein_coding |
142 | H4 | canonical H4 | H4C5 | 8367 | ENSG00000276966 | ENST00000615164 | ENSP00000484789 | NM_003545 | NP_003536 | protein_coding |
143 | H4 | canonical H4 | H4C6 | 8361 | ENSG00000274618 | ENST00000244537 | ENSP00000244537 | NM_003540 | NP_003531 | protein_coding |
144 | H4 | canonical H4 | H4C7 | 8369 | ENSG00000275663 | ENST00000611444 | ENSP00000477870 | NM_003547 | NP_003538 | protein_coding |
145 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000634956 | ENSP00000489567 | NaN | NaN | nonsense_mediated_decay |
146 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000634560 | ENSP00000489319 | NaN | NaN | nonsense_mediated_decay |
147 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000377727 | ENSP00000366956 | NM_003543 | NP_003534 | protein_coding |
148 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000635491 | ENSP00000489236 | NaN | NaN | protein_coding |
149 | H4 | canonical H4 | H4C9 | 8294 | ENSG00000276180 | ENST00000615353 | ENSP00000481486 | NM_003495 | NP_003486 | protein_coding |
150 | H4 | canonical H4 | H4C11 | 8363 | ENSG00000197238 | ENST00000355057 | ENSP00000347168 | NM_021968 | NP_068803 | protein_coding |
151 | H4 | canonical H4 | H4C12 | 8362 | ENSG00000273542 | ENST00000611927 | ENSP00000479794 | NM_003541 | NP_003532 | protein_coding |
152 | H4 | canonical H4 | H4C13 | 8368 | ENSG00000275126 | ENST00000618305 | ENSP00000480960 | NM_003546 | NP_003537 | protein_coding |
153 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000578186 | ENSP00000462667 | NM_003548 | NP_003539 | protein_coding |
154 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000618193 | ENSP00000478786 | NaN | NaN | nonsense_mediated_decay |
155 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000614272 | ENSP00000478519 | NaN | NaN | nonsense_mediated_decay |
156 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000613412 | ENSP00000481343 | NaN | NaN | nonsense_mediated_decay |
157 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | ENST00000621520 | ENSP00000481507 | NaN | NaN | nonsense_mediated_decay |
158 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | ENST00000612061 | ENSP00000482412 | NaN | NaN | nonsense_mediated_decay |
159 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | ENST00000579512 | ENSP00000462355 | NM_001034077 | NP_001029249 | protein_coding |
160 | H4 | canonical H4 | H4-16 | 121504 | ENSG00000197837 | ENST00000358064 | ENSP00000350767 | NaN | NaN | nonsense_mediated_decay |
161 | H4 | canonical H4 | H4-16 | 121504 | ENSG00000197837 | ENST00000539745 | ENSP00000443017 | NM_175054 | NP_778224 | protein_coding |
162 rows × 10 columns
import requests
def _fetch_protein_sequence(protein_id):
    """Return the sequence for an Ensembl protein ID as raw bytes.

    Returns ``b''`` when ``protein_id`` is missing (NaN) or when the Ensembl
    REST service answers with a non-success status. Previously a NaN ID was
    formatted into the URL as 'nan' and the JSON error payload was stored as
    if it were a protein sequence (with a bogus length of 30).
    """
    if pd.isna(protein_id):
        # Gene without a BioMart protein record: nothing to download.
        return b''
    response = requests.get(
        'http://rest.ensembl.org/sequence/id/%s?content-type=text/plain' % protein_id,
        timeout=30,  # do not let a stalled connection hang the notebook
    )
    if not response.ok:
        # Report instead of silently storing the error body as a sequence.
        print('No sequence for %s (HTTP %s)' % (protein_id, response.status_code))
        return b''
    return response.content


# One REST call per row; sequences stay as bytes, exactly as before.
seqs = [_fetch_protein_sequence(pid) for pid in histone_proteins['Protein stable ID']]
seqlen = [len(seq) for seq in seqs]

# PMIDs of the first row in `d` carrying each NCBI gene ID, built once
# instead of re-scanning `d` for every protein row.
pmids_by_gene = d.drop_duplicates(subset='NCBI gene ID').set_index('NCBI gene ID')['PMIDs']
refs = histone_proteins['NCBI gene ID'].map(pmids_by_gene).tolist()

histone_proteins['Protein sequence'] = seqs
histone_proteins['Protein length'] = seqlen
histone_proteins['References'] = refs
histone_proteins
Histone type | Histone variant | HGNC symbol | NCBI gene ID | Ensembl gene ID | Transcript stable ID | Protein stable ID | RefSeq mRNA ID | RefSeq peptide ID | Transcript type | Protein sequence | Protein length | References | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | H1 | H1.0 | H1-0 | 3005 | ENSG00000189060 | ENST00000340857 | ENSP00000344504 | NM_005318 | NP_005309 | protein_coding | b'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAG... | 194 | 26689747 |
1 | H1 | H1.1 | H1-1 | 3024 | ENSG00000124610 | ENST00000244573 | ENSP00000244573 | NM_005325 | NP_005316 | protein_coding | b'MSETVPPAPAASAAPEKPLAGKKAKKPAKAAAASKKKPAGPSVS... | 215 | 26689747 |
2 | H1 | H1.2 | H1-2 | 3006 | ENSG00000187837 | ENST00000343677 | ENSP00000339566 | NM_005319 | NP_005310 | protein_coding | b'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELI... | 213 | 26689747 |
3 | H1 | H1.3 | H1-3 | 3007 | ENSG00000124575 | ENST00000244534 | ENSP00000244534 | NM_005320 | NP_005311 | protein_coding | b'MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSEL... | 221 | 26689747 |
4 | H1 | H1.4 | H1-4 | 3008 | ENSG00000168298 | ENST00000304218 | ENSP00000307705 | NM_005321 | NP_005312 | protein_coding | b'MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELI... | 219 | 26689747 |
5 | H1 | H1.5 | H1-5 | 3009 | ENSG00000184357 | ENST00000331442 | ENSP00000330074 | NM_005322 | NP_005313 | protein_coding | b'MSETAPAETATPAPVEKSPAKKKATKKAAGAGAAKRKATGPPVS... | 226 | 26689747 |
6 | H1 | TS H1.6 | H1-6 | 3010 | ENSG00000187475 | ENST00000338379 | ENSP00000341214 | NM_005323 | NP_005314 | protein_coding | b'MSETVPAASASAGVAAMEKLPTKKRGRKPAGLISASRKVPNLSV... | 207 | 26689747 |
7 | H1 | TS H1.7 | H1-7 | 341567 | ENSG00000187166 | ENST00000335017 | ENSP00000334805 | NM_181788 | NP_861453 | protein_coding | b'MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEK... | 255 | 26689747 |
8 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 | ENST00000324382 | ENSP00000319799 | NM_153833 | NP_722575 | protein_coding | b'MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHS... | 346 | 26689747 |
9 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 | ENST00000503977 | ENSP00000422964 | NM_001308262 | NP_001295191 | protein_coding | b'MAPATAPRRAGEAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKV... | 207 | 26689747 |
10 | H1 | TS H1.9(?) | H1-9P | 373861 | ENSG00000188662 | NaN | NaN | NaN | NaN | NaN | b'{"error":"ID \'nan\' not found"}' | 30 | 12920187 26689747 17852044 |
11 | H1 | H1.10 | H1-10 | 8971 | ENSG00000184897 | ENST00000333762 | ENSP00000329662 | NM_006026 | NP_006017 | protein_coding | b'MSVELEEALPVTTAEGMAKKVTKAGGSAALSPSKKRKNSKKKNQ... | 213 | 26689747 |
12 | H2A | TS H2A.1 | H2AC1 | 221613 | ENSG00000164508 | ENST00000297012 | ENSP00000297012 | NM_170745 | NP_734466 | protein_coding | b'MSGRGKQGGKARAKSKSRSSRAGLQFPVGRIHRLLRKGNYAERI... | 131 | 2011515 7068607 24506885 |
13 | H2A | canonical H2A | H2AC4 | 8335 | ENSG00000278463 | ENST00000615868 | ENSP00000483842 | NM_003513 | NP_003504 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... | 130 | 12408966 25731851 |
14 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | ENST00000314088 | ENSP00000321389 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
15 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | ENST00000602637 | ENSP00000473534 | NaN | NaN | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
16 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | ENST00000377791 | ENSP00000367022 | NM_003512 | NP_003503 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
17 | H2A | canonical H2A | H2AC7 | 3013 | ENSG00000196866 | ENST00000341023 | ENSP00000341094 | NM_021065 | NP_066409 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... | 130 | 12408966 25731851 |
18 | H2A | canonical H2A | H2AC8 | 3012 | ENSG00000277075 | ENST00000303910 | ENSP00000303373 | NM_021052 | NP_066390 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... | 130 | 12408966 25731851 |
19 | H2A | canonical H2A | H2AC11 | 8969 | ENSG00000196787 | ENST00000359193 | ENSP00000352119 | NM_021064 | NP_066408 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
20 | H2A | canonical H2A | H2AC12 | 85235 | ENSG00000274997 | ENST00000377459 | ENSP00000366679 | NM_080596 | NP_542163 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 128 | 12408966 25731851 |
21 | H2A | canonical H2A | H2AC13 | 8329 | ENSG00000196747 | ENST00000358739 | ENSP00000351589 | NM_003509 | NP_003500 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
22 | H2A | canonical H2A | H2AC14 | 8331 | ENSG00000276368 | ENST00000333151 | ENSP00000328484 | NM_021066 | NP_066544 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 128 | 12408966 25731851 |
23 | H2A | canonical H2A | H2AC15 | 8330 | ENSG00000275221 | ENST00000618958 | ENSP00000482431 | NM_003510 | NP_003501 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
24 | H2A | canonical H2A | H2AC16 | 8332 | ENSG00000276903 | ENST00000613174 | ENSP00000482538 | NM_003511 | NP_003502 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
25 | H2A | canonical H2A | H2AC17 | 8336 | ENSG00000278677 | ENST00000359611 | ENSP00000352627 | NM_003514 | NP_003505 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
26 | H2A | canonical H2A | H2AC18 | 8337 | ENSG00000203812 | ENST00000369159 | ENSP00000358155 | NM_003516 | NP_003507 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
27 | H2A | canonical H2A | H2AC19 | 723790 | ENSG00000272196 | ENST00000607355 | ENSP00000475814 | NM_001040874 | NP_001035807 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
28 | H2A | canonical H2A | H2AC20 | 8338 | ENSG00000184260 | ENST00000331380 | ENSP00000332194 | NM_003517 | NP_003508 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 129 | 12408966 25731851 |
29 | H2A | canonical H2A | H2AC21 | 317772 | ENSG00000184270 | ENST00000331128 | ENSP00000332790 | NM_175065 | NP_778235 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
132 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | ENST00000587171 | ENSP00000468484 | NaN | NaN | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 151 | 19412883 |
133 | H3 | TS H3.4 | H3-4 | 8290 | ENSG00000168148 | ENST00000366696 | ENSP00000355657 | NM_003493 | NP_003484 | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYRP... | 136 | 8986613 |
134 | H3 | H3.5 | H3-5 | 440093 | ENSG00000188375 | ENST00000340398 | ENSP00000339835 | NM_001013699 | NP_001013721 | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKPHRYRPG... | 135 | 21274551 |
135 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000335756 | ENSP00000336868 | NM_001809 | NP_001800 | protein_coding | b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... | 140 | 23324462 |
136 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000233505 | ENSP00000233505 | NM_001042426 | NP_001035891 | protein_coding | b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... | 114 | 23324462 |
137 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000419525 | ENSP00000404963 | NaN | NaN | nonsense_mediated_decay | b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... | 86 | 23324462 |
138 | H4 | canonical H4 | H4C1 | 8359 | ENSG00000278637 | ENST00000617569 | ENSP00000479106 | NM_003538 | NP_003529 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
139 | H4 | canonical H4 | H4C2 | 8366 | ENSG00000278705 | ENST00000377745 | ENSP00000366974 | NM_003544 | NP_003535 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
140 | H4 | canonical H4 | H4C3 | 8364 | ENSG00000197061 | ENST00000377803 | ENSP00000367034 | NM_003542 | NP_003533 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
141 | H4 | canonical H4 | H4C4 | 8360 | ENSG00000277157 | ENST00000614247 | ENSP00000479461 | NM_003539 | NP_003530 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
142 | H4 | canonical H4 | H4C5 | 8367 | ENSG00000276966 | ENST00000615164 | ENSP00000484789 | NM_003545 | NP_003536 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
143 | H4 | canonical H4 | H4C6 | 8361 | ENSG00000274618 | ENST00000244537 | ENSP00000244537 | NM_003540 | NP_003531 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
144 | H4 | canonical H4 | H4C7 | 8369 | ENSG00000275663 | ENST00000611444 | ENSP00000477870 | NM_003547 | NP_003538 | protein_coding | b'MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGV... | 98 | 12408966 |
145 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000634956 | ENSP00000489567 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
146 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000634560 | ENSP00000489319 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
147 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000377727 | ENSP00000366956 | NM_003543 | NP_003534 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
148 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000635491 | ENSP00000489236 | NaN | NaN | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
149 | H4 | canonical H4 | H4C9 | 8294 | ENSG00000276180 | ENST00000615353 | ENSP00000481486 | NM_003495 | NP_003486 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
150 | H4 | canonical H4 | H4C11 | 8363 | ENSG00000197238 | ENST00000355057 | ENSP00000347168 | NM_021968 | NP_068803 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
151 | H4 | canonical H4 | H4C12 | 8362 | ENSG00000273542 | ENST00000611927 | ENSP00000479794 | NM_003541 | NP_003532 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
152 | H4 | canonical H4 | H4C13 | 8368 | ENSG00000275126 | ENST00000618305 | ENSP00000480960 | NM_003546 | NP_003537 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
153 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000578186 | ENSP00000462667 | NM_003548 | NP_003539 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
154 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000618193 | ENSP00000478786 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
155 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000614272 | ENSP00000478519 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
156 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000613412 | ENSP00000481343 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
157 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | ENST00000621520 | ENSP00000481507 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
158 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | ENST00000612061 | ENSP00000482412 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
159 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | ENST00000579512 | ENSP00000462355 | NM_001034077 | NP_001029249 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
160 | H4 | canonical H4 | H4-16 | 121504 | ENSG00000197837 | ENST00000358064 | ENSP00000350767 | NaN | NaN | nonsense_mediated_decay | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
161 | H4 | canonical H4 | H4-16 | 121504 | ENSG00000197837 | ENST00000539745 | ENSP00000443017 | NM_175054 | NP_778224 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
162 rows × 13 columns
# Reduce the isoform table to one row per distinct protein sequence per gene.
#  1. keep rows whose transcript type is 'protein_coding' or missing (NaN);
#  2. sort by RefSeq peptide ID - pandas puts NaN last by default, so rows
#     with a RefSeq ID come before rows without one;
#  3. drop_duplicates keeps the first row of each (gene, sequence) pair;
#  4. sort_index restores the original row order.
hp = histone_proteins[
    histone_proteins['Transcript type'].isin(['protein_coding', np.nan])
]
hp = hp.sort_values(by='RefSeq peptide ID', ascending=True)
hp = hp.drop_duplicates(subset=['Ensembl gene ID', 'Protein sequence'])
hp = hp.sort_index()
hp
Histone type | Histone variant | HGNC symbol | NCBI gene ID | Ensembl gene ID | Transcript stable ID | Protein stable ID | RefSeq mRNA ID | RefSeq peptide ID | Transcript type | Protein sequence | Protein length | References | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | H1 | H1.0 | H1-0 | 3005 | ENSG00000189060 | ENST00000340857 | ENSP00000344504 | NM_005318 | NP_005309 | protein_coding | b'MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAG... | 194 | 26689747 |
1 | H1 | H1.1 | H1-1 | 3024 | ENSG00000124610 | ENST00000244573 | ENSP00000244573 | NM_005325 | NP_005316 | protein_coding | b'MSETVPPAPAASAAPEKPLAGKKAKKPAKAAAASKKKPAGPSVS... | 215 | 26689747 |
2 | H1 | H1.2 | H1-2 | 3006 | ENSG00000187837 | ENST00000343677 | ENSP00000339566 | NM_005319 | NP_005310 | protein_coding | b'MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELI... | 213 | 26689747 |
3 | H1 | H1.3 | H1-3 | 3007 | ENSG00000124575 | ENST00000244534 | ENSP00000244534 | NM_005320 | NP_005311 | protein_coding | b'MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSEL... | 221 | 26689747 |
4 | H1 | H1.4 | H1-4 | 3008 | ENSG00000168298 | ENST00000304218 | ENSP00000307705 | NM_005321 | NP_005312 | protein_coding | b'MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELI... | 219 | 26689747 |
5 | H1 | H1.5 | H1-5 | 3009 | ENSG00000184357 | ENST00000331442 | ENSP00000330074 | NM_005322 | NP_005313 | protein_coding | b'MSETAPAETATPAPVEKSPAKKKATKKAAGAGAAKRKATGPPVS... | 226 | 26689747 |
6 | H1 | TS H1.6 | H1-6 | 3010 | ENSG00000187475 | ENST00000338379 | ENSP00000341214 | NM_005323 | NP_005314 | protein_coding | b'MSETVPAASASAGVAAMEKLPTKKRGRKPAGLISASRKVPNLSV... | 207 | 26689747 |
7 | H1 | TS H1.7 | H1-7 | 341567 | ENSG00000187166 | ENST00000335017 | ENSP00000334805 | NM_181788 | NP_861453 | protein_coding | b'MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEK... | 255 | 26689747 |
8 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 | ENST00000324382 | ENSP00000319799 | NM_153833 | NP_722575 | protein_coding | b'MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHS... | 346 | 26689747 |
9 | H1 | OO H1.8 | H1-8 | 132243 | ENSG00000178804 | ENST00000503977 | ENSP00000422964 | NM_001308262 | NP_001295191 | protein_coding | b'MAPATAPRRAGEAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKV... | 207 | 26689747 |
10 | H1 | TS H1.9(?) | H1-9P | 373861 | ENSG00000188662 | NaN | NaN | NaN | NaN | NaN | b'{"error":"ID \'nan\' not found"}' | 30 | 12920187 26689747 17852044 |
11 | H1 | H1.10 | H1-10 | 8971 | ENSG00000184897 | ENST00000333762 | ENSP00000329662 | NM_006026 | NP_006017 | protein_coding | b'MSVELEEALPVTTAEGMAKKVTKAGGSAALSPSKKRKNSKKKNQ... | 213 | 26689747 |
12 | H2A | TS H2A.1 | H2AC1 | 221613 | ENSG00000164508 | ENST00000297012 | ENSP00000297012 | NM_170745 | NP_734466 | protein_coding | b'MSGRGKQGGKARAKSKSRSSRAGLQFPVGRIHRLLRKGNYAERI... | 131 | 2011515 7068607 24506885 |
13 | H2A | canonical H2A | H2AC4 | 8335 | ENSG00000278463 | ENST00000615868 | ENSP00000483842 | NM_003513 | NP_003504 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... | 130 | 12408966 25731851 |
16 | H2A | canonical H2A | H2AC6 | 8334 | ENSG00000180573 | ENST00000377791 | ENSP00000367022 | NM_003512 | NP_003503 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
17 | H2A | canonical H2A | H2AC7 | 3013 | ENSG00000196866 | ENST00000341023 | ENSP00000341094 | NM_021065 | NP_066409 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... | 130 | 12408966 25731851 |
18 | H2A | canonical H2A | H2AC8 | 3012 | ENSG00000277075 | ENST00000303910 | ENSP00000303373 | NM_021052 | NP_066390 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYSERV... | 130 | 12408966 25731851 |
19 | H2A | canonical H2A | H2AC11 | 8969 | ENSG00000196787 | ENST00000359193 | ENSP00000352119 | NM_021064 | NP_066408 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
20 | H2A | canonical H2A | H2AC12 | 85235 | ENSG00000274997 | ENST00000377459 | ENSP00000366679 | NM_080596 | NP_542163 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 128 | 12408966 25731851 |
21 | H2A | canonical H2A | H2AC13 | 8329 | ENSG00000196747 | ENST00000358739 | ENSP00000351589 | NM_003509 | NP_003500 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
22 | H2A | canonical H2A | H2AC14 | 8331 | ENSG00000276368 | ENST00000333151 | ENSP00000328484 | NM_021066 | NP_066544 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 128 | 12408966 25731851 |
23 | H2A | canonical H2A | H2AC15 | 8330 | ENSG00000275221 | ENST00000618958 | ENSP00000482431 | NM_003510 | NP_003501 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
24 | H2A | canonical H2A | H2AC16 | 8332 | ENSG00000276903 | ENST00000613174 | ENSP00000482538 | NM_003511 | NP_003502 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
25 | H2A | canonical H2A | H2AC17 | 8336 | ENSG00000278677 | ENST00000359611 | ENSP00000352627 | NM_003514 | NP_003505 | protein_coding | b'MSGRGKQGGKARAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
26 | H2A | canonical H2A | H2AC18 | 8337 | ENSG00000203812 | ENST00000369159 | ENSP00000358155 | NM_003516 | NP_003507 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
27 | H2A | canonical H2A | H2AC19 | 723790 | ENSG00000272196 | ENST00000607355 | ENSP00000475814 | NM_001040874 | NP_001035807 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
28 | H2A | canonical H2A | H2AC20 | 8338 | ENSG00000184260 | ENST00000331380 | ENSP00000332194 | NM_003517 | NP_003508 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 129 | 12408966 25731851 |
29 | H2A | canonical H2A | H2AC21 | 317772 | ENSG00000184270 | ENST00000331128 | ENSP00000332790 | NM_175065 | NP_778235 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 130 | 12408966 25731851 |
30 | H2A | H2A.J(?) | H2AJ | 55766 | ENSG00000246705 | ENST00000544848 | ENSP00000438553 | NM_177925 | NP_808760 | protein_coding | b'MSGRGKQGGKVRAKAKSRSSRAGLQFPVGRVHRLLRKGNYAERV... | 129 | 25731851 |
33 | H2A | canonical H2A | H2AW | 92815 | ENSG00000181218 | ENST00000366695 | ENSP00000355656 | NM_033445 | NP_254280 | protein_coding | b'MSGRGKQGGKARAKAKSRSSRAGLQFPVGRVHRLLRKGNYSERV... | 130 | ? |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
112 | H3 | H3.Y.1 | H3Y1 | 391769 | ENSG00000269466 | ENST00000598383 | ENSP00000496014 | NM_001355258 | NP_001342187 | protein_coding | b'MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKP... | 136 | 20819935 |
113 | H3 | H3.Y.2 | H3Y2 | 340096 | ENSG00000268799 | ENST00000600799 | ENSP00000497053 | NM_001371919 | NP_001358848 | protein_coding | b'MARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKP... | 147 | 20819935 |
115 | H3 | canonical H3(?) | H3-2 | 440686 | ENSG00000273213 | ENST00000609879 | ENSP00000499501 | NM_001355409 | NP_001342338 | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRP... | 136 | 12408966 |
118 | H3 | H3.3 | H3-3A | 3020 | ENSG00000163041 | ENST00000366814 | ENSP00000355779 | NaN | NaN | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 123 | 19412883 |
119 | H3 | H3.3 | H3-3A | 3020 | ENSG00000163041 | ENST00000366815 | ENSP00000355780 | NM_002107 | NP_002098 | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 136 | 19412883 |
121 | H3 | H3.3 | H3-3A | 3020 | ENSG00000163041 | ENST00000667897 | ENSP00000499446 | NaN | NaN | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 120 | 19412883 |
125 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | ENST00000254810 | ENSP00000254810 | NM_005324 | NP_005315 | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 136 | 19412883 |
126 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | ENST00000592643 | ENSP00000467165 | NaN | NaN | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 113 | 19412883 |
127 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | ENST00000591890 | ENSP00000466663 | NaN | NaN | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 92 | 19412883 |
131 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | ENST00000586270 | ENSP00000465403 | NaN | NaN | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 132 | 19412883 |
132 | H3 | H3.3 | H3-3B | 3021 | ENSG00000132475 | ENST00000587171 | ENSP00000468484 | NaN | NaN | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSAPSTGGVKKPHRYRP... | 151 | 19412883 |
133 | H3 | TS H3.4 | H3-4 | 8290 | ENSG00000168148 | ENST00000366696 | ENSP00000355657 | NM_003493 | NP_003484 | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYRP... | 136 | 8986613 |
134 | H3 | H3.5 | H3-5 | 440093 | ENSG00000188375 | ENST00000340398 | ENSP00000339835 | NM_001013699 | NP_001013721 | protein_coding | b'MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKPHRYRPG... | 135 | 21274551 |
135 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000335756 | ENSP00000336868 | NM_001809 | NP_001800 | protein_coding | b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... | 140 | 23324462 |
136 | H3 | cenH3 | CENPA | 1058 | ENSG00000115163 | ENST00000233505 | ENSP00000233505 | NM_001042426 | NP_001035891 | protein_coding | b'MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRR... | 114 | 23324462 |
138 | H4 | canonical H4 | H4C1 | 8359 | ENSG00000278637 | ENST00000617569 | ENSP00000479106 | NM_003538 | NP_003529 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
139 | H4 | canonical H4 | H4C2 | 8366 | ENSG00000278705 | ENST00000377745 | ENSP00000366974 | NM_003544 | NP_003535 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
140 | H4 | canonical H4 | H4C3 | 8364 | ENSG00000197061 | ENST00000377803 | ENSP00000367034 | NM_003542 | NP_003533 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
141 | H4 | canonical H4 | H4C4 | 8360 | ENSG00000277157 | ENST00000614247 | ENSP00000479461 | NM_003539 | NP_003530 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
142 | H4 | canonical H4 | H4C5 | 8367 | ENSG00000276966 | ENST00000615164 | ENSP00000484789 | NM_003545 | NP_003536 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
143 | H4 | canonical H4 | H4C6 | 8361 | ENSG00000274618 | ENST00000244537 | ENSP00000244537 | NM_003540 | NP_003531 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
144 | H4 | canonical H4 | H4C7 | 8369 | ENSG00000275663 | ENST00000611444 | ENSP00000477870 | NM_003547 | NP_003538 | protein_coding | b'MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGV... | 98 | 12408966 |
147 | H4 | canonical H4 | H4C8 | 8365 | ENSG00000158406 | ENST00000377727 | ENSP00000366956 | NM_003543 | NP_003534 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
149 | H4 | canonical H4 | H4C9 | 8294 | ENSG00000276180 | ENST00000615353 | ENSP00000481486 | NM_003495 | NP_003486 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
150 | H4 | canonical H4 | H4C11 | 8363 | ENSG00000197238 | ENST00000355057 | ENSP00000347168 | NM_021968 | NP_068803 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
151 | H4 | canonical H4 | H4C12 | 8362 | ENSG00000273542 | ENST00000611927 | ENSP00000479794 | NM_003541 | NP_003532 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
152 | H4 | canonical H4 | H4C13 | 8368 | ENSG00000275126 | ENST00000618305 | ENSP00000480960 | NM_003546 | NP_003537 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
153 | H4 | canonical H4 | H4C14 | 8370 | ENSG00000270882 | ENST00000578186 | ENSP00000462667 | NM_003548 | NP_003539 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
159 | H4 | canonical H4 | H4C15 | 554313 | ENSG00000270276 | ENST00000579512 | ENSP00000462355 | NM_001034077 | NP_001029249 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
161 | H4 | canonical H4 | H4-16 | 121504 | ENSG00000197837 | ENST00000539745 | ENSP00000443017 | NM_175054 | NP_778224 | protein_coding | b'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGV... | 103 | 12408966 |
120 rows × 13 columns
# Write `hp` to CSV without the 'Protein sequence', 'Transcript type' and
# 'Protein stable ID' columns; the row index is not written.
(
    hp
    .drop(columns=['Protein sequence', 'Transcript type', 'Protein stable ID'])
    .to_csv('human_histone_proteins_autogenerated.csv', index=False)
)
# Shell escape: copy the freshly written CSV to docs/human_histones.csv.
!cp human_histone_proteins_autogenerated.csv docs/human_histones.csv
# Shell escape: pull upstream changes into the current git repository.
!git pull
Already up to date.
# NOTE(review): `gacp` is not defined in this file — presumably a shell
# alias/script for git add + commit + push (the recorded output looks like
# git's); confirm before relying on it.
!gacp
On branch master Your branch is up to date with 'origin/master'. nothing to commit, working tree clean Everything up-to-date
# Dump the attribute listing of `dataset` to atr.csv.
# NOTE(review): `dataset` is defined elsewhere; presumably list_attributes()
# returns a pandas DataFrame, in which case the index column is written too —
# confirm.
(
    dataset
    .list_attributes()
    .to_csv('atr.csv')
)