from __future__ import print_function, division
from collections import defaultdict
import os
import pickle
import numpy as np
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2
!wget http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt
--2015-06-26 14:52:43-- http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2 Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113 Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 10590722 (10M) [application/x-bzip2] Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’ hapmap3_r2_b36_fwd. 100%[=====================>] 10.10M 3.74MB/s in 2.7s 2015-06-26 14:52:46 (3.74 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2.1’ saved [10590722/10590722] --2015-06-26 14:52:46-- http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2 Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113 Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 757962593 (723M) [application/x-bzip2] Saving to: ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’ hapmap3_r2_b36_fwd. 100%[=====================>] 722.85M 8.51MB/s in 3m 58s 2015-06-26 14:56:44 (3.03 MB/s) - ‘hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2.1’ saved [757962593/757962593] --2015-06-26 14:56:45-- http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/hapmap3/plink_format/draft_2/relationships_w_pops_121708.txt Resolving hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)... 130.14.29.113, 2607:f220:41e:4290::113 Connecting to hapmap.ncbi.nlm.nih.gov (hapmap.ncbi.nlm.nih.gov)|130.14.29.113|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 36765 (36K) [text/plain] Saving to: ‘relationships_w_pops_121708.txt.4’ relationships_w_pop 100%[=====================>] 35.90K 174KB/s in 0.2s 2015-06-26 14:56:45 (174 KB/s) - ‘relationships_w_pops_121708.txt.4’ saved [36765/36765]
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2
!bunzip2 hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2
bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.map already exists. bunzip2: Output file hapmap3_r2_b36_fwd.consensus.qc.poly.ped already exists.
tsi = open('tsi.ind', 'w')
f = open('relationships_w_pops_121708.txt')
for l in f:
toks = l.rstrip().split('\t')
fam_id = toks[0]
ind_id = toks[1]
mom = toks[2]
dad = toks[3]
pop = toks[-1]
if pop != 'TSI' or mom != '0' or dad != '0':
continue
tsi.write('%s\t%s\n' % (fam_id, ind_id))
f.close()
tsi.close()
os.system('plink --file hapmap3_r2_b36_fwd.consensus.qc.poly --maf 0.001 --keep tsi.ind --make-bed --out tsi')
0
max_chro_pos = defaultdict(int)
f = open('tsi.bim')
for l in f:
toks = l.rstrip().split('\t')
chrom = int(toks[0])
if chrom > 22:
continue
pos = int(toks[3])
if pos > max_chro_pos[chrom]:
max_chro_pos[chrom] = pos
f.close()
w = open('max_chro_pos.pickle', 'w')
pickle.dump(max_chro_pos, w)
w.close()
#os.system('plink --bfile tsi --freq --out tsi')
window_size = 2000000
%time os.system('plink --bfile tsi --freq --out tsi')
CPU times: user 4 ms, sys: 0 ns, total: 4 ms Wall time: 1.68 s
0
def traverse_genome(traverse_fun, state=None):
if state is None:
state = {}
for chrom, max_pos in max_chro_pos.items():
num_bin = (max_pos + 1) // window_size
for my_bin in range(num_bin):
start_pos = my_bin * window_size + 1 # 1-start
end_pos = start_pos + window_size
traverse_fun(state, chrom, start_pos, end_pos)
def compute_MAF(state, chrom, start_pos, end_pos):
os.system('plink --bfile tsi --freq --out tsi-%d-%d --chr %d --from-bp %d --to-bp %d' %
(chrom, start_pos, chrom, start_pos, end_pos))
os.remove('tsi-%d-%d.log' % (chrom, start_pos))
%time traverse_genome(compute_MAF)
CPU times: user 192 ms, sys: 1.06 s, total: 1.25 s Wall time: 4min 47s
def gather_statistics(state, chrom, start_pos, end_pos):
try:
f = open('tsi-%d-%d.frq' % (chrom, start_pos))
except:
# empty block
state['block_mafs'][(chrom, start_pos)] = []
return
f.readline()
for cnt, l in enumerate(f):
toks = [tok for tok in l.rstrip().split(' ') if tok != '']
maf = float(toks[-2])
state['snp_cnt'] += 1
state['sum_maf'] += maf
state['block_mafs'][(chrom, start_pos)].append(maf)
f.close()
stats = {'snp_cnt': 0, 'sum_maf': 0.0, 'block_mafs': defaultdict(list)}
traverse_genome(gather_statistics, state=stats)
print(stats['snp_cnt'], stats['sum_maf'] / stats['snp_cnt'])
1222126 0.23159641651
all_mafs = []
for mafs in stats['block_mafs'].values():
all_mafs.extend(mafs)
%time np.median(all_mafs)
CPU times: user 84 ms, sys: 4 ms, total: 88 ms Wall time: 132 ms
0.22159999999999999
all_mafs.sort()
middle = len(all_mafs) // 2
#array of even size
print((all_mafs[middle] + all_mafs[middle + 1]) / 2)
0.2216
stats.keys()
['block_mafs', 'snp_cnt', 'sum_maf']
import functools
def collect_mafs(state, chrom, start_pos, end_pos, block_mafs):
state[chrom] += len(block_mafs[(chrom, start_pos)])
chrom_cnts = defaultdict(int)
traverse_genome(functools.partial(collect_mafs, block_mafs=stats['block_mafs']),
state=chrom_cnts)
for chrom in range(1, 23):
print('%2d\t%6d' % (chrom, chrom_cnts[chrom]))
#block printing on the next chapter
1 100780 2 102377 3 84921 4 75819 5 78013 6 82169 7 67218 8 66803 9 56921 10 65166 11 62182 12 60381 13 46354 14 40525 15 37217 16 38443 17 33175 18 36144 19 21749 20 31872 21 16978 22 16919