# Sample input: one record per line, three space-separated fields that are
# later loaded as the columns "e-value", "start" and "end".
s='''1.83155e-63 8 25
7.37596e-64 10 20
9.14344e-65 7 24
6.86568e-72 30 50
6.45089e-69 24679455 24680333
4.49086e-56 24679455 24680312
1.78896e-52 35167152 35167547
2.57611e-51 35167209 35167547'''
# IPython magic (not plain Python): writes the string held in `s` to data.txt.
%store s >data.txt
Writing 's' (str) to file 'data.txt'.
import pandas as pd
# The file has no header row, so the column labels are supplied explicitly.
column_names = ["e-value", "start", "end"]
df = pd.read_csv("data.txt", sep=" ", names=column_names)
# Last expression of the cell: displays the first rows of the frame.
df.head()
e-value | start | end | |
---|---|---|---|
0 | 1.831550e-63 | 8 | 25 |
1 | 7.375960e-64 | 10 | 20 |
2 | 9.143440e-65 | 7 | 24 |
3 | 6.865680e-72 | 30 | 50 |
4 | 6.450890e-69 | 24679455 | 24680333 |
def range_extract(lst):
    """Yield 2-tuple ranges or 1-tuple single elements from increasing ints.

    Consecutive values (each exactly one more than the previous) are grouped
    into a run. A run of two or more values is yielded as ``(low, hi)``; an
    isolated value is yielded as ``(low,)``. Treating a run of just two
    values as a range is the main difference from the code this was
    modified from:
    https://www.rosettacode.org/wiki/Range_extraction#Python

    Args:
        lst: Any iterable of ints in increasing order (a list, or a
            generator/iterator -- no ``len()`` or indexing is required).

    Yields:
        ``(low, hi)`` for each run of consecutive values, ``(low,)`` for
        each value that has no consecutive neighbour.
    """
    values = iter(lst)
    try:
        low = hi = next(values)
    except StopIteration:
        # Empty input: nothing to yield.
        return
    for value in values:
        if value == hi + 1:
            # Still inside the current run; just extend its upper end.
            hi = value
            continue
        # Run ended: emit it, then start a new run at the current value.
        yield (low, hi) if hi - low >= 1 else (low,)
        low = hi = value
    # Emit the final run, which the loop never closes on its own.
    yield (low, hi) if hi - low >= 1 else (low,)
def printr(ranges):
    """Print the given ranges, one per line.

    A 2-tuple is rendered as ``low:hi``; a 1-tuple as the bare number.
    All lines are joined and written with a single ``print`` call.
    """
    rendered = []
    for entry in ranges:
        if len(entry) == 2:
            rendered.append('%i:%i' % entry)
        else:
            rendered.append('%i' % entry)
    print('\n'.join(rendered))
def expand_all_ranges_to_each_position(the_min, the_max):
    """Return every position from ``the_min`` to ``the_max``, both inclusive.

    The result is a list; it is empty when ``the_max`` is below ``the_min``.
    """
    # range() excludes its stop value, so step one past the upper boundary.
    stop = the_max + 1
    return [*range(the_min, stop)]
# Gather every individual position covered by any row. Using a set means a
# position shared by overlapping intervals is kept only once.
covered = set()
for row in df.itertuples():
    bounds = (row.start, row.end)
    # min/max so a row works whichever of start/end is the smaller value.
    covered.update(expand_all_ranges_to_each_position(min(bounds), max(bounds)))
all_positions = sorted(covered)

# Collapse runs of consecutive positions back into intervals and print them.
for lst in (all_positions,):
    printr(range_extract(sorted(lst)))
7:25 30:50 24679455:24680333 35167152:35167547