import pandas as pd
import requests
from bs4 import BeautifulSoup
# Page holding the positioned-HTML rendering of the election results list.
url = 'http://www.myanmars.net/enews/2010/myanmar20101113.html'
# Without a timeout requests will wait indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than parsing an error page and failing
# later when the marker sentence cannot be found.
page.raise_for_status()
# NOTE(review): no parser is named, so BeautifulSoup uses whichever one is
# installed; pin one (e.g. 'html.parser') once the results are confirmed to
# be the same under it.
soup = BeautifulSoup(page.content)
def dictify(txt):
    """Parse an inline CSS style string into a dict of property -> value.

    ``txt`` is a ``;``-separated list of ``name:value`` declarations, e.g.
    ``'position:absolute;left:100;top:200'``.  Names and values are returned
    as stripped strings.

    Empty declarations (such as the one produced by a trailing ``;``) and
    declarations with no ``:`` are skipped.  Only the first ``:`` separates
    name from value, so values that themselves contain ``:`` are kept whole.
    """
    attrs = {}
    for item in txt.split(';'):
        # partition splits on the first ':' only, and never raises when the
        # separator is absent.
        name, sep, value = item.partition(':')
        name = name.strip()
        if not sep or not name:
            continue
        attrs[name] = value.strip()
    return attrs
def approxify(x, y, tolerance=3):
    """Return True when ``x`` and ``y`` differ by strictly less than ``tolerance``.

    The default tolerance of 3 is the value the function originally had
    hard-coded; callers that pass only ``x`` and ``y`` behave as before.
    """
    return abs(x - y) < tolerance
# Sentence used to locate the start of the results list: the elements that
# follow its grandparent node are the ones scanned below.
txt = 'Pyithu Hluttaw representatives in the constituencies shown against them.'

# Tally how many of those elements share each 'left' offset in their inline
# style.
leftCounters = {}
for line in soup.find(text=txt).parent.parent.find_next_siblings():
    attrs = dictify(line['style'])
    currLeft = int(attrs['left'])
    leftCounters[currLeft] = leftCounters.get(currLeft, 0) + 1

# Report only the offsets that occur more than three times.
for c, count in leftCounters.items():
    if count > 3:
        print(c, count)
(386, 5) (653, 22) (783, 20) (927, 20) (100, 7) (488, 4) (241, 5) (628, 23) (119, 5) (1023, 25)
# Items whose 'left' offset is below this value are filed under the first
# page column (cols1); all others go to the second (cols2).
colSep = 600

# 'left' offsets compared (via approxify) against each item in the main loop:
# an item at one of these offsets whose text is not a row number is treated
# as a state heading.  Presumably the leftmost cell of each page column,
# chosen from the offset tally printed above -- verify against the page.
lefts = [100, 628]
#---
import re
# Group the positioned text fragments into table rows keyed by their 'top'
# offset, separately for each page column, and record where every state
# heading sits.
prevLeft = 0
prevTop = 0
cols1 = {}         # top offset -> list of fragment texts, left of colSep
cols2 = {}         # same, for fragments at or right of colSep
topCounters1 = []  # distinct (snapped) top offsets seen in cols1
topCounters2 = []  # distinct (snapped) top offsets seen in cols2
newState = False
currstate = ''
stateReps = {}     # heading text -> (page column number, top offset)
col = 1
for line in soup.find(text=txt).parent.parent.find_next_siblings():
    attrs = dictify(line['style'])
    currLeft = int(attrs['left'])
    # Skip fragments whose left offset occurs fewer than four times overall.
    if leftCounters[currLeft] < 4:
        continue
    currTop = int(attrs['top'])

    # Pick the containers for this fragment's page column.  col is set in
    # both branches so that it always describes the current fragment instead
    # of sticking at 2 once the right-hand column has been seen.
    if currLeft < colSep:
        lines = cols1
        tops = topCounters1
        col = 1
    else:
        lines = cols2
        tops = topCounters2
        col = 2

    itemtxt = line.find('span').text

    # A fragment at one of the offsets in lefts whose text does not start
    # with optional digits followed by '.' is taken to be a state heading:
    # remember its position and keep it out of the row data.
    newState = (any(approxify(i, currLeft) for i in lefts)
                and not re.match(r'\d*\.', itemtxt))
    if newState:
        currstate = itemtxt
        stateReps[currstate] = (col, currTop)
        newState = False
        continue

    # Snap this top offset onto an already-seen one within approxify's
    # tolerance, so fragments of one visual row share a single key.
    for i in tops:
        if approxify(currTop, i):
            currTop = i
    if currTop not in tops:
        tops.append(currTop)
    lines.setdefault(currTop, []).append(itemtxt)

    # TODO: currTop < prevTop would indicate the start of a new column;
    # nothing acts on that yet.
    prevLeft = currLeft
    prevTop = currTop
stateReps
{u'Kachin State': (1, 1475), u'Kayin State': (2, 1251), u'Rakhine State': (2, 1401), u'Sr.': (2, 1008)}
cols1
{1494: [u'1.', u'U Thein Zaw', u'10/KaHtaNa(N)009269', u'Myitkyina'], 1532: [u'2.', u'U Yein Borm', u'1/WaMaNa(N)003327', u'Waingmaw', u'NUP'], 1551: [u'3.', u'U Myo Swe', u'4/HaKhaNa(N)009578', u'Tanai', u'USDP'], 1569: [u'4.', u'U Kyaw Soe Lay', u'1/MaNyaNa(N)033969', u'Mohnyin', u'USDP'], 1588: [u'5.', u'U Phone Swe', u'7/AhTaNa(N)004822', u'Mogaung', u'USDP']}
cols2
{1044: [u'6.', u'U Ohn Myint', u'12/YaKaNa(N)010442', u'Phakant', u'USDP'], 1063: [u'7.', u'U Lun Maung', u'10/MaDaNa(N)029996', u'Bhamo', u'USDP'], 1081: [u'8.', u'U Zaw Tun', u'1/MaMaNa(N)025647', u'Momauk', u'Shan Nationalities'], 1100: [u'Democratic Party'], 1119: [u'9.', u'U Tun Thein (a)', u'1/MaKaNa(N)028072', u'Mansi', u'USDP'], 1138: [u'U Tun Tun'], 1157: [u'10.', u'U Win Naing (a)', u'1/YaKaNa(N)003024', u'Shweku', u'USDP'], 1175: [u'U Phyu'], 1194: [u'11.', u'Daw Dwe Bu', u'1/MaKaNa(N)037405', u"M'Jangyang", u'Unity and Democracy'], 1213: [u'Party of Kachin State'], 1232: [u'(UDPKS)'], 1269: [u'12.', u'Daw Nan Sae Owa', u'3/BaAhNa(N)020987', u'Hpa-an', u'Phalon-Sawaw'], 1288: [u'Democratic Party'], 1307: [u'13.', u'U Saw Thein Aung', u'3/BaAhNa(N)009995', u'Hlaingbwe', u'PSDP'], 1326: [u'14.', u'U Sai Than Naing', u'3/PhaPaNa(N)000917', u'Papun', u'USDP'], 1345: [u'15.', u'U Saw Nay Kaw Kyee', u'3/ThaTaNa(N)013166', u'Thandaunggyi', u'Kayin Peoples Party'], 1364: [u'16.', u'U Thurein Zaw', u'13/KaTaNa(N)004681', u'Kawkareik', u'USDP'], 1382: [u'17.', u'U Saw Htut Khaung Lwin', u'3/KaHsaKa(N)017187', u'Kya-in-Seikkyi', u'USDP'], 1420: [u'18.', u'U Maung Nyo', u'10/SaTaNa(N)006844', u'Sittway', u'Rakhine Nationalities'], 1439: [u'Development Party'], 1458: [u'19.', u'U Tun Aung Kyaw', u'11/SaTaNa(N)005967', u'Ponnagyun', u'RNDP'], 1476: [u'20.', u'U Aung Tun Tha', u'11/MaUNa(N)005022', u'MraukU', u'RNDP'], 1495: [u'21.', u'U Tha Sein', u'12/UTaKa(N)015239', u'Kyauktaw', u'RNDP'], 1514: [u'22.', u'U Aung Sein Tha', u'11/SaTaNa(N)010908', u'Minbya', u'RNDP'], 1533: [u'23.', u'U Pe Than', u'11/MaPaTa(N)007521', u'Myebon', u'RNDP'], 1552: [u'24.', u'U Aung Kyaw Zan', u'7/ThaKaNa(N)061833', u'Pauktaw', u'RNDP'], 1570: [u'25.', u'Daw Khin Saw Wai', u'12/MaGaTa(N)069675', u'Yathedaung', u'RNDP']}
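There are still end effects, e.g. when a party name overflows onto a second line, the overflow ends up in a row of its own (see `[u'Democratic Party']` above). Also, the state in which each candidate is standing isn't captured in the rows.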