import pandas as pd, numpy as np
import bs4, requests, json, os
# Work from the local data repository so the relative 'data/' paths used further down resolve.
# Alternative location on another machine, kept for reference:
# os.chdir('E:/Onedrive - Lancaster University/datarepo/influence/ro')
os.chdir('C:/users/csala/Onedrive - Lancaster University/datarepo/influence/ro')
# Site root; the hrefs harvested below are appended to it to build each page URL.
base_url='http://www.cdep.ro'
# Index page whose second table supplies the links.
url=base_url+'/pls/parlam/structura2015.ab?idl=1'
# The next two lines look like a notebook echo of `url` and its displayed value.
url
'http://www.cdep.ro/pls/parlam/structura2015.ab?idl=1'
r=requests.get(url)
soup = bs4.BeautifulSoup(r.content)
tables=soup.findAll('table')
# Only the table at index 1 is used as the source of links.
table=tables[1]
# Collect every anchor's href; passing through a set removes duplicates, so the list order is arbitrary.
links=list(set([l['href'] for l in table.findAll('a')]))
def state_format(j):
    """Return ``j`` with each newline-plus-space pair removed and both ends trimmed."""
    return j.replace('\n ', '').strip()
import pickle
def save_obj(obj, name ):
    """Pickle ``obj`` to ``data/<name>.pkl`` relative to the current directory.

    Args:
        obj: Any picklable object.
        name: File stem (string); ``'data/'`` is prepended and ``'.pkl'`` appended.
    """
    # Create the target directory on first use instead of failing with
    # FileNotFoundError when 'data/' does not exist yet.
    os.makedirs('data', exist_ok=True)
    with open('data/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name ):
    """Read back the object pickled at ``data/<name>.pkl`` and return it.

    Unpickling executes whatever the file encodes, so only point this at
    files this project wrote itself.
    """
    path = 'data/' + name + '.pkl'
    with open(path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload
# Scrape (or re-read from the pickled cache) every member page and flatten it into one dict per link.
# Accumulators filled by the loop below:
#   members      - one dict per parsed link
#   party_imgs   - party abbreviation -> {'name', 'img'} (first occurrence wins)
#   state_imgs   - state/group abbreviation -> image src (first occurrence wins)
#   parsed_links - links already handled, so each link is processed once
members=[]
party_imgs={}
state_imgs={}
parsed_links=set()
# If a cached copy of the pages is available, load it instead of downloading again.
load_from_pages=True
if load_from_pages:
pages=load_obj('pages')
else:
pages={}
for link in links:
if link not in parsed_links:
# Progress report each time the number of parsed links is a multiple of 100.
if len(parsed_links)%100==0: print(len(parsed_links)/len(links)*100,'%')
# idm / leg: the text between 'idm=' (resp. 'leg=') and the next '&' in the link.
idm=link[link.find('idm=')+4:link.find('idm=')+4+link[link.find('idm=')+4:].find('&')]
leg=link[link.find('leg=')+4:link.find('leg=')+4+link[link.find('leg=')+4:].find('&')]
url=base_url+link
# Either download the page and remember its markup in `pages`, or rebuild the soup from the cache.
if not load_from_pages:
r=requests.get(url)
soup = bs4.BeautifulSoup(r.content)
pages[link]=str(soup)
else:
soup=bs4.BeautifulSoup(pages[link])
name=soup.find('title').text
olddiv=soup.find('div',{'id':'olddiv'})
pretty_name=olddiv.find('h1').text
img=olddiv.find('img')['src']
# All 'boxDep' divs under the first child div of #olddiv; the first one supplies chamber, county and mandate dates.
divs=soup.find('div',{'id':'olddiv'}).find('div').findAll('div',{'class':'boxDep'})
camera=divs[0].find('h3').text
# Last character of the link, used below as a suffix for the '#BP#' abbreviation.
ikamera=link[-1]
judet=divs[0].find('a').text
# Text of the 'profile-pic-dep' div with newlines and the 'n.' prefix removed.
birth_ro=soup.find('div',{'class':'profile-pic-dep'}).text.replace('\n','').replace('n.','').strip()
start=''
end=''
# start: the text 14 characters after 'data valid' in the first box's markup, up to the next '<', without ':' and '-'.
if 'data valid' in repr(divs[0]):
start=repr(divs[0])[repr(divs[0]).find('data valid')+14:]
start=start[:start.find('<')]
start=start.replace(':','').replace('-','').strip()
# end: the same extraction, taken 10 characters after 'mandatului'.
if 'mandatului' in repr(divs[0]):
end=repr(divs[0])[repr(divs[0]).find('mandatului')+10:]
end=end[:end.find('<')]
end=end.replace(':','').replace('-','').strip()
# Per-page collectors, reset for every link.
comisii=[]
comisii_abbr=[]
parties=[]
parties_abbr=[]
gparties=[]
gparties_abbr=[]
state=[]
state_abbr=[]
state_img=[]
activitate=[]
# divformat: lower-case the div text, normalise 'subcomisia '/'comisiei ' to 'comisia ', split on 'comisia '
# and drop the piece before the first occurrence.
def divformat(div):
#return div.text.lower().replace('subcomisia ','#SUB#comisia ').replace('comisiei ','comisia ').split('comisia ')[1:]
return div.text.lower().replace('subcomisia ','comisia ').replace('comisiei ','comisia ').split('comisia ')[1:]
# Dispatch on each box's <h3> header text (substring matches).
for div in divs:
header=div.find('h3').text
if 'omisii permanente' in header:
comisii+=['Comisia '+j.replace('\n','').strip() for j in divformat(div)]
comisii_abbr+=[j.text for j in div.findAll('a')]
# NOTE(review): this branch reads rows from divs[1], not from the matched `div` - confirm that is intended.
if 'Biroului Permanent' in header:
comisii+=['#BP#'+i.text for i in divs[1].findAll('tr')]
comisii_abbr+=['#BP#'+ikamera for i in divs[1].findAll('tr')]
if 'omisii special' in header:
comisii+=['#SPEC#'+'Comisia '+j.replace('\n','').strip() for j in divformat(div)]
comisii_abbr+=['#SPEC#'+j.text for j in div.findAll('a')]
if 'omisii de anch' in header:
comisii+=['#ANCH#'+'Comisia '+j.replace('\n','').strip() for j in divformat(div)]
comisii_abbr+=['#ANCH#'+j.text for j in div.findAll('a')]
if 'lte comisii' in header:
comisii+=['Comisia '+j.replace('\n','').strip() for j in divformat(div)]
comisii_abbr+=[j.text for j in div.findAll('a')]
# Party rows are kept as tags here (assigned, not appended); their text is extracted after the loop.
if 'iunea politic' in header:
parties=[j for j in div.findAll('tr',{'valign':'center'}) if j.findAll('table')]
# Parliamentary-group text is split into '#GRUP#Grupul parlamentar...' entries.
if 'upul parlamentar:' in header:
if div.find('table'):
gparties=['#GRUP#Grupul parlamentar'+j.replace('\n','').strip()
for j in div.find('table').text.replace('Senator','Grupul parlamentar')\
.replace('Deputa','Grupul parlamentar').replace('Grupul Parlamentar','Grupul parlamentar')\
.split('Grupul parlamentar')][1:]
# The next three branches fill state/state_abbr/state_img, one entry per table row;
# rows without an <a> or <img> contribute ''. The 'mentare interna' rows are tagged '#INTER#'.
if 'altor state' in header:
states=div.findAll('tr')
state+=[state_format(states[j].text) for j in range(len(states))]
state_abbr+=['' if states[j].find('a')==None else states[j].find('a').text for j in range(len(states))]
state_img+=['' if states[j].find('img')==None else states[j].find('img')['src'] for j in range(len(states))]
if 'mentare interna' in header:
states=div.findAll('tr')
state+=['#INTER#'+state_format(states[j].text) for j in range(len(states))]
state_abbr+=['' if states[j].find('a')==None else '#INTER#'+states[j].find('a').text for j in range(len(states))]
state_img+=['' if states[j].find('img')==None else '#INTER#'+states[j].find('img')['src'] for j in range(len(states))]
if 'lte grupuri' in header:
states=div.findAll('tr')
state+=[state_format(states[j].text) for j in range(len(states))]
state_abbr+=['' if states[j].find('a')==None else states[j].find('a').text for j in range(len(states))]
state_img+=['' if states[j].find('img')==None else states[j].find('img')['src'] for j in range(len(states))]
# Activity rows: each non-empty row's text split on ':'.
if 'cifre' in header:
activitate=[j.text.split(':') for j in div.findAll('tr') if j.text.split(':')!=['']]
# Turn the party row tags into parallel text / abbreviation / image lists,
# then append the group entries with empty abbreviation and image.
party=[parties[j].find('table').text for j in range(len(parties))]
party_abbr=['' if parties[j].find('table').find('a')==None else parties[j].find('table').find('a').text for j in range(len(parties))]
party_img=['' if parties[j].find('img')==None else parties[j].find('img')['src'] for j in range(len(parties))]
party+=gparties
party_abbr+=['' for i in range(len(gparties))]
party_img+=['' for i in range(len(gparties))]
# Remember the first image seen for each state abbreviation.
for i in range(len(state_img)):
s=state_img[i]
t=state_abbr[i]
if t not in state_imgs:state_imgs[t]=s
# Remember the first name/image pair seen for each party abbreviation.
for i in range(len(party_img)):
s=party_img[i]
a=party_abbr[i]
t=party[i]
if a not in party_imgs:party_imgs[a]={'name':t,'img':s}
# One flat record per link; the list-valued fields are parallel lists.
members.append({'name':name,'birth_ro':birth_ro,'idm':idm,'link':url,'leg':leg,'start':start,
'img':img,'pretty_name':pretty_name,'camera':camera,'judet':judet,'end':end,
'party_abbr':party_abbr,'party':party,'state':state,'state_abbr':state_abbr,
'activitate':activitate,'comisii':comisii,'comisii_abbr':comisii_abbr})
parsed_links.add(link)
0.0 % 2.3261223540358222 % 4.6522447080716445 % 6.978367062107467 % 9.304489416143289 % 11.630611770179112 % 13.956734124214934 % 16.282856478250757 % 18.608978832286578 % 20.9351011863224 % 23.261223540358223 % 25.58734589439404 % 27.91346824842987 % 30.23959060246569 % 32.565712956501514 % 34.89183531053733 % 37.217957664573156 % 39.54408001860898 % 41.8702023726448 % 44.19632472668062 % 46.52244708071645 % 48.84856943475227 % 51.17469178878808 % 53.50081414282391 % 55.82693649685974 % 58.153058850895555 % 60.47918120493138 % 62.80530355896721 % 65.13142591300303 % 67.45754826703885 % 69.78367062107466 % 72.10979297511048 % 74.43591532914631 % 76.76203768318214 % 79.08816003721796 % 81.41428239125378 % 83.7404047452896 % 86.06652709932543 % 88.39264945336124 % 90.71877180739708 % 93.0448941614329 % 95.37101651546871 % 97.69713886950454 %
# Notebook echo: number of parsed records next to the number of harvested links.
len(members), len(links)
(4299, 4299)
# Persist the scraped records and both image lookup tables as one JSON document.
# A context manager is used so the file is flushed and closed as soon as the write finishes.
with open('data/members.json','w') as f:
    f.write(json.dumps({'members':members,
                        'party_imgs':party_imgs,
                        'state_imgs':state_imgs}))
6779963
# Reload the three tables from disk. The file is opened and parsed once
# (instead of once per table) and closed deterministically.
with open('data/members.json','r') as f:
    _members_doc=json.load(f)
members=_members_doc['members']
party_imgs=_members_doc['party_imgs']
state_imgs=_members_doc['state_imgs']
# Refresh the pickled page cache.
save_obj(pages,'pages')
Clean
# Default mandate bounds per legislature: `leg` value -> [start date, end date].
# These are used whenever a member record carries no explicit start/end.
legs = {
    '2016': ['2016-12-21', '2019-07-01'],
    '2012': ['2012-12-20', '2016-12-20'],
    '2008': ['2008-12-19', '2012-12-19'],
    '2004': ['2004-12-19', '2008-12-13'],
    '2000': ['2000-12-15', '2004-11-30'],
    '1996': ['1996-11-27', '2000-11-30'],
    '1992': ['1992-10-28', '1996-11-22'],
    '1990': ['1990-06-19', '1992-10-16'],
}
Manual fix for this Monitorul Oficial:
# Romanian month spellings -> English three-letter month names.
# The table is used both for direct lookup (date_ro / date_ro2) and for
# sequential substring replacement through replace_all, which walks the keys
# in insertion order, so the order of the entries is kept as it was.
# 'mai' is both the abbreviated and the full spelling; it is listed once
# (the literal previously repeated the key, which Python silently collapses).
ro_months = {
    # abbreviated forms
    'ian.': 'Jan',
    ' ia ': ' Jan ',
    'feb.': 'Feb',
    'mar.': 'Mar',
    'apr.': 'Apr',
    'mai': 'May',
    'iun.': 'Jun',
    ' iu ': ' Jun ',
    'iul.': 'Jul',
    'aug.': 'Aug',
    'sep.': 'Sep',
    'oct.': 'Oct',
    'noi.': 'Nov',
    'dec.': 'Dec',
    # full names
    'ianuarie': 'Jan',
    'februarie': 'Feb',
    'martie': 'Mar',
    'aprilie': 'Apr',
    'iunie': 'Jun',
    'iulie': 'Jul',
    'august': 'Aug',
    'septembrie': 'Sep',
    'octombrie': 'Oct',
    'noiembrie': 'Nov',
    'decembrie': 'Dec',
}
def date_ro(d,s=True):
    """Convert a "<Romanian month> <year>" fragment into a date.

    A fragment containing 'prezent' maps to the end date of the '2016'
    legislature. Otherwise parentheses are dropped, the text is split on
    single spaces, the first piece is looked up in ``ro_months`` and the
    second is taken as the year after three known typos are corrected.

    Args:
        d: Source text.
        s: When truthy return the first ten characters of the timestamp's
            string form; otherwise return the pandas timestamp itself.
    """
    if 'prezent' in d:
        stamp = pd.to_datetime(legs['2016'][1])
    else:
        fields = d.replace(')', '').replace('(', '').split(' ')
        year = fields[1]
        # Known mistyped years in the source pages.
        for typo, fixed in (('0201', '2001'), ('0092', '1992'), ('0213', '2013')):
            year = year.replace(typo, fixed)
        stamp = pd.to_datetime(ro_months[fields[0].strip()] + ' ' + year)
    return str(stamp)[:10] if s else stamp
# date_ro2: parse a "<day> <Romanian month> <year>" string and return it as a 'YYYY-MM-DD' string.
# NOTE(review): the `s` argument is never read - it is overwritten with the formatted date below,
# so a string is returned whatever value the caller passes.
def date_ro2(d,s=False):
# Repeated whitespace clean-up pass (20 rounds). The exact characters inside the two literals
# cannot be told apart in this view - presumably it collapses doubled or non-breaking spaces; TODO confirm.
for i in range(20):
d=d.replace(' ',' ')
# Pieces are expected as: day, month name (a key of ro_months), year.
ds=d.split(' ')
t=pd.to_datetime(ds[0]+' '+ro_months[ds[1]]+' '+ds[2])
s=str(t)[:10]
# Hard-coded correction of one specific date value.
if s=='2004-02-17': s='2004-12-17'
return s
# Manual birth dates for members whose page shows a life span instead of a birth date.
# Key: page title immediately followed by the "<birth year>-<death year>" text;
# value: birth date in a form pandas can parse.
decess = {
    'Ratiu Ion1917-2000': '6 Jun 1917',
    'Popovici Dan Ion Cristian1946-1996': '26 Dec 1946',
    'Palfi Mozes Zoltan1943-2011': '1 Jan 1943',
    'Blaga Ionel1929-1994': '17 Mar 1929',
    'Daraban Aurel1939-2004': '27 Sep 1939',
    'Croitoru Mircea-Adrian1941-1999': '1 Jan 1941',
    'Budeanu Radu1943-1997': '1 Jan 1943',
    'Coposu Corneliu1914-1995': '20 May 1914',
    'Ignat Miron1941-2018': '24 Aug 1941',
    'Nastase Toma1932-1997': '1 Jan 1932',
    'Musat Mircea1930-1994': '1 Jan 1930',
    'Stoica Stefan1976-2014': '1 Jan 1976',
    'Grama Mihail1924-1999': '1 Jan 1924',
    'Bot Octavian1951-2015': '1 Jan 1951',  # real 1 Jan
    'Iorgovan Antonie1948-2007': '9 Aug 1948',
    'Dinescu Valentin1955-2008': '25 Dec 1955',
    'Babias Iohan-Peter1952-2002': '28 Jun 1952',
    'Munteanu Mircea Mihai1933-1998': '26 May 1933',
    'Timis Ioan1951-2010': '17 Sep 1951',
    'Barbu Eugen1924-1993': '1 Jan 1924',
    'Tcaciuc Stefan1936-2005': '13 Jan 1936',
    'Mircovici Niculae1950-2016': '1 Oct 1950',
    'Rusu Horia Mircea1952-2001': '18 Sep 1952',
    'Racoceanu Viorel1962-2006': '8 Jun 1962',
    'Andrei Zeno1935-2001': '1 Jan 1935',
    'Surdu-Soreanu Raul-Victor1947-2011': '11 Jul 1947',
    'Dan Iosif1950-2007': '14 Oct 1950',
    'Dutu Ion1942-2000': '7 Oct 1942',
    'Bindea Liviu-Doru1957-2006': '26 Jul 1957',
    'Verestoy Attila1954-2018': '1 Mar 1954',
    'Dragomir Nelu Aristide1957-1995': '13 Oct 1957',
    'Micle Ulpiu-Radu-Sabin1935-2000': '1 Jan 1935',
    'Cojocariu Emil1938-1994': '2 Dec 1938',
    'Policrat Rene-Radu1910-1993': '12 Aug 1910',
    'Serban Gheorghe1954-1998': '25 Jun 1954',
    'Vladoiu Aurel1948-2015': '27 Jan 1948',
    'Preda Ion1947-2007': '1 Jan 1947',
    'Coste Marina-Adelina1965-2017': '30 Nov 1965',
    'Sincai Ovidiu1949-1999': '14 Dec 1949',
    'Grosaru Mircea1952-2014': '30 Jun 1952',
    'Florescu Nicolae-Doru1960-2001': '1 jan 1960',
    'Sinko Stefan1939-1995': '1 Jan 1939',
    'Alecsandrescu Nicolae1923-1993': '1 Jan 1923',
    'Ratoi Neculai1939-2016': '15 Mar 1939',
    'Ichim Mircea-Adrian1944-1993': '1 Jan 1944',
    'Fotopolos Sotiris1937-2008': '6 Dec 1937',
    'Ciobanu Gheorghe1964-2015': '22 Sep 1964',
    'Dumitrescu Liana1973-2011': '20 Jan 1973',
    'Dida Corneliu Ioan1942-2008': '26 May 1942',
    'Draghici Sonia-Maria1956-2016': '25 Jul 1956',
}
# Substrings that are deleted (replaced with '') when this table is passed to replace_all.
state_abbs = dict.fromkeys(('membru', '\xa0', 'supleant'), '')
def replace_all(text, dic):
    """Apply every ``key -> value`` substitution of ``dic`` to ``text``, one after another.

    Substitutions run in the dict's iteration order and each one sees the
    result of the previous, so earlier replacements can feed later ones.
    """
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
def president(s):
    """Strip the first recognised role word from ``s`` and report its marker.

    The role words are tried in a fixed order; only the first one found is
    handled. Every occurrence of that word is removed from ``s``.

    Returns:
        ``(text, marker)`` where ``text`` is the trimmed remainder and
        ``marker`` is one of '#VP#', '#PRES#', '#SECR#', '#TREZ#', '#CHES#',
        '#SEF#', or '' when no role word is present.
    """
    # Order matters: the vice-president spellings must be tested before the
    # president ones they contain.
    role_markers = (
        ('Vicepreşedinte', '#VP#'),
        ('vicepreşedinte', '#VP#'),
        ('preşedinte', '#PRES#'),
        ('Preşedinte', '#PRES#'),
        ('Secretar', '#SECR#'),
        ('secretar', '#SECR#'),
        ('Trezorier', '#TREZ#'),
        ('trezorier', '#TREZ#'),
        ('chestor', '#CHES#'),
        ('Chestor', '#CHES#'),
        ('Şeful', '#SEF#'),
    )
    abb = ''
    for word, marker in role_markers:
        if word in s:
            s = s.replace(word, '')
            abb = marker
            break
    return s.strip(), abb
# Role words in parliamentary-group text -> '$<marker>%' placeholders for replace_all.
# After substitution the text is split on '$' and '%' is swapped for the group abbreviation.
# The entries are applied in this order.
gpresident = dict([
    ('Vicelider', '$#VP#%'),
    ('vicelider', '$#VP#%'),
    ('Secretar', '$#SECR#%'),
    ('secretar', '$#SECR#%'),
    ('Lider', '$#PRES#%'),
    ('Purtător de cuvânt', '$#PRCV#%'),
    ('lider', '$#PRES#%'),
])
# Role words in committee text -> '$<marker>%' placeholders for replace_all.
# After substitution the text is split on '$' and '%' is swapped for the committee abbreviation.
# The entries are applied in this order.
gpresident2 = dict([
    ('Vicepreşedinte', '$#VP#%'),
    ('vicepreşedinte', '$#VP#%'),
    ('preşedinte', '$#PRES#%'),
    ('Preşedinte', '$#PRES#%'),
    ('Secretar', '$#SECR#%'),
    ('secretar', '$#SECR#%'),
])
# Merge the per-link records in `members` into one entry per person in `nmembers`,
# keyed by "<name> | <birth date>". Along the way collect:
#   party_set / country_set / comisii_set - every abbreviation that was emitted
#   deaths                                - person key -> death text, for records showing a life span
nmembers={}
nparty_imgs={}
nstate_imgs={}
party_set=set()
country_set=set()
comisii_set=set()
deaths={}
# Iterates over a shallow copy of the list.
for i in members[:]:
# --- identity ---
# No '-' in birth_ro: translate the Romanian month via ro_months and parse it.
# Otherwise birth_ro is a "<year>-<year>" span: the birth date comes from the manual `decess`
# table (keyed by name+span) and the text after the first '-' is kept as the death value.
if '-' not in i['birth_ro']:
birth=str(pd.to_datetime(replace_all(i['birth_ro'],ro_months)))
death=''
else:
birth=str(pd.to_datetime(replace_all(i['name']+i['birth_ro'],decess)))
death=i['birth_ro'].split('-')[1].strip()
# A parsed string of three characters or fewer (presumably 'NaT' - TODO confirm) counts as unknown.
if len(birth)>3:
name=i['name']+' | '+birth[:10]
else:
name=i['name']+' | Ismeretlen'
if death:
if name not in deaths:
deaths[name]=death
# Default mandate window from the legislature table, overridden by explicit start/end when present.
since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since=date_ro2(i['start'],True)
if i['end']: until=date_ro2(i['end'])
pretty_name,pres=president(i['pretty_name'])
# --- per-person record: scalar 'Name', plus lists that receive one value per mandate ---
if name not in nmembers:
nmembers[name]={}
if 'Name' not in nmembers[name]:
nmembers[name]['Name']={'full':i['pretty_name'],'simple':name,'short':i['name']}
if 'Photo' not in nmembers[name]:
nmembers[name]['Photo']=[]
nmembers[name]['Photo'].append(base_url+i['img'])
if 'UserID' not in nmembers[name]:
nmembers[name]['UserID']=[]
nmembers[name]['UserID'].append(i['idm'])
if 'Camera' not in nmembers[name]:
nmembers[name]['Camera']=[]
# Role marker taken from the chamber text, falling back to the display name.
dummy,pres=president(i['camera'])
if not pres:
dummy,pres=president(i['pretty_name'])
nmembers[name]['Camera'].append(pres+i['camera'])
if 'Starts' not in nmembers[name]:
nmembers[name]['Starts']=[]
nmembers[name]['Starts'].append(since)
if 'Ends' not in nmembers[name]:
nmembers[name]['Ends']=[]
nmembers[name]['Ends'].append(until)
if 'Link' not in nmembers[name]:
nmembers[name]['Link']=[]
nmembers[name]['Link'].append(i['link'])
# --- parties and parliamentary groups ---
if 'Parties' not in nmembers[name]:
nmembers[name]['Parties']=[]
parties=i['party']
parties_abbr=i['party_abbr']
# Manual index because the 'se transf' branch below advances pi by extra entries.
pi=0
while pi<(len(parties)):
p=parties[pi].strip()
p,pres=president(p)
#reset since
since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since=date_ro2(i['start'],True)
if i['end']: until=date_ro2(i['end'])
#process
# Entries without the '#GRUP#' tag are party rows; tagged ones are parliamentary groups.
if '#GRUP#' not in p:
# No '-' in the text: either the literal 'independent' or a row filed under 'Minorități'.
if '-' not in p:
if p!='independent':
abbr='Minorități'
party=p
else:
abbr='Independent'
party='Independent'
else:
abbr=parties_abbr[pi]
party=p.split('-')[1].strip()
# 'din <date>' narrows the start; 'din R...' / 'din B...' are skipped (presumably part of a name, not a date - TODO confirm).
if ('din ') in party:
if ('din R') not in party:
if ('din B') not in party:
since=max(since,date_ro(party[party.find('din ')+4:].strip(),True))
party='Independent'
#else:
if True: #keep, for cases with both 'din' and 'pana'
# Anything after the second '-' may carry 'din <date>' and/or 'până în <date>' bounds.
if len(p.split('-'))>2:
s=''.join(p.split('-')[2:]).strip()
if ('din ') in s:
if ('din R') not in s:
since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
if ('până în ') in s:
until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
if abbr in ['independent','','Neafiliaţi']:
abbr='Independent'
nmembers[name]['Parties'].append({'party':pres+abbr,
'start':since,'end':until,'judet':i['judet']})
party_set.add(abbr)
else:
# Group entry: start again from the raw text (the role word must still be present for gpresident).
p=parties[pi].strip()
if 'se transf' not in p:
# Abbreviation = text before the first non-breaking space.
abbr=p.split('\xa0')[0]
# Role words become '$<marker>%'; each '$'-separated piece then yields one entry.
p=replace_all(p,gpresident)
for pk in p.split('$'):
kabbr=pk.replace('%',abbr).replace(' ',' ').replace(' ',' ').replace(' ',' ').strip()
pabbr=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
if ('din ') in s:
since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
if ('până în ') in s:
until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
nmembers[name]['Parties'].append({'party':pabbr,
'start':since,'end':until,'judet':i['judet']})
party_set.add(pabbr)
else:
# Transfer notice: the following entries are read as well (pi+1 names the other group and the
# date after ' în '; pi+2, clamped to the last index, supplies the group text), then skipped via pi+=2.
p0=parties[min(pi+2,len(parties)-1)].strip()
p1=parties[pi+1].strip()
abbr0=p0.split('\xa0')[0]
abbr1=p1.split(' în ')[0]
since2=date_ro(p1.split(' în ')[1].replace('-','. ').replace('fost',''))
p0=replace_all(p0,gpresident)
for pk in p0.split('$'):
kabbr0=pk.replace('%',abbr0).replace(' ',' ').replace(' ',' ').replace(' ',' ').strip()
pabbr0=kabbr0[:kabbr0.find(abbr0)+len(abbr0)].strip().split('(')[0].strip()
s=kabbr0[kabbr0.find(abbr0)+len(abbr0):].strip()
if ('din ') in s:
since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
if ('până în ') in s:
until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
# Two entries split at the transfer date since2.
nmembers[name]['Parties'].append({'party':pabbr0,
'start':since,'end':since2,'judet':i['judet']})
nmembers[name]['Parties'].append({'party':abbr1,
'start':since2,'end':until,'judet':i['judet']})
party_set.add(pabbr0)
party_set.add(abbr1)
pi+=2
pi+=1
# --- friendship groups / delegations ---
if 'Countries' not in nmembers[name]:
nmembers[name]['Countries']=[]
states=i['state']
states_abbr=i['state_abbr']
for pi in range(len(states)):
p=states[pi].strip()
p,pres=president(p)
# '#INTER#' entries are dropped unless they match one of the three whitelisted substrings.
cont=True
if '#INTER#' in p:
cont=False
if 'Bucureşti - Chişinău' in p:cont=True
if 'elega' in p:cont=True
if 'Europol' in p:cont=True
if cont:
abb_state=state_format(states_abbr[pi])
#reset since
since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since=date_ro2(i['start'],True)
if i['end']: until=date_ro2(i['end'])
#process
# Text after the first '-' (with the state_abbs words removed) may carry date bounds.
if '-' in p:
state=p.split('-')[0]
s=replace_all(''.join(p.split('-')[1:]).strip(),state_abbs).strip()
if ('din ') in s:
if ('din R') not in s:
since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
if ('până în ') in s:
until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
nmembers[name]['Countries'].append({'country':pres+abb_state,'start':since,'end':until})
country_set.add(abb_state)
# --- committees and Permanent Bureau ---
if 'Groups' not in nmembers[name]:
nmembers[name]['Groups']=[]
comisii=i['comisii']
comisii_abbr=i['comisii_abbr']
# Indexes both lists by position of comisii_abbr; assumes they are parallel - TODO confirm.
for pi in range(len(comisii_abbr)):
p=comisii[pi].strip()
abbr=comisii_abbr[pi].strip()
# '#BP#' entries (tagged in the scraping step) are handled separately from ordinary committees.
if '#BP#' in p:
p=replace_all(p,state_abbs)
p,pres=president(p)
if '#BP#Atribu' not in p:
#reset since
since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since=date_ro2(i['start'],True)
if i['end']: until=date_ro2(i['end'])
# "<from> - <to>" session range; a short left side borrows the last four characters of the right side.
if '-' in p:
s0=p.replace('#BP#','').replace('în sesiunea parlamentară:','').replace('atributii','')
s=s0.split('-')
if len(s[0])<6:
s[0]=s[0]+s[1].strip()[-4:]
since=max(since,date_ro(s[0].strip(),True))
until=min(until,date_ro(s[1].strip(),True))
# NOTE(review): at this point `s` may be the list produced by split('-') above, so the two
# `in s` tests below would check list membership rather than substrings of s0 - confirm intent.
if ('din ') in s0:
if ('din sumele') not in s:
since=max(since,date_ro(s0[s0.find('din ')+4:].strip(),True))
if ('până în ') in s:
until=min(until,date_ro(s0[s0.find('până în ')+8:].strip(),True))
nmembers[name]['Groups'].append({'group':pres+abbr,
'start':since,'end':until})
comisii_set.add(abbr)
else:
#process
# Role words become '$<marker>%'. No '$' means plain membership; otherwise the text before and
# after the first '$' are evaluated as two separate periods (pieces [0] and [1]).
p=replace_all(p,gpresident2)
if '$' not in p:
since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since=date_ro2(i['start'],True)
if i['end']: until=date_ro2(i['end'])
kabbr=p.replace(' ',' ').replace(' ',' ').replace(' ',' ').strip()
pabbr=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
# Dates are read from a fixed 9-character window after the keyword.
if ('din ') in s:
if ('din sumele') not in s:
since=max(since,date_ro(s[s.find('din ')+4:s.find('din ')+4+9].replace(')','').strip(),True))
if ('până în ') in s:
until=min(until,date_ro(s[s.find('până în ')+8:s.find('până în ')+8+9].replace(')','').strip(),True))
nmembers[name]['Groups'].append({'group':pabbr,
'start':since,'end':until})
comisii_set.add(pabbr)
else:
# First piece -> (pabbr0, since0, until0).
pk=p.split('$')[0]
since0=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until0=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since0=date_ro2(i['start'],True)
if i['end']: until0=date_ro2(i['end'])
kabbr=pk.replace('%',abbr).replace(' ',' ').replace(' ',' ').replace(' ',' ').strip()
pabbr0=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
if ('din ') in s:
if ('din sumele') not in s:
since0=max(since0,date_ro(s[s.find('din ')+4:s.find('din ')+4+9].replace(')','').strip(),True))
if ('până în ') in s:
until0=min(until0,date_ro(s[s.find('până în ')+8:s.find('până în ')+8+9].replace(')','').strip(),True))
if ('-') in s:
if len(s.split('-')[-1])>4:
s0=s.split('-')[0].strip()
s1=s.split('-')[1].strip()
if len(s0)<6:
s0=s0+' '+s1.split(' ')[1].strip()
since0=max(since0,date_ro(s0,True))
until0=min(until0,date_ro(s1,True))
# Second piece -> (pabbr1, since1, until1), computed the same way.
pk=p.split('$')[1]
since1=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until1=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since1=date_ro2(i['start'],True)
if i['end']: until1=date_ro2(i['end'])
kabbr=pk.replace('%',abbr).replace(' ',' ').replace(' ',' ').replace(' ',' ').strip()
pabbr1=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
if ('din ') in s:
if ('din sumele') not in s:
since1=max(since1,date_ro(s[s.find('din ')+4:s.find('din ')+4+9].replace(')','').strip(),True))
if ('până în ') in s:
until1=min(until1,date_ro(s[s.find('până în ')+8:s.find('până în ')+8+9].replace(')','').strip(),True))
if ('-') in s:
if len(s.split('-')[-1])>4:
s0=s.split('-')[0].strip()
s1=s.split('-')[1].strip()
if len(s0)<6:
s0=s0+' '+s1.split(' ')[1].strip()
since1=max(since1,date_ro(s0,True))
until1=min(until1,date_ro(s1,True))
# Clamp the second window inside the first, then emit one, two or three entries depending on
# how the two windows relate (identical / strictly inside / shared start / shared end).
since1=max(since0,since1)
until1=min(until0,until1)
if since0==since1 and until0==until1:
nmembers[name]['Groups'].append({'group':pabbr1,
'start':since1,'end':until1})
comisii_set.add(pabbr1)
elif since0<since1 and until0>until1:
nmembers[name]['Groups'].append({'group':pabbr0,
'start':since0,'end':since1})
comisii_set.add(pabbr0)
nmembers[name]['Groups'].append({'group':pabbr1,
'start':since1,'end':until1})
comisii_set.add(pabbr1)
nmembers[name]['Groups'].append({'group':pabbr0,
'start':until1,'end':until0})
comisii_set.add(pabbr0)
elif until1<until0:
nmembers[name]['Groups'].append({'group':pabbr1,
'start':since0,'end':until1})
comisii_set.add(pabbr1)
nmembers[name]['Groups'].append({'group':pabbr0,
'start':until1,'end':until0})
comisii_set.add(pabbr0)
elif since1>since0:
nmembers[name]['Groups'].append({'group':pabbr0,
'start':since0,'end':since1})
comisii_set.add(pabbr0)
nmembers[name]['Groups'].append({'group':pabbr1,
'start':since1,'end':until0})
comisii_set.add(pabbr1)
#reset since
since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
until=str(pd.to_datetime(legs[i['leg']][1]))[:10]
if i['start']: since=date_ro2(i['start'],True)
if i['end']: until=date_ro2(i['end'])
#process
# --- activity counters ---
if 'Activity' not in nmembers[name]:
nmembers[name]['Activity']={}
activitate=i['activitate']
for a in activitate:
if len(a)>1:
an=a[0]
if an not in ['Membru în']:
# Leading integer of the value text: everything before the first '(', ',' or '-'.
av=int(a[1].split('(')[0].strip().split(',')[0].strip().split('-')[0].strip())
if an not in nmembers[name]['Activity']:nmembers[name]['Activity'][an]=[]
nmembers[name]['Activity'][an].append({'value':av,'start':since,'end':until})
# For speeches, the "(în N sedinte)" count is stored as well.
if an=='Luari de cuvânt':
if 'în ' in a[1]:
av=int(a[1].split('(în ')[-1].strip().split('sedinte)')[0].strip())
# NOTE(review): this assigns a single dict (replacing any earlier value), whereas the other
# Activity entries are lists that are appended to - confirm whether that is intended.
nmembers[name]['Activity']['Sedințe']={'value':av,'start':since,'end':until}
# nmembers['Popescu-Tariceanu Calin Constantin Anton | 1952-01-14']
# nmembers['Stanisoara Mihai | 1962-06-11']
#nmembers['Verestoy Attila | 1954-03-01']['Parties']
#nmembers['Turcan Raluca | 1976-04-02']['Parties']
# Notebook echo: merged persons, raw per-link records, and recorded deaths.
len(nmembers),len(members),len(deaths)
(2818, 4299, 50)
# Write the merged per-person table; the context manager closes the file once the write is done.
with open('data/nmembers.json','w') as f:
    f.write(json.dumps(nmembers))
6860931
# Write the person -> death value table; the context manager closes the file once the write is done.
with open('data/deaths.json','w') as f:
    f.write(json.dumps(deaths))
2021
No need to re-run
def party_cleaner(i,p):
    """Reduce a scraped party label ``p`` to the bare party name.

    When ``p`` equals the abbreviation ``i`` it is returned untouched.
    Otherwise the first ``len(i) + 1`` characters are dropped and the text
    up to the first '-' is returned, trimmed.
    """
    if i != p:
        tail = p[len(i) + 1:]
        p = tail.partition('-')[0].strip()
    return p
#! run once
# Align party_imgs with the abbreviations actually emitted into party_set:
# add a placeholder for every abbreviation without an entry, drop entries that were never emitted.
for i in party_set.difference(set(party_imgs.keys())):
    party_imgs[i]={'name': i, 'img': ''}
for i in set(party_imgs.keys()).difference(party_set):
    party_imgs.pop(i)
# Reduce each scraped label to the bare party name.
for i in party_imgs:
    party_imgs[i]['name']=party_cleaner(i,party_imgs[i]['name'])
# Re-key the state images by their cleaned abbreviation; '' maps to no image.
nstate_imgs={}
for i in state_imgs:
    nstate_imgs[state_format(i)]=state_imgs[i]
nstate_imgs['']=''
#img=json.loads(open('data/img.json','r').read())
img={'party':party_imgs,'state':nstate_imgs}
#manual
img['party']['PP-DD']['name']='Partidul Poporului Dan Diaconescu'
img['party']['PUR-SL']['name']='Partidul Umanist din România'
# RMDSZ gets its own copy of the UDMR entry. Sharing the same dict object would make the
# rename on the next line overwrite UDMR's name as well.
img['party']['RMDSZ']=dict(img['party']['UDMR'])
img['party']['RMDSZ']['name']='Romániai Magyar Demokrata Szövetség'
# Context manager so the file is closed as soon as the JSON has been written.
with open('data/img.json','w') as f:
    f.write(json.dumps(img))
48161