In [1]:
import pandas as pd, numpy as np
import bs4, requests, json, os
In [4]:
# os.chdir('E:/Onedrive - Lancaster University/datarepo/influence/ro')
os.chdir('C:/users/csala/Onedrive - Lancaster University/datarepo/influence/ro')
In [5]:
base_url='http://www.cdep.ro'
url=base_url+'/pls/parlam/structura2015.ab?idl=1'
In [6]:
url
Out[6]:
'http://www.cdep.ro/pls/parlam/structura2015.ab?idl=1'
In [7]:
r=requests.get(url)
soup = bs4.BeautifulSoup(r.content)
In [8]:
tables=soup.findAll('table')
table=tables[1]
In [9]:
links=list(set([l['href'] for l in table.findAll('a')]))
In [10]:
def state_format(j):
    j=j.replace('\n ','')
    j=j.strip()
    return j
In [11]:
import pickle
In [12]:
def save_obj(obj, name ):
    with open('data/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('data/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
In [135]:
members=[]
party_imgs={}
state_imgs={}
parsed_links=set()

#if available, load, dont parse
load_from_pages=True
if load_from_pages:
    pages=load_obj('pages')
else:
    pages={}
In [136]:
for link in links:
    if link not in parsed_links:
        if len(parsed_links)%100==0: print(len(parsed_links)/len(links)*100,'%')
        idm=link[link.find('idm=')+4:link.find('idm=')+4+link[link.find('idm=')+4:].find('&')]
        leg=link[link.find('leg=')+4:link.find('leg=')+4+link[link.find('leg=')+4:].find('&')]
        url=base_url+link
        if not load_from_pages:
            r=requests.get(url)
            soup = bs4.BeautifulSoup(r.content)
            pages[link]=str(soup)
        else:
            soup=bs4.BeautifulSoup(pages[link])
        name=soup.find('title').text
        olddiv=soup.find('div',{'id':'olddiv'})
        pretty_name=olddiv.find('h1').text
        img=olddiv.find('img')['src']
        divs=soup.find('div',{'id':'olddiv'}).find('div').findAll('div',{'class':'boxDep'})
        camera=divs[0].find('h3').text
        ikamera=link[-1]
        judet=divs[0].find('a').text
        birth_ro=soup.find('div',{'class':'profile-pic-dep'}).text.replace('\n','').replace('n.','').strip()
        start=''
        end=''
        if 'data valid' in repr(divs[0]):
            start=repr(divs[0])[repr(divs[0]).find('data valid')+14:]
            start=start[:start.find('<')]
            start=start.replace(':','').replace('-','').strip()
        if 'mandatului' in repr(divs[0]):
            end=repr(divs[0])[repr(divs[0]).find('mandatului')+10:]
            end=end[:end.find('<')]
            end=end.replace(':','').replace('-','').strip()
        comisii=[]
        comisii_abbr=[]
        parties=[]
        parties_abbr=[]
        gparties=[]
        gparties_abbr=[]
        state=[]
        state_abbr=[]
        state_img=[]
        activitate=[]
        def divformat(div):
            #return div.text.lower().replace('subcomisia ','#SUB#comisia ').replace('comisiei ','comisia ').split('comisia ')[1:]
            return div.text.lower().replace('subcomisia ','comisia ').replace('comisiei ','comisia ').split('comisia ')[1:]
        
        for div in divs:
            header=div.find('h3').text
            if 'omisii permanente' in header:
                comisii+=['Comisia '+j.replace('\n','').strip() for j in divformat(div)]
                comisii_abbr+=[j.text for j in div.findAll('a')]
            if 'Biroului Permanent' in header:
                comisii+=['#BP#'+i.text for i in divs[1].findAll('tr')]
                comisii_abbr+=['#BP#'+ikamera for i in divs[1].findAll('tr')]
            if 'omisii special' in header:
                comisii+=['#SPEC#'+'Comisia '+j.replace('\n','').strip() for j in divformat(div)]
                comisii_abbr+=['#SPEC#'+j.text for j in div.findAll('a')]
            if 'omisii de anch' in header:
                comisii+=['#ANCH#'+'Comisia '+j.replace('\n','').strip() for j in divformat(div)]
                comisii_abbr+=['#ANCH#'+j.text for j in div.findAll('a')]
            if 'lte comisii' in header:
                comisii+=['Comisia '+j.replace('\n','').strip() for j in divformat(div)]
                comisii_abbr+=[j.text for j in div.findAll('a')]
            if 'iunea politic' in header:
                parties=[j for j in div.findAll('tr',{'valign':'center'}) if j.findAll('table')]
            if 'upul parlamentar:' in header:
                if div.find('table'):
                    gparties=['#GRUP#Grupul parlamentar'+j.replace('\n','').strip() 
                              for j in div.find('table').text.replace('Senator','Grupul parlamentar')\
                              .replace('Deputa','Grupul parlamentar').replace('Grupul Parlamentar','Grupul parlamentar')\
                              .split('Grupul parlamentar')][1:]
            if 'altor state' in header:
                states=div.findAll('tr')
                state+=[state_format(states[j].text) for j in range(len(states))]    
                state_abbr+=['' if states[j].find('a')==None else states[j].find('a').text for j in range(len(states))]
                state_img+=['' if states[j].find('img')==None else states[j].find('img')['src'] for j in range(len(states))]
            if 'mentare interna' in header:
                states=div.findAll('tr')
                state+=['#INTER#'+state_format(states[j].text) for j in range(len(states))]    
                state_abbr+=['' if states[j].find('a')==None else '#INTER#'+states[j].find('a').text for j in range(len(states))]
                state_img+=['' if states[j].find('img')==None else '#INTER#'+states[j].find('img')['src'] for j in range(len(states))]
            if 'lte grupuri' in header:
                states=div.findAll('tr')
                state+=[state_format(states[j].text) for j in range(len(states))]    
                state_abbr+=['' if states[j].find('a')==None else states[j].find('a').text for j in range(len(states))]
                state_img+=['' if states[j].find('img')==None else states[j].find('img')['src'] for j in range(len(states))]
            if 'cifre' in header:
                activitate=[j.text.split(':') for j in div.findAll('tr') if j.text.split(':')!=['']]        
        
        party=[parties[j].find('table').text for j in range(len(parties))]
        party_abbr=['' if parties[j].find('table').find('a')==None else parties[j].find('table').find('a').text for j in range(len(parties))]
        party_img=['' if parties[j].find('img')==None else parties[j].find('img')['src'] for j in range(len(parties))]
        party+=gparties
        party_abbr+=['' for i in range(len(gparties))]
        party_img+=['' for i in range(len(gparties))]
        
        for i in range(len(state_img)):
            s=state_img[i]
            t=state_abbr[i]
            if t not in state_imgs:state_imgs[t]=s
        for i in range(len(party_img)):
            s=party_img[i]
            a=party_abbr[i]
            t=party[i]
            if a not in party_imgs:party_imgs[a]={'name':t,'img':s}
        members.append({'name':name,'birth_ro':birth_ro,'idm':idm,'link':url,'leg':leg,'start':start,
                        'img':img,'pretty_name':pretty_name,'camera':camera,'judet':judet,'end':end,
                       'party_abbr':party_abbr,'party':party,'state':state,'state_abbr':state_abbr,
                        'activitate':activitate,'comisii':comisii,'comisii_abbr':comisii_abbr})
        parsed_links.add(link)
0.0 %
2.3261223540358222 %
4.6522447080716445 %
6.978367062107467 %
9.304489416143289 %
11.630611770179112 %
13.956734124214934 %
16.282856478250757 %
18.608978832286578 %
20.9351011863224 %
23.261223540358223 %
25.58734589439404 %
27.91346824842987 %
30.23959060246569 %
32.565712956501514 %
34.89183531053733 %
37.217957664573156 %
39.54408001860898 %
41.8702023726448 %
44.19632472668062 %
46.52244708071645 %
48.84856943475227 %
51.17469178878808 %
53.50081414282391 %
55.82693649685974 %
58.153058850895555 %
60.47918120493138 %
62.80530355896721 %
65.13142591300303 %
67.45754826703885 %
69.78367062107466 %
72.10979297511048 %
74.43591532914631 %
76.76203768318214 %
79.08816003721796 %
81.41428239125378 %
83.7404047452896 %
86.06652709932543 %
88.39264945336124 %
90.71877180739708 %
93.0448941614329 %
95.37101651546871 %
97.69713886950454 %
In [137]:
len(members), len(links)
Out[137]:
(4299, 4299)
In [138]:
open('data/members.json','w').write(json.dumps({'members':members,
                                                'party_imgs':party_imgs,
                                                'state_imgs':state_imgs}))
Out[138]:
6779963
In [139]:
members=json.loads(open('data/members.json','r').read())['members']
party_imgs=json.loads(open('data/members.json','r').read())['party_imgs']
state_imgs=json.loads(open('data/members.json','r').read())['state_imgs']
In [140]:
save_obj(pages,'pages')

Clean

In [141]:
legs={'2016':['2016-12-21','2019-07-01'],
'2012':['2012-12-20','2016-12-20'],
'2008':['2008-12-19','2012-12-19'],
'2004':['2004-12-19','2008-12-13'],
'2000':['2000-12-15','2004-11-30'],
'1996':['1996-11-27','2000-11-30'],
'1992':['1992-10-28','1996-11-22'],
'1990':['1990-06-19','1992-10-16']}

Manual fix for tihs Monitorul official:

In [142]:
ro_months={'ian.':'Jan',
          ' ia ':' Jan ',
          'feb.':'Feb',
          'mar.':'Mar',
          'apr.':'Apr',
          'mai':'May',
          'iun.':'Jun',
          ' iu ':' Jun ',
          'iul.':'Jul',
          'aug.':'Aug',
          'sep.':'Sep',
          'oct.':'Oct',
          'noi.':'Nov',
          'dec.':'Dec',
          'ianuarie':'Jan',
          'februarie':'Feb',
          'martie':'Mar',
          'aprilie':'Apr',
          'mai':'May',
          'iunie':'Jun',
          'iulie':'Jul',
          'august':'Aug',
          'septembrie':'Sep',
          'octombrie':'Oct',
          'noiembrie':'Nov',
          'decembrie':'Dec',}
def date_ro(d,s=True):
    if 'prezent' in d:
        t=pd.to_datetime(legs['2016'][1])
    else:
        ds=d.replace(')','').replace('(','').split(' ')
        x=ds[1].replace('0201','2001').replace('0092','1992').replace('0213','2013')
        t=pd.to_datetime(ro_months[ds[0].strip()]+' '+x)
    if s: return str(t)[:10]
    else: return t
def date_ro2(d,s=False):
    for i in range(20):
        d=d.replace('  ',' ')
    ds=d.split(' ')
    t=pd.to_datetime(ds[0]+' '+ro_months[ds[1]]+' '+ds[2])
    s=str(t)[:10]
    if s=='2004-02-17': s='2004-12-17'
    return s
In [143]:
decess={'Ratiu Ion1917-2000':'6 Jun 1917',
      'Popovici Dan Ion Cristian1946-1996':'26 Dec 1946',
      'Palfi Mozes Zoltan1943-2011':'1 Jan 1943',
      'Blaga Ionel1929-1994':'17 Mar 1929',
      'Daraban Aurel1939-2004':'27 Sep 1939',
      'Croitoru Mircea-Adrian1941-1999':'1 Jan 1941',
      'Budeanu Radu1943-1997':'1 Jan 1943',
      'Coposu Corneliu1914-1995':'20 May 1914',
      'Ignat Miron1941-2018':'24 Aug 1941',
      'Nastase Toma1932-1997':'1 Jan 1932',
      'Musat Mircea1930-1994':'1 Jan 1930',
      'Stoica Stefan1976-2014':'1 Jan 1976',
      'Grama Mihail1924-1999':'1 Jan 1924',
      'Bot Octavian1951-2015':'1 Jan 1951', #real 1 Jan
      'Iorgovan Antonie1948-2007':'9 Aug 1948',
      'Dinescu Valentin1955-2008':'25 Dec 1955',
      'Babias Iohan-Peter1952-2002':'28 Jun 1952',
      'Munteanu Mircea Mihai1933-1998':'26 May 1933',
      'Timis Ioan1951-2010':'17 Sep 1951',
      'Barbu Eugen1924-1993':'1 Jan 1924',
      'Tcaciuc Stefan1936-2005':'13 Jan 1936',
      'Mircovici Niculae1950-2016':'1 Oct 1950',
      'Rusu Horia Mircea1952-2001':'18 Sep 1952',
      'Racoceanu Viorel1962-2006':'8 Jun 1962',
      'Andrei Zeno1935-2001':'1 Jan 1935',
      'Surdu-Soreanu Raul-Victor1947-2011':'11 Jul 1947',
      'Dan Iosif1950-2007':'14 Oct 1950',
      'Dutu Ion1942-2000':'7 Oct 1942',
      'Bindea Liviu-Doru1957-2006':'26 Jul 1957',
      'Verestoy Attila1954-2018':'1 Mar 1954',
      'Dragomir Nelu Aristide1957-1995':'13 Oct 1957',
      'Micle Ulpiu-Radu-Sabin1935-2000':'1 Jan 1935',
       'Cojocariu Emil1938-1994':'2 Dec 1938',
       'Policrat Rene-Radu1910-1993':'12 Aug 1910',
       'Serban Gheorghe1954-1998':'25 Jun 1954',
       'Vladoiu Aurel1948-2015':'27 Jan 1948',
       'Preda Ion1947-2007':'1 Jan 1947',
       'Coste Marina-Adelina1965-2017':'30 Nov 1965',
       'Sincai Ovidiu1949-1999':'14 Dec 1949',
       'Grosaru Mircea1952-2014':'30 Jun 1952',
       'Florescu Nicolae-Doru1960-2001':'1 jan 1960',
       'Sinko Stefan1939-1995':'1 Jan 1939',
       'Alecsandrescu Nicolae1923-1993':'1 Jan 1923',
       'Ratoi Neculai1939-2016':'15 Mar 1939',
       'Ichim Mircea-Adrian1944-1993':'1 Jan 1944',
       'Fotopolos Sotiris1937-2008':'6 Dec 1937',
       'Ciobanu Gheorghe1964-2015':'22 Sep 1964',
       'Dumitrescu Liana1973-2011':'20 Jan 1973',
       'Dida Corneliu Ioan1942-2008':'26 May 1942',
       'Draghici Sonia-Maria1956-2016':'25 Jul 1956'}
In [144]:
state_abbs={'membru':'','\xa0':'','supleant':''}
In [145]:
def replace_all(text, dic):
    for i in dic:
        text = text.replace(i, dic[i])
    return text
In [146]:
def president(s):
    abb=''
    #reset pres
    if 'Vicepreşedinte' in s:
        s=s.replace('Vicepreşedinte','')
        abb='#VP#'
    elif 'vicepreşedinte' in s:
        s=s.replace('vicepreşedinte','')
        abb='#VP#'
    elif 'preşedinte' in s:
        s=s.replace('preşedinte','')
        abb='#PRES#'
    elif 'Preşedinte' in s:
        s=s.replace('Preşedinte','')
        abb='#PRES#'
    elif 'Secretar' in s:
        s=s.replace('Secretar','')
        abb='#SECR#'
    elif 'secretar' in s:
        s=s.replace('secretar','')
        abb='#SECR#'
    elif 'Trezorier' in s:
        s=s.replace('Trezorier','')
        abb='#TREZ#'
    elif 'trezorier' in s:
        s=s.replace('trezorier','')
        abb='#TREZ#'
    elif 'chestor' in s:
        s=s.replace('chestor','')
        abb='#CHES#'
    elif 'Chestor' in s:
        s=s.replace('Chestor','')
        abb='#CHES#'
    elif 'Şeful' in s:
        s=s.replace('Şeful','')
        abb='#SEF#'
    
    return s.strip(),abb
In [147]:
gpresident={'Vicelider':'$#VP#%',
'vicelider':'$#VP#%',
'Secretar':'$#SECR#%',
'secretar':'$#SECR#%',
'Lider':'$#PRES#%',
'Purtător de cuvânt':'$#PRCV#%',
'lider':'$#PRES#%'}
In [148]:
gpresident2={'Vicepreşedinte':'$#VP#%',
'vicepreşedinte':'$#VP#%',
'preşedinte':'$#PRES#%',
'Preşedinte':'$#PRES#%',
'Secretar':'$#SECR#%',
'secretar':'$#SECR#%'}        
In [149]:
nmembers={}
nparty_imgs={}
nstate_imgs={}
party_set=set()
country_set=set()
comisii_set=set()
deaths={}
for i in members[:]:
    if '-' not in i['birth_ro']:
        birth=str(pd.to_datetime(replace_all(i['birth_ro'],ro_months)))
        death=''
    else:
        birth=str(pd.to_datetime(replace_all(i['name']+i['birth_ro'],decess)))
        death=i['birth_ro'].split('-')[1].strip()
    if len(birth)>3:
        name=i['name']+' | '+birth[:10]
    else:
        name=i['name']+' | Ismeretlen'
    if death:
        if name not in deaths:
            deaths[name]=death
    since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
    until=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
    if i['start']: since=date_ro2(i['start'],True)
    if i['end']: until=date_ro2(i['end'])
    pretty_name,pres=president(i['pretty_name'])
    if name not in nmembers:
        nmembers[name]={}
    if 'Name' not in nmembers[name]:
        nmembers[name]['Name']={'full':i['pretty_name'],'simple':name,'short':i['name']}
    if 'Photo' not in nmembers[name]:
        nmembers[name]['Photo']=[]
    nmembers[name]['Photo'].append(base_url+i['img'])
    if 'UserID' not in nmembers[name]:
        nmembers[name]['UserID']=[]
    nmembers[name]['UserID'].append(i['idm'])
    if 'Camera' not in nmembers[name]:
        nmembers[name]['Camera']=[]
    dummy,pres=president(i['camera'])
    if not pres:
        dummy,pres=president(i['pretty_name'])
    nmembers[name]['Camera'].append(pres+i['camera'])
    if 'Starts' not in nmembers[name]:
        nmembers[name]['Starts']=[]
    nmembers[name]['Starts'].append(since)
    if 'Ends' not in nmembers[name]:
        nmembers[name]['Ends']=[]
    nmembers[name]['Ends'].append(until)
    if 'Link' not in nmembers[name]:
        nmembers[name]['Link']=[]
    nmembers[name]['Link'].append(i['link'])

    if 'Parties' not in nmembers[name]:
        nmembers[name]['Parties']=[]
    parties=i['party']
    parties_abbr=i['party_abbr']
    pi=0
    while pi<(len(parties)):
        p=parties[pi].strip()
        p,pres=president(p)
        #reset since
        since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
        until=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
        if i['start']: since=date_ro2(i['start'],True)
        if i['end']: until=date_ro2(i['end'])
        #process
        if '#GRUP#' not in p:
            if '-' not in p:
                if p!='independent':
                    abbr='Minorități'
                    party=p
                else:
                    abbr='Independent'
                    party='Independent'
            else:
                abbr=parties_abbr[pi]
                party=p.split('-')[1].strip()
                if ('din ') in party:
                    if ('din R') not in party:
                        if ('din B') not in party:
                            since=max(since,date_ro(party[party.find('din ')+4:].strip(),True))
                            party='Independent'
                #else:
                if True: #keep, for cases with both 'din' and 'pana'
                    if len(p.split('-'))>2:
                        s=''.join(p.split('-')[2:]).strip()
                        if ('din ') in s:
                            if ('din R') not in s:
                                since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
                        if ('până în ') in s:
                            until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
            if abbr in ['independent','','Neafiliaţi']:
                abbr='Independent'
            nmembers[name]['Parties'].append({'party':pres+abbr,
                    'start':since,'end':until,'judet':i['judet']})
            party_set.add(abbr)
        else:
            p=parties[pi].strip()
            if 'se transf' not in p:
                abbr=p.split('\xa0')[0]
                p=replace_all(p,gpresident)
                for pk in p.split('$'):
                    kabbr=pk.replace('%',abbr).replace('  ',' ').replace('  ',' ').replace('  ',' ').strip()
                    pabbr=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
                    s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
                    if ('din ') in s:
                        since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
                    if ('până în ') in s:
                        until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
                    nmembers[name]['Parties'].append({'party':pabbr,
                            'start':since,'end':until,'judet':i['judet']})
                    party_set.add(pabbr)
            else:
                p0=parties[min(pi+2,len(parties)-1)].strip()
                p1=parties[pi+1].strip()
                abbr0=p0.split('\xa0')[0]
                abbr1=p1.split(' în ')[0]
                since2=date_ro(p1.split(' în ')[1].replace('-','. ').replace('fost',''))
                p0=replace_all(p0,gpresident)
                for pk in p0.split('$'):
                    kabbr0=pk.replace('%',abbr0).replace('  ',' ').replace('  ',' ').replace('  ',' ').strip()
                    pabbr0=kabbr0[:kabbr0.find(abbr0)+len(abbr0)].strip().split('(')[0].strip()
                    s=kabbr0[kabbr0.find(abbr0)+len(abbr0):].strip()
                    if ('din ') in s:
                        since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
                    if ('până în ') in s:
                        until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
                    nmembers[name]['Parties'].append({'party':pabbr0,
                            'start':since,'end':since2,'judet':i['judet']})
                    nmembers[name]['Parties'].append({'party':abbr1,
                            'start':since2,'end':until,'judet':i['judet']})
                    party_set.add(pabbr0)
                    party_set.add(abbr1)
                pi+=2
        pi+=1

    if 'Countries' not in nmembers[name]:
        nmembers[name]['Countries']=[]
    states=i['state']
    states_abbr=i['state_abbr']
    for pi in range(len(states)):
        p=states[pi].strip()
        p,pres=president(p)
        cont=True
        if '#INTER#' in p:
            cont=False
            if 'Bucureşti - Chişinău' in p:cont=True
            if 'elega' in p:cont=True
            if 'Europol' in p:cont=True
        if cont:
            abb_state=state_format(states_abbr[pi])
            #reset since
            since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
            until=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
            if i['start']: since=date_ro2(i['start'],True)
            if i['end']: until=date_ro2(i['end'])
            #process
            if '-' in p:
                state=p.split('-')[0]
                s=replace_all(''.join(p.split('-')[1:]).strip(),state_abbs).strip()
                if ('din ') in s:
                    if ('din R') not in s:
                        since=max(since,date_ro(s[s.find('din ')+4:].strip(),True))
                if ('până în ') in s:
                    until=min(until,date_ro(s[s.find('până în ')+8:].strip(),True))
            nmembers[name]['Countries'].append({'country':pres+abb_state,'start':since,'end':until})
            country_set.add(abb_state)

    if 'Groups' not in nmembers[name]:
        nmembers[name]['Groups']=[]
    comisii=i['comisii']
    comisii_abbr=i['comisii_abbr']
    for pi in range(len(comisii_abbr)):
        p=comisii[pi].strip()
        abbr=comisii_abbr[pi].strip()
        if '#BP#' in p:
            p=replace_all(p,state_abbs)
            p,pres=president(p)
            if '#BP#Atribu' not in p:
                #reset since
                since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
                until=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
                if i['start']: since=date_ro2(i['start'],True)
                if i['end']: until=date_ro2(i['end'])
                if '-' in p:
                    s0=p.replace('#BP#','').replace('în sesiunea parlamentară:','').replace('atributii','')
                    s=s0.split('-')
                    if len(s[0])<6:
                        s[0]=s[0]+s[1].strip()[-4:]
                    since=max(since,date_ro(s[0].strip(),True))
                    until=min(until,date_ro(s[1].strip(),True))
                    if ('din ') in s0:
                        if ('din sumele') not in s:
                            since=max(since,date_ro(s0[s0.find('din ')+4:].strip(),True))
                    if ('până în ') in s:
                        until=min(until,date_ro(s0[s0.find('până în ')+8:].strip(),True))
                nmembers[name]['Groups'].append({'group':pres+abbr,
                        'start':since,'end':until})
                comisii_set.add(abbr)                
        else:
            #process
            p=replace_all(p,gpresident2)
            if '$' not in p:
                since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
                until=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
                if i['start']: since=date_ro2(i['start'],True)
                if i['end']: until=date_ro2(i['end'])
                kabbr=p.replace('  ',' ').replace('  ',' ').replace('  ',' ').strip()
                pabbr=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
                s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
                if ('din ') in s:
                    if ('din sumele') not in s:
                        since=max(since,date_ro(s[s.find('din ')+4:s.find('din ')+4+9].replace(')','').strip(),True))
                if ('până în ') in s:
                    until=min(until,date_ro(s[s.find('până în ')+8:s.find('până în ')+8+9].replace(')','').strip(),True))
                nmembers[name]['Groups'].append({'group':pabbr,
                        'start':since,'end':until})
                comisii_set.add(pabbr)                
            else:
                pk=p.split('$')[0]
                since0=str(pd.to_datetime(legs[i['leg']][0]))[:10]
                until0=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
                if i['start']: since0=date_ro2(i['start'],True)
                if i['end']: until0=date_ro2(i['end'])
                kabbr=pk.replace('%',abbr).replace('  ',' ').replace('  ',' ').replace('  ',' ').strip()
                pabbr0=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
                s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
                if ('din ') in s:
                    if ('din sumele') not in s:
                        since0=max(since0,date_ro(s[s.find('din ')+4:s.find('din ')+4+9].replace(')','').strip(),True))
                if ('până în ') in s:
                    until0=min(until0,date_ro(s[s.find('până în ')+8:s.find('până în ')+8+9].replace(')','').strip(),True))
                if ('-') in s:
                    if len(s.split('-')[-1])>4:
                        s0=s.split('-')[0].strip()
                        s1=s.split('-')[1].strip()
                        if len(s0)<6:
                            s0=s0+' '+s1.split(' ')[1].strip()
                        since0=max(since0,date_ro(s0,True))
                        until0=min(until0,date_ro(s1,True))

                pk=p.split('$')[1]
                since1=str(pd.to_datetime(legs[i['leg']][0]))[:10]
                until1=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
                if i['start']: since1=date_ro2(i['start'],True)
                if i['end']: until1=date_ro2(i['end'])
                kabbr=pk.replace('%',abbr).replace('  ',' ').replace('  ',' ').replace('  ',' ').strip()
                pabbr1=kabbr[:kabbr.find(abbr)+len(abbr)].strip()
                s=kabbr[kabbr.find(abbr)+len(abbr):].strip()
                if ('din ') in s:
                    if ('din sumele') not in s:
                        since1=max(since1,date_ro(s[s.find('din ')+4:s.find('din ')+4+9].replace(')','').strip(),True))
                if ('până în ') in s:
                    until1=min(until1,date_ro(s[s.find('până în ')+8:s.find('până în ')+8+9].replace(')','').strip(),True))
                if ('-') in s:
                    if len(s.split('-')[-1])>4:
                        s0=s.split('-')[0].strip()
                        s1=s.split('-')[1].strip()
                        if len(s0)<6:
                            s0=s0+' '+s1.split(' ')[1].strip()
                        since1=max(since1,date_ro(s0,True))
                        until1=min(until1,date_ro(s1,True))
                since1=max(since0,since1)
                until1=min(until0,until1)

                if since0==since1 and until0==until1:
                    nmembers[name]['Groups'].append({'group':pabbr1,
                            'start':since1,'end':until1})
                    comisii_set.add(pabbr1)
                elif since0<since1 and until0>until1:
                    nmembers[name]['Groups'].append({'group':pabbr0,
                            'start':since0,'end':since1})
                    comisii_set.add(pabbr0)
                    nmembers[name]['Groups'].append({'group':pabbr1,
                            'start':since1,'end':until1})
                    comisii_set.add(pabbr1)
                    nmembers[name]['Groups'].append({'group':pabbr0,
                            'start':until1,'end':until0})
                    comisii_set.add(pabbr0)
                elif until1<until0:
                    nmembers[name]['Groups'].append({'group':pabbr1,
                            'start':since0,'end':until1})
                    comisii_set.add(pabbr1)
                    nmembers[name]['Groups'].append({'group':pabbr0,
                            'start':until1,'end':until0})
                    comisii_set.add(pabbr0)
                elif since1>since0:
                    nmembers[name]['Groups'].append({'group':pabbr0,
                            'start':since0,'end':since1})
                    comisii_set.add(pabbr0)
                    nmembers[name]['Groups'].append({'group':pabbr1,
                            'start':since1,'end':until0})
                    comisii_set.add(pabbr1)

    #reset since
    since=str(pd.to_datetime(legs[i['leg']][0]))[:10]
    until=str(pd.to_datetime(legs[i['leg']][1]))[:10] 
    if i['start']: since=date_ro2(i['start'],True)
    if i['end']: until=date_ro2(i['end'])
    #process
    if 'Activity' not in nmembers[name]:
        nmembers[name]['Activity']={}
    activitate=i['activitate']
    for a in activitate:
        if len(a)>1:
            an=a[0]
            if an not in ['Membru în']:
                av=int(a[1].split('(')[0].strip().split(',')[0].strip().split('-')[0].strip())
                if an not in nmembers[name]['Activity']:nmembers[name]['Activity'][an]=[]
                nmembers[name]['Activity'][an].append({'value':av,'start':since,'end':until})        
            if an=='Luari de cuvânt':
                if 'în ' in a[1]:
                    av=int(a[1].split('(în ')[-1].strip().split('sedinte)')[0].strip())
                    nmembers[name]['Activity']['Sedințe']={'value':av,'start':since,'end':until}
In [150]:
# nmembers['Popescu-Tariceanu Calin Constantin Anton | 1952-01-14']
In [151]:
# nmembers['Stanisoara Mihai | 1962-06-11']
In [152]:
#nmembers['Verestoy Attila | 1954-03-01']['Parties']
In [153]:
#nmembers['Turcan Raluca | 1976-04-02']['Parties']
In [154]:
len(nmembers),len(members),len(deaths)
Out[154]:
(2818, 4299, 50)
In [155]:
open('data/nmembers.json','w').write(json.dumps(nmembers))
Out[155]:
6860931
In [156]:
open('data/deaths.json','w').write(json.dumps(deaths))
Out[156]:
2021

No need to re-run

In [38]:
def party_cleaner(i,p):
    if i==p: return p
    p=p[len(i)+1:].split('-')[0].strip()
    return p
In [39]:
#! run once
for i in party_set.difference(set(party_imgs.keys())):
    party_imgs[i]={'name': i, 'img': ''}
for i in set(party_imgs.keys()).difference(party_set):
    party_imgs.pop(i)
for i in party_imgs:
    party_imgs[i]['name']=party_cleaner(i,party_imgs[i]['name'])
In [40]:
nstate_imgs={}
In [41]:
for i in state_imgs:
    nstate_imgs[state_format(i)]=state_imgs[i]
nstate_imgs['']=''
In [42]:
#img=json.loads(open('data/img.json','r').read())
img={'party':party_imgs,'state':nstate_imgs}
In [43]:
#manual
img['party']['PP-DD']['name']='Partidul Poporului Dan Diaconescu'
img['party']['PUR-SL']['name']='Partidul Umanist din România'
img['party']['RMDSZ']=img['party']['UDMR']
img['party']['RMDSZ']['name']='Romániai Magyar Demokrata Szövetség'
In [44]:
open('data/img.json','w').write(json.dumps(img))
Out[44]:
48161
In [ ]: