Find interesting patterns/trends through analysis & comparison of the world bank loan commitments, HDI,Freedom Index and GDP
To analyze what insight open data can give us as to how effective initiatives and funding actually is as opposed to what it’s meant to be.
An economics and development think tank based in Washington, D.C.; they analyze and keep track of economic freedom around the world with the influential Index of Economic Freedom.
from pandas import DataFrame, Series
import pandas as pd
import os
import codecs
# Verify existence of & read in the datasets: Projects & Operations and Freedom Index.
DATA_FILES={"projdict":"data/projects_operations_api.csv", "fredict":"data/FreedomIndex.csv"}

def file_path(key):
    # Data files live in the parent directory relative to this notebook.
    return os.path.join(os.pardir, DATA_FILES[key])

# Sanity check: print each resolved path and whether it exists on disk.
for file_key in DATA_FILES.keys():
    abs_fname = file_path(file_key)
    print abs_fname, os.path.exists(abs_fname)

# The World Bank CSV is Latin-1 encoded, so open it with an explicit codec
# instead of letting read_csv guess.  NOTE(review): handle is never closed.
f = codecs.open(file_path("projdict"), encoding='iso-8859-1')
initial_proj_df = pd.read_csv(f)
initial_proj_df.columns
In this section, we explore the World Bank's commitments to Africa by looking at the total amount loaned to different African countries over the years.
# Boolean mask selecting projects in the AFRICA region.
is_africa = initial_proj_df['regionname']=='AFRICA'
initial_proj_df[is_africa]['countryname'][:5]
initial_proj_df[is_africa][['countryname', 'totalamt']][:5]
# The totalamt value is not properly formatted: strip the ';' delimiters
# so the column can be parsed as a number.
initial_proj_df['totalamt'] = initial_proj_df['totalamt'].str.replace(';','')
initial_proj_df[is_africa]['totalamt'][:5]
initial_proj_df['totalamt'] = initial_proj_df['totalamt'].astype('float32')
# Quick sanity check: sum of the first five African commitments.
sum(initial_proj_df[is_africa]['totalamt'][:5])
In the next steps, we clean up the data. For example, the amounts in the Projects & Operations dataset contain semicolon delimiters, so those need to be stripped out and the values parsed as floats.
initial_proj_df[['regionname','countryname','projectstatusdisplay','totalamt']][:2]
# Data cleaning: the monetary columns use ';' as the thousands delimiter
# (the prose above says comma, but the raw data actually uses semicolons).
# Strip the delimiter, then parse each column as float32.
initial_proj_df['lendprojectcost'] = initial_proj_df['lendprojectcost'].str.replace(';','')
initial_proj_df['lendprojectcost'] = initial_proj_df['lendprojectcost'].astype('float32')
initial_proj_df['ibrdcommamt'] = initial_proj_df['ibrdcommamt'].str.replace(';','')
initial_proj_df['ibrdcommamt'] = initial_proj_df['ibrdcommamt'].astype('float32')
initial_proj_df['idacommamt'] = initial_proj_df['idacommamt'].str.replace(';','')
initial_proj_df['idacommamt'] = initial_proj_df['idacommamt'].astype('float32')
initial_proj_df['grantamt'] = initial_proj_df['grantamt'].str.replace(';','')
initial_proj_df['grantamt'] = initial_proj_df['grantamt'].astype('float32')
initial_proj_df[is_africa][['countryname','project_name','boardapprovaldate','status','lendprojectcost','grantamt']][:10]
# Work on a copy and drop the columns not needed for this analysis.
projcp_df = initial_proj_df.copy()
projcp_df = projcp_df.drop(['lendinginstrtype','envassesmentcategorycode','productlinetype','closingdate','url','sector2','sector3','sector4','sector5','sector','mjsector1','mjsector2','mjsector3','mjsector4','mjsector5','mjsector','theme1','theme2','theme3','theme4','theme5','financier','mjtheme2name','mjtheme3name','mjtheme4name','mjtheme5name'],axis=1)
del projcp_df['projectstatusdisplay']
projcp_df2 = projcp_df.drop(['prodline','supplementprojectflg','goal','mjtheme1name','location'], axis=1)
projcp_df2.columns
projcp_df2[is_africa][:5]
# Group the trimmed frame by region for aggregate statistics.
grouped = projcp_df2.groupby('regionname')
In this section we look further into the Projects and Operations dataset and try to find any interesting or surprising facts to analyze further. We also compute some basic statistics on the data, such as the sum, mean, and standard deviation.
# Aggregate the World Bank's total committed amount for one group
# (a country or a regional operating body).
def func(x):
    return Series({'totalamt': x['totalamt'].sum()})
# Total amount lent per region (one-column DataFrame).
result = grouped.apply(func)
# Create a new 'year' column from the board approval date
# (an ISO-style string, so its first 4 characters are the year).
projcp_df2['year'] = projcp_df2['boardapprovaldate'].str[:4]
projcp_df2['year'][:2]
# group data by year and region name
grouped3 = projcp_df2.groupby(['regionname','year'])
# statistics on the bank's lending commitments to different regions over time
grouped3['totalamt'].describe()
grouped4 = projcp_df2.groupby(['regionname','year','board_approval_month'])
result4 = grouped4.apply(func)
result4.unstack('regionname')[:5]
# Year-by-region totals; missing (region, year) combinations become 0.
result5 = grouped3.apply(func).unstack('regionname').fillna(0)
result5[:5]
# python-us-cpi parses the latest US Consumer Price Index and provides an
# inflation-calculator API.  We use it to express loan commitments from
# other years in today's dollars for better comparison.
from uscpi import UsCpi
cpi = UsCpi() # downloads the latest CPI data
# $100 in 2012 is worth how much in 1980?
cpi.value_with_inflation(100, 2012, 1980)
# Keep only the columns needed for the inflation-adjusted analysis.
projcpi = projcp_df2[['regionname','countryname','project_name','totalamt','grantamt','sector1','year']].copy()
# Convert one row's 'totalamt' into 2013 dollars via the CPI API.
# CPI data only covers 1914-2013, so years outside that range pass
# through unadjusted.
def fun2(y):
    amount = y['totalamt']
    year = int(y['year'])
    if 1914 <= year <= 2013:
        amount = cpi.value_with_inflation(amount, year, 2013)
    fields = ['regionname', 'countryname', 'project_name',
              'totalamt', 'grantamt', 'sector1', 'year']
    values = [y['regionname'], y['countryname'], y['project_name'],
              amount, y['grantamt'], y['sector1'], year]
    return Series(values, index=fields)
# Apply the inflation adjustment row-by-row (NaNs filled with 0 first so
# int(y['year']) cannot fail on missing years).
resultcpi = projcpi.fillna(0).apply(fun2, axis=1)
# Data cleaning: keep only the sector name before the '!' marker and only
# the country name before the ';' suffix.
resultcpi['sectorMain'] = resultcpi['sector1'].str.split("!").str[0]
resultcpi['country'] = resultcpi['countryname'].str.split(";").str[0]
# validate that the data is cleaned.
resultcpi[:4]
resultcpi['year'] = resultcpi['year'].astype(int)
# Restrict the analysis to projects funded from 2000 to 2013.
is_bv = (resultcpi['year'] >= 2000) & (resultcpi['year'] <= 2013)
resultcpi2 = resultcpi[is_bv]
# verify that data is formatted in the way we want to analyze it.
resultcpi2[:4]
# Total bank commitments to Africa per year, 2000-2013, in 2013 dollars.
ggroup_africa = resultcpi2[resultcpi2['regionname']=='AFRICA'].groupby('year').apply(func)
ggroup_africa.plot(kind='bar', title='Bank lending commitments to Africa in year 2000 - 2013'); plt.tight_layout()
# Total bank commitments per region per year, in 2013 dollars.
amtByRegion = resultcpi2.groupby(['regionname','year']).apply(func).unstack('regionname')
amtByRegion[:2]
amtByRegion.plot(kind='bar',figsize=(16,8), title='Lendig commitments by the Bank from 1947 - 2013'); plt.legend(loc='best')
# Count the number of World Bank projects from 2000 - 2013 per country.
# NOTE(review): Series.order() is the pre-0.17 pandas API (now sort_values).
numOfproj_by_country = resultcpi2.groupby('country').size().order(na_last=True, ascending=False, kind='mergesort')
numOfproj_by_country[:5]
The top 3 borrowers from the world bank are part of the BRICS. We are interested in analyzing patterns of borrowing between the BRIC Nations, their freedom index, Human Development Index and GDP. The next steps analyze the lending of the world bank to these nations.
# From above, the top funded nations are BRICS members; this list filters
# them out (using World Bank naming) for further analysis.
listBRICS = ['Federative Republic of Brazil','Russian Federation','Republic of India','People\'s Republic of China','Republic of South Africa']
# Number of Bank-funded projects per BRICS country per year.
brics_nations = resultcpi2[resultcpi2['country'].isin(listBRICS)].groupby(['country','year']).size()
# One bar subplot per country, project counts 2000 - 2013.
brics_nations.unstack('country').fillna(0).plot(subplots=True, figsize=(8, 8),kind='bar'); plt.legend(loc='best');plt.tight_layout()
# Project counts per (country, sector), largest first.
# NOTE(review): resultcpi2 was built without a 'sector3' column (only
# sector1/sectorMain survive the projection at the top of this section) --
# confirm this groupby actually runs.
df_of_BRICS = resultcpi2[resultcpi2['country'].isin(listBRICS)].groupby(['country','sector3']).size().order(na_last=True, ascending=False, kind='mergesort')
df_of_BRICS.unstack('country').fillna(0)
# Import the Freedom Index CSV (Latin-1 encoded) for comparison analysis.
f = codecs.open(file_path("fredict"), encoding='iso-8859-1')
free_df = pd.read_csv(f)
free_df[:2]
# As with the projects data, restrict to index years from 2000 onwards.
free_df2 = free_df[free_df['index year']>=2000].copy()
free_df2.columns
# Extract the BRICS (the Freedom Index uses short country names).
free_df2 = free_df2[free_df2['name'].isin(['China', 'India', 'Russia', 'Brazil', 'South Africa'])]
free_df2[:5]
free_df3 = free_df2[['name','index year','overall score']].copy()
free_df3[:2]
free_df3['overall score'] = free_df3['overall score'].astype(float)
# NOTE(review): pivot_table(rows=..., cols=...) is the pre-0.14 pandas API
# (now index=/columns=).
free_df3.pivot_table(['overall score'], rows=['index year'], cols='name').plot(kind='line', title='freedom Index per BRICS country', figsize=(10,10))
free_df3.pivot_table(['overall score'], rows=['index year'], cols='name').plot(subplots=True, figsize=(8, 8)); plt.legend(loc='best');plt.tight_layout();plt.ylabel('Freedom Index');
# Repeat of the BRICS project-count plot for side-by-side comparison.
brics_nations.unstack('country').fillna(0).plot(subplots=True, figsize=(8, 8),kind='bar'); plt.legend(loc='best');plt.tight_layout()
Freedom versus funding
# Reload the full Freedom Index and take the 2013 snapshot only, to compare
# countries' current state against the historical funding.
f = codecs.open(file_path("fredict"), encoding='iso-8859-1')
free_df = pd.read_csv(f)
free_df2 = free_df[free_df['index year']==2013].copy()
# For simplicity, ignore countries that have not been scored.
free_df2 = free_df2[free_df2['overall score']!='N/A'].copy()
free_df2.columns
# NOTE(review): 'overall score' is still a string here, so these sorts are
# lexicographic, not numeric -- convert with astype(float) first to be safe.
# Ten lowest-freedom countries.
low_freedom = free_df2.sort(['overall score'], ascending=True)
low_freedom = low_freedom[:10]
low_freedom[['name', 'overall score']]
# Ten highest-freedom countries.
high_freedom = free_df2.sort(['overall score'], ascending=False)
high_freedom = high_freedom[:10]
high_freedom[['name', 'overall score']]
# high corruption (lowest 'freedom from corruption' scores)
high_corruption = free_df2.sort(['freedom from corruption'], ascending=True)
high_corruption = high_corruption[:10]
high_corruption[['name', 'freedom from corruption']]
# low corruption (highest 'freedom from corruption' scores)
low_corruption = free_df2.sort(['freedom from corruption'], ascending=False)
low_corruption = low_corruption[:10]
low_corruption[['name', 'freedom from corruption']]
numOfproj_by_country[:10]
# Recall resultcpi: the projects funded, converted to 2013 dollars.
# Sort by country.
# NOTE(review): DataFrame.sort(column=...) is the deprecated pre-0.17 API.
country_cpi = resultcpi.sort(column='country', ascending=True)
country_cpi[['country', 'totalamt', 'grantamt', 'year']][:2]
# The country column also contains continents/aggregates; for now only NaN
# rows are dropped (an explicit continent filter was started and abandoned).
country_cpi = country_cpi.dropna()
# Because the Freedom Index and the World Bank use different naming
# conventions, the countries of interest are listed manually using the
# World Bank's official names.
low_Freedom_list= ['Belize','Turkmenistan','Republic of Zimbabwe','Republic of Uzbekistan',
    'Republic of Haiti', 'Republic of Burundi', 'Republic of Equatorial Guinea',
    'People\'s Republic of Angola', 'Republica Bolivariana de Venezuela']
# NOTE(review): 'Common of Australia' looks like a typo for
# 'Commonwealth of Australia' and will match nothing.
high_Freedom_list= ['Kingdom of Norway', 'New Zealand', 'Kingdom of Denmark', 'Republic of Finland',
    'Republic of Sweden', 'Kingdom of The Netherlands', 'Common of Australia']
# Number of projects committed to each low-freedom country.
low_Freedom_nations = country_cpi[country_cpi['country'].isin(low_Freedom_list)].groupby(['country']).size()
low_Freedom_nations
low_Freedom_nations.plot(kind='bar', title='Lending to Countries with low Freedom Index'); plt.tight_layout()
From this point, we are interested in expanding on the previous analysis by retrieving information from Wikipedia and other sources.
# Retrieve country-level indicators (HDI, Gini, GDP, population) from
# Wikipedia infoboxes to enrich the World Bank data.
# (Bug fix: a stray '¶' character before the first import made this cell a
# syntax error.)
import pandas as pd
import wikipydia as wk
import mwparserfromhell
from wikitools import wiki
from wikitools import api
from wikitools import category
from wikitools import page
import itertools
import re

# Entry point of the MediaWiki API used for all page lookups below.
wikisite = "http://en.wikipedia.org/w/api.php"
wikiObject = wiki.Wiki(wikisite)
projectsAPI = pd.read_csv('../data/projects_operations_api.csv')
wikipediadf = pd.read_csv('../data/matchcountries.csv')
# some cleaning on the datasets
wikipediadf.index =wikipediadf['countryname']
projectsAPI['countryname'] = [str(country).split(";")[0] for country in projectsAPI['countryname']]
#print matchNames.columns
#print projectsAPI.columns
projects = pd.merge(projectsAPI,wikipediadf, on='countryname', how = 'left')
projects = projects[projects['countryname'].map(type) != type(0.0)]
projectsAPI = projectsAPI[projectsAPI['countryname'].map(type) != type(0.0)]
projects['totalamt'] = projects['totalamt'].str.replace(';','')
projects['totalamt'] = projects['totalamt'].astype('float32')
print projects.columns
projects['year'] = [str(x)[0:4] for x in projects['boardapprovaldate']]
projects[projects.year == 'nan'] =[str(x)[0:4] for x in projects[projects.year == 'nan']['closingdate']]
import matplotlib.pyplot as plt
import matplotlib.colors as col
def color_variant(hex_color, brightness_offset=1):
    """Return hex_color with each RGB channel shifted by brightness_offset.

    hex_color must look like '#87c95f'; channels are clamped to [0, 255].
    Raises Exception on a malformed color string.
    """
    if len(hex_color) != 7:
        raise Exception("Passed %s into color_variant(), needs to be in #87c95f format." % hex_color)
    # Parse the three two-character channel substrings as base-16 ints.
    rgb = [int(hex_color[i:i + 2], 16) for i in (1, 3, 5)]
    # Clamp each shifted channel into the valid byte range.
    clamped = [min(255, max(0, c + brightness_offset)) for c in rgb]
    # '%02x' zero-pads single-digit values (the original padded by hand
    # with an if/else on hex() slices).
    return "#" + "".join("%02x" % c for c in clamped)
def drawBarCharReference(Color, targetlist, field, title, labels):
    """Bar chart of `field` sorted ascending, shaded in 10 brightness bands.

    Side effect: records each row's band color into targetlist['color'] so
    the map-drawing functions can reuse the same palette.
    labels is (xlabel, ylabel).
    """
    fig = plt.figure(num=None, figsize=(24, 8), dpi=700, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ColorBase = Color
    changeRange = 0.10
    i = 0
    for x in targetlist.sort(columns=field, ascending=True).index:
        # Brighten the base color after each 10% slice of the rows.
        if i / float(len(targetlist.index)) > changeRange:
            ColorBase = color_variant(ColorBase, 20)
            changeRange = changeRange + 0.10
        targetlist['color'][x] = ColorBase
        # Bug fix: the bare name `matplotlib` was never imported (only
        # `matplotlib.pyplot as plt` and `matplotlib.colors as col` are
        # bound), so matplotlib.colors.colorConverter raised NameError.
        ax.bar(i, float(targetlist[field][x]), 1, color=col.colorConverter.to_rgb(ColorBase))
        i += 1
    # Label each bar with the second level of the (wikiname, mapname) index.
    ax.set_xticklabels([x[1] for x in targetlist.sort(columns=field, ascending=True).index])
    plt.xticks(np.arange(0.5, i + 1, 1))
    plt.setp(ax.get_xticklabels(), fontsize=9, rotation='vertical')
    plt.setp(ax.get_yticklabels(), fontsize=10)
    plt.title(title)
    plt.xlabel(labels[0], fontsize=18)
    plt.ylabel(labels[1], fontsize=18)
    plt.show()
# http://www.geophysique.be/2013/02/12/matplotlib-basemap-tutorial-10-shapefiles-unleached-continued/
#
# BaseMap example by geophysique.be
# tutorial 10
import os
import inspect
import numpy as np
import matplotlib.pyplot as plt
from itertools import islice, izip
from mpl_toolkits.basemap import Basemap
def zip_filter_by_state(records, shapes, included_states=None):
    """Yield (record, shape) pairs whose record[1] is in included_states.

    included_states is a list of state/country FIPS prefixes.  None (the
    default) means no filtering, as the original comment promised -- the
    old code raised TypeError on the None default because it tested
    `record[1] in None`.
    """
    for (record, state) in zip(records, shapes):
        if included_states is None or record[1] in included_states:
            yield (record, state)
def draw_global_map(colors, indexlist, titles):
    """Draw a world map: every country in the background colors, then
    overlay the countries present in indexlist using their per-country
    'color' column.

    colors    -- [facecolor, edgecolor] for the background countries
    indexlist -- DataFrame indexed by (wikiname, mapname) with a 'color' column
    titles    -- list whose first element is the plot title
    """
    ### PARAMETERS FOR MATPLOTLIB :
    import matplotlib as mpl
    mpl.rcParams['font.size'] = 14.
    mpl.rcParams['font.family'] = 'Serif'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 40.
    mpl.rcParams['ytick.labelsize'] = 20.
    fig = plt.figure(figsize=(11.7,8.3))
    # Custom adjust of the subplots.
    plt.subplots_adjust(left=0.05,right=0.95,top=0.90,bottom=0.05,wspace=0.15,hspace=0.05)
    ax = plt.subplot(111)
    # Mercator projection covering the whole world except the polar regions.
    x1 = -179.
    x2 = 179.
    y1 = -60.
    y2 = 80.
    i=0
    m = Basemap(resolution='i',projection='merc', llcrnrlat=y1,urcrnrlat=y2,llcrnrlon=x1,urcrnrlon=x2,lat_ts=(y1+y2)/2)
    m.drawcountries(linewidth=0.5)
    m.drawcoastlines(linewidth=0.5)
    m.drawparallels(np.arange(y1,y2,20.),labels=[1,0,0,0],color='black',dashes=[1,0],labelstyle='+/-',linewidth=0.2) # draw parallels
    m.drawmeridians(np.arange(x1,x2,20.),labels=[0,0,0,1],color='black',dashes=[1,0],labelstyle='+/-',linewidth=0.2) # draw meridians
    from matplotlib.collections import LineCollection
    from matplotlib import cm
    import shapefile
    basemap_data_dir = os.path.join(os.path.dirname(inspect.getfile(Basemap)), "data")
    # Both branches load the same world-boundary shapefile; the existence
    # check is a leftover from the US-county tutorial this is based on.
    if os.path.exists(os.path.join(basemap_data_dir,"UScounties.shp")):
        shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    else:
        shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    shapes = shpf.shapes()
    records = shpf.records()
    # First pass: draw every country polygon in the background colors.
    for record, shape in zip(records, shapes):
        lons,lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        # shape.parts holds the start index of each polygon ring; split
        # the projected point array into one segment per ring.
        if len(shape.parts) == 1:
            segs = [data,]
        else:
            segs = []
            for i in range(1,len(shape.parts)):
                index = shape.parts[i-1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs,antialiaseds=(1,))
        lines.set_facecolors(colors[0])
        lines.set_edgecolors(colors[1])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)
    # Second pass: re-draw only the countries listed in indexlist, colored
    # by their precomputed 'color' column.
    for record, shape in zip_filter_by_state(records, shapes, [x[1] for x in indexlist.index]):
        lons,lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data,]
        else:
            segs = []
            for i in range(1,len(shape.parts)):
                index = shape.parts[i-1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs,antialiaseds=(1,))
        i=i+1
        x_color=None
        # NOTE(review): this reads the global `heatmapfounding` instead of
        # the `indexlist` parameter -- it only works when the caller passes
        # that same DataFrame.  Confirm before reusing with other data.
        for w in heatmapfounding.index:
            if record[1] in w[1]:
                x_color = w
                break
        lines.set_facecolors(indexlist['color'][x_color])
        lines.set_edgecolors(indexlist['color'][x_color])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)
    plt.title(titles[0])
    plt.savefig('tutorial10.png',dpi=300)
    plt.show()
#draw_global_map(['#3C989E','#424242'],heatmapfounding, ['Total World Bank Lending Commitments Accumulated 2001-2013'])
# Accumulate total commitments per (wikiname, mapname) for 2001-2013.
heatmapfounding = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname','mapname','totalamt','year'])
heatmapfounding = pd.DataFrame(heatmapfounding[heatmapfounding.year>='2001'], columns=['wikiname','mapname','totalamt','year'])
heatmapfounding = heatmapfounding.groupby(['wikiname','mapname']).sum()
# Placeholder color column; drawBarCharReference overwrites it with the
# real band colors as a side effect.
heatmapfounding['color'] = pd.Series(["hola" for x in heatmapfounding.index], index=heatmapfounding.index)
drawBarCharReference( '#C73F2A',heatmapfounding, "totalamt","Total World Bank Lending Commitments Accumulated 2001-2013",['Country','US$'])
draw_global_map(['#ffffff','#000000'],heatmapfounding, ['Total World Bank Lending Commitments Accumulated 2001-2013'])
def cleanFloatnumber(x):
    """Extract the first float found in a wiki-markup string.

    Floats pass through unchanged; empty strings, strings without a
    number, and any other input type yield None.
    """
    if type(x) is float:
        return float(x)
    if type(x) is not str:
        return None
    if not x:
        return None
    # Drop HTML comments and tag-like fragments, then scan what remains.
    stripped = re.sub('<!--.*?-->', '', x)
    stripped = re.sub('<*?>.*?<*?>', '', stripped)
    stripped = stripped.strip()
    matches = re.findall(r'[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?', stripped)
    return float(matches[0]) if matches else None
def cleanIntNumber(x):
    """Extract the first integer (returned as a float) from a wiki string.

    Commas are treated as thousands separators and removed.  Floats pass
    through unchanged; anything unparseable yields None.
    """
    if type(x) is float:
        return float(x)
    if type(x) is not str:
        return None
    if not x:
        return None
    # Strip HTML comments, tag-like fragments, and thousands separators.
    cleaned = re.sub('<!--.*?-->', '', x)
    cleaned = re.sub('<*?>.*?<*?>', '', cleaned)
    cleaned = re.sub(',', '', cleaned)
    cleaned = cleaned.strip()
    digits = re.findall(r'[0-9]+', cleaned)
    return float(digits[0]) if digits else None
def get_infobox_from_wikipedia(countryname):
    """Fetch a country's Wikipedia page and pull HDI, Gini, GDP, nominal
    GDP per capita, and population out of its 'Infobox country' template.

    Returns a 5-tuple (hdi, gini, GDP, GDP_nominal_per_capita, population);
    each field is None when the page, the infobox, or that parameter is
    missing.
    """
    country_found = False
    hdi = None
    gini = None
    GDP = None
    GDP_nominal_per_capita = None
    population = None
    # Guard against blank / NaN country names coming from the merge.
    if str(countryname).strip() == "" or countryname is None or str(countryname).strip()=='nan':
        return hdi,gini,GDP,GDP_nominal_per_capita, population
    try:
        wikipage = page.Page(wikiObject,title=countryname)
    except Exception as inst:
        print "No results from Wikipedia: "+str(countryname)
        return hdi,gini,GDP,GDP_nominal_per_capita, population
    wikiraw = wikipage.getWikiText()
    wikiraw = wikiraw.decode('UTF-8')
    parsedWikiText = mwparserfromhell.parse(wikiraw)
    for x in parsedWikiText.nodes:
        # Only the 'Infobox country' template node carries the indicators.
        if "template" in str(type(x)) and "Infobox country" in str(x.name):
            country_found = True
            # Prefer the census figure; fall back to the estimate.
            if x.has_param('population_census'):
                population = cleanIntNumber(str(x.get('population_census').value))
            if population is None:
                if x.has_param('population_estimate'):
                    population = cleanIntNumber(str(x.get('population_estimate').value))
            if x.has_param('HDI'):
                hdi = cleanFloatnumber(str(x.get('HDI').value))
            if x.has_param('Gini'):
                gini = cleanFloatnumber(str(x.get('Gini').value))
            if x.has_param('GDP'):
                # NOTE(review): GDP is kept as raw wiki markup, unlike the
                # other numeric fields -- confirm downstream usage.
                GDP = x.get('GDP').value
            if x.has_param('GDP_nominal_per_capita'):
                GDP_nominal_per_capita = str(x.get('GDP_nominal_per_capita').value)
            break
    if country_found == False:
        print "No Infobox: "+str(countryname)
    return hdi,gini,GDP,GDP_nominal_per_capita,population
# Map every wikiname through the scraper; zip(*...) splits the returned
# 5-tuples into five new columns.
wikipediadf["HDI"], wikipediadf["gini"],wikipediadf['GDP'],wikipediadf['GDP_nominal_per_capita'],wikipediadf['population'] = zip(*wikipediadf['wikiname'].map(get_infobox_from_wikipedia))
# It was not possible to process this data from wikipedia, so I decided to filter it (Ignacio)
# NOTE(review): `typeFound is not None` is always True (typeFound is a type
# object, never the value None), and the `break` stops after the first drop,
# so each loop removes at most one row.  Confirm whether that is intended.
for i in wikipediadf[wikipediadf.type == 'Country'].index:
    typeFound = type(wikipediadf['population'][i])
    if typeFound is not float and typeFound is not None:
        print "deleted"
        wikipediadf=wikipediadf.drop([i])
        break
for i in wikipediadf[wikipediadf.type == 'Country'].index:
    typeFound = type(wikipediadf['GDP_nominal_per_capita'][i])
    if typeFound is not float and typeFound is not None:
        print "deleted"
        wikipediadf=wikipediadf.drop([i])
        break
# Re-merge the projects with the enriched wikipedia table and repeat the
# cleaning from the earlier cell.
projects = pd.merge(projectsAPI,wikipediadf, on='countryname', how = 'left')
projects = projects[projects['countryname'].map(type) != type(0.0)]
projectsAPI = projectsAPI[projectsAPI['countryname'].map(type) != type(0.0)]
projects['totalamt'] = projects['totalamt'].str.replace(';','')
projects['totalamt'] = projects['totalamt'].astype('float32')
print projects.columns
projects['year'] = [str(x)[0:4] for x in projects['boardapprovaldate']]
# NOTE(review): this assigns year strings across entire rows (every column)
# for rows whose year is 'nan'; probably only the 'year' column was meant.
projects[projects.year == 'nan'] =[str(x)[0:4] for x in projects[projects.year == 'nan']['closingdate']]
def drawBarCharReference2(Color, targetlist, field, title, labels):
    """Bar chart of `field` sorted ascending, shaded in 10 brightness bands.

    Variant of drawBarCharReference that labels bars with the 'mapname'
    column instead of the index.  Side effect: records each row's band
    color into targetlist['color'].  labels is (xlabel, ylabel).
    """
    fig = plt.figure(num=None, figsize=(24, 8), dpi=700, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ColorBase = Color
    changeRange = 0.10
    i = 0
    for x in targetlist.sort(columns=field, ascending=True).index:
        # Brighten the base color after each 10% slice of the rows.
        if i / float(len(targetlist.index)) > changeRange:
            ColorBase = color_variant(ColorBase, 20)
            changeRange = changeRange + 0.10
        targetlist['color'][x] = ColorBase
        # Bug fix: the bare name `matplotlib` was never imported (only
        # `matplotlib.pyplot as plt` and `matplotlib.colors as col` are
        # bound), so matplotlib.colors.colorConverter raised NameError.
        ax.bar(i, float(targetlist[field][x]), 1, color=col.colorConverter.to_rgb(ColorBase))
        i += 1
    # Label each bar with the row's 'mapname' value.
    ax.set_xticklabels([targetlist['mapname'][x] for x in targetlist.sort(columns=field, ascending=True).index])
    plt.xticks(np.arange(0.5, i + 1, 1))
    plt.setp(ax.get_xticklabels(), fontsize=9, rotation='vertical')
    plt.setp(ax.get_yticklabels(), fontsize=10)
    plt.title(title)
    plt.xlabel(labels[0], fontsize=18)
    plt.ylabel(labels[1], fontsize=18)
    plt.show()
def zip_filter_by_state2(records, shapes, included_states=None):
    """Yield (record, shape) pairs whose record[1] is in included_states.

    included_states is a list of state/country FIPS prefixes.  None (the
    default) means no filtering, as the original comment promised -- the
    old code raised TypeError on the None default because it tested
    `record[1] in None`.
    """
    for (record, state) in zip(records, shapes):
        if included_states is None or record[1] in included_states:
            yield (record, state)
def draw_global_map2(colors, indexlist, titles):
    """Draw a world map: every country in the background colors, then
    overlay the countries listed in indexlist['mapname'] using their
    per-country 'color' column.

    Variant of draw_global_map that matches countries on the 'mapname'
    column instead of the DataFrame index.
    colors    -- [facecolor, edgecolor] for the background countries
    indexlist -- DataFrame with 'mapname' and 'color' columns
    titles    -- list whose first element is the plot title
    """
    ### PARAMETERS FOR MATPLOTLIB :
    import matplotlib as mpl
    mpl.rcParams['font.size'] = 14.
    mpl.rcParams['font.family'] = 'Serif'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 40.
    mpl.rcParams['ytick.labelsize'] = 20.
    fig = plt.figure(figsize=(11.7,8.3))
    # Custom adjust of the subplots.
    plt.subplots_adjust(left=0.05,right=0.95,top=0.90,bottom=0.05,wspace=0.15,hspace=0.05)
    ax = plt.subplot(111)
    # Mercator projection covering the whole world except the polar regions.
    x1 = -179.
    x2 = 179.
    y1 = -60.
    y2 = 80.
    i=0
    m = Basemap(resolution='i',projection='merc', llcrnrlat=y1,urcrnrlat=y2,llcrnrlon=x1,urcrnrlon=x2,lat_ts=(y1+y2)/2)
    m.drawcountries(linewidth=0.5)
    m.drawcoastlines(linewidth=0.5)
    m.drawparallels(np.arange(y1,y2,20.),labels=[1,0,0,0],color='black',dashes=[1,0],labelstyle='+/-',linewidth=0.2) # draw parallels
    m.drawmeridians(np.arange(x1,x2,20.),labels=[0,0,0,1],color='black',dashes=[1,0],labelstyle='+/-',linewidth=0.2) # draw meridians
    from matplotlib.collections import LineCollection
    from matplotlib import cm
    import shapefile
    basemap_data_dir = os.path.join(os.path.dirname(inspect.getfile(Basemap)), "data")
    # Both branches load the same world-boundary shapefile; the existence
    # check is a leftover from the US-county tutorial this is based on.
    if os.path.exists(os.path.join(basemap_data_dir,"UScounties.shp")):
        shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    else:
        shpf = shapefile.Reader("../data/world_country_admin_boundary_shapefile_with_fips_codes.shp")
    shapes = shpf.shapes()
    records = shpf.records()
    # First pass: draw every country polygon in the background colors.
    for record, shape in zip(records, shapes):
        lons,lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        # shape.parts holds the start index of each polygon ring; split
        # the projected point array into one segment per ring.
        if len(shape.parts) == 1:
            segs = [data,]
        else:
            segs = []
            for i in range(1,len(shape.parts)):
                index = shape.parts[i-1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs,antialiaseds=(1,))
        lines.set_facecolors(colors[0])
        lines.set_edgecolors(colors[1])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)
    # Second pass: re-draw only the countries whose mapname appears in
    # indexlist, colored by the precomputed 'color' column.
    for record, shape in zip_filter_by_state2(records, shapes, [indexlist['mapname'][x] for x in indexlist.index]):
        lons,lats = zip(*shape.points)
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data,]
        else:
            segs = []
            for i in range(1,len(shape.parts)):
                index = shape.parts[i-1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            segs.append(data[index2:])
        lines = LineCollection(segs,antialiaseds=(1,))
        i=i+1
        x_color=None
        # Find the index entry whose mapname contains this record's name.
        for (w,x) in [(indexlist['mapname'][x],x) for x in indexlist.index]:
            if type(w) is str and record[1] in w:
                x_color = x
                break
        lines.set_facecolors(indexlist['color'][x_color])
        lines.set_edgecolors(indexlist['color'][x_color])
        lines.set_linewidth(0.1)
        ax.add_collection(lines)
    plt.title(titles[0])
    plt.savefig('tutorial10.png',dpi=300)
    plt.show()
The Human Development Index (HDI) is a composite statistic of life expectancy, education, and income indices to rank countries into four tiers of human development. It was created by economist Mahbub ul Haq, followed by economist Amartya Sen in 1990,[1] and published by the United Nations Development Programme.[2]
Published on 4 November 2010 (and updated on 10 June 2011), starting with the 2010 Human Development Report the HDI combines three dimensions:
# Columns added to wikipediadf by the scraper above: HDI, gini, GDP,
# GDP_nominal_per_capita, population.
# Build the per-country HDI table used for the bar chart and the world map.
heatmapHDI = pd.DataFrame(projects[projects.type == 'Country'], columns=['wikiname','mapname','HDI'])
heatmapHDI = heatmapHDI.fillna(0)
heatmapHDI = heatmapHDI.drop_duplicates()
# Placeholder color column; drawBarCharReference2 overwrites it with the
# real band colors as a side effect.
heatmapHDI['color'] = pd.Series(["hola" for x in heatmapHDI.index], index=heatmapHDI.index)
drawBarCharReference2( '#425910',heatmapHDI, 'HDI',"Human Development Index (Wikipedia)", ['Country','HDI Index'])
draw_global_map2(['#ffffff','#000000'],heatmapHDI, ['Human Development Index Map (Wikipedia)'])