Python script by Hassan Mojeed
Email: mojeed.o.hassan@gmail.com
Website: https://hassanmojeed.pages.dev
import pandas as pd
import numpy as np
import os
from glob import glob
import pandas_gbq
import warnings
warnings.filterwarnings("ignore")
pwd = os.getcwd()
datadir = pwd +"/passport index/data"
datadir
'/Users/mj/Projects/Projects/More Projects/passport index/data'
file_paths = glob(datadir+"/*.csv")
file_paths
['/Users/mj/Projects/Projects/More Projects/passport index/data/henley-passport-index--2023.csv', '/Users/mj/Projects/Projects/More Projects/passport index/data/henley-passport-index--2024.csv', '/Users/mj/Projects/Projects/More Projects/passport index/data/passport image link.csv', '/Users/mj/Projects/Projects/More Projects/passport index/data/Positions.csv', '/Users/mj/Projects/Projects/More Projects/passport index/data/henley-passport-index--2006-2022.csv', '/Users/mj/Projects/Projects/More Projects/passport index/data/country code.csv']
# importing 2006 -2022 Henley Passport Index Data
df_one = pd.read_csv("/Users/mj/Projects/Projects/More Projects/passport index/data/henley-passport-index--2006-2022.csv")
# For the purpose of this study we will be focusing on time period from 2010 and above
df_one = df_one[df_one["YEAR"] > 2009]
# Exploring the data
print(str(df_one[df_one["ACCESS TO COUNTRIES"] == 0].value_counts().sum()) + " zeros found in access_to_countries column \n")
47 zeros found in access_to_countries column
# filling rows with zero values
# This function fills a missing value on the "ACCESS TO COUNTRIES" column base on the condition that
# row in focus either has the same rank with previous row or the next row.
def access_to_countries(row):
"""
Function to update "ACCESS TO COUNTRIES" column in the DataFrame based on specific conditions.
Parameters:
- row: DataFrame row representing each entry to be processed
Returns:
- Updated value for "ACCESS TO COUNTRIES" column based on defined conditions
"""
# Check if 'ACCESS TO COUNTRIES' is 0
if row["ACCESS TO COUNTRIES"] == 0:
# Get the rank of the previous entry (if available)
prev_rank = df_one.loc[row.name - 1, "RANK"] if row.name > 0 else None
# Get the rank of the next entry (if available)
next_rank = df_one.loc[row.name + 1, "RANK"] if row.name < len(df_one) - 1 else None
# Check if the current rank matches the previous rank
if prev_rank is not None and row["RANK"] == prev_rank:
# Return the 'ACCESS TO COUNTRIES' value of the previous entry
return df_one.loc[row.name - 1, "ACCESS TO COUNTRIES"]
# Check if the current rank matches the next rank
elif next_rank is not None and row["RANK"] == next_rank:
# Return the 'ACCESS TO COUNTRIES' value of the next entry
return df_one.loc[row.name + 1, "ACCESS TO COUNTRIES"]
# Return the original 'ACCESS TO COUNTRIES' value if conditions are not met
return row["ACCESS TO COUNTRIES"]
# Apply the 'access_to_countries' function to update 'ACCESS TO COUNTRIES' column in df_one DataFrame
df_one["ACCESS TO COUNTRIES"] = df_one.apply(access_to_countries, axis=1)
# This function fills a missing value on the "ACCESS TO COUNTRIES" column base on the condition that
# it finds the average of the "ACCESS TO COUNTRIES" of previous row and the next row.
# Locating all zero values within the ACCESS TO COUNTRIES column
zero_values = df_one[df_one["ACCESS TO COUNTRIES"] == 0].index
# Loop through each row with ACCESS TO COUNTRIES value 0
for indx in zero_values:
# Find the previous and next rows
prev_row = df_one.loc[indx - 1]
next_row = df_one.loc[indx + 1]
# Calculate the average ACCESS TO COUNTRIES value between the previous and next rows
avg_access = (prev_row["ACCESS TO COUNTRIES"] + next_row["ACCESS TO COUNTRIES"]) / 2
# Assign the average ACCESS TO COUNTRIES value to the current row
df_one.at[indx, "ACCESS TO COUNTRIES"] = avg_access
# Calculate the number of Visa Required Destinations based on the total number of countries (226) and ACCESS TO COUNTRIES values
df_one["Visa_Required_Destination"] = 227 - df_one["ACCESS TO COUNTRIES"]
# Rename columns for clarity and consistency
df_one.rename(columns={
"COUNTRY": "Country",
"RANK": "Rank",
"ACCESS TO COUNTRIES": "Visa_Free_Destinations",
"YEAR": "Year"
}, inplace=True)
# Convert Visa_Free_Destinations and Visa_Required_Destination columns to integer type
df_one["Visa_Free_Destinations"] = df_one["Visa_Free_Destinations"].astype(int)
df_one["Visa_Required_Destination"] = df_one["Visa_Required_Destination"].astype(int)
# Print the shape (dimensions) of the DataFrame df_one
print(df_one.shape)
# Print the first few rows of the DataFrame df_one for inspection
print(df_one.head())
(2587, 5) Country Rank Visa_Free_Destinations Year Visa_Required_Destination 0 Japan 1 192 2022 35 1 Singapore 1 192 2022 35 2 Germany 2 190 2022 37 3 South Korea 2 190 2022 37 4 Finland 3 189 2022 38
# importing 2023 Henley Passport Index Data
df_two = pd.read_csv("/Users/mj/Projects/Projects/More Projects/passport index/data/henley-passport-index--2023.csv")
# Adding a year column to the data
df_two["Year"] = 2023
# Renaming the "Score Column" to "Visa_Free"
df_two.rename(columns={"Score" : "Visa_Free_Destinations"}, inplace= True)
# Adding the "Visa_Required_Destination" Column
df_two["Visa_Required_Destination"] = 227 - df_two["Visa_Free_Destinations"]
print(df_two.shape)
print(df_two.head())
(199, 5) Country Rank Visa_Free_Destinations Year Visa_Required_Destination 0 Japan 1 193 2023 34 1 Singapore 2 192 2023 35 2 South Korea 2 192 2023 35 3 Germany 3 190 2023 37 4 Spain 3 190 2023 37
# Import 2024 Henley Passport Index Data from a CSV file
df_three = pd.read_csv("/Users/mj/Projects/Projects/More Projects/passport index/data/henley-passport-index--2024.csv")
# Rank countries based on the "Visa Free" column to generate the 'Rank' column
df_three["Rank"] = df_three["Visa Free"].rank(method="dense", ascending=False)
# Add a 'Year' column with the value 2024 to indicate the year of the data
df_three["Year"] = 2024
# Rename columns for clarity and consistency
df_three.rename(columns={
"Origin": "Country",
"Visa Free": "Visa_Free_Destinations",
"Visa Required": "Visa_Required_Destination"
}, inplace=True)
# Sort the DataFrame by 'Rank' and 'Country' columns in ascending order
df_three.sort_values(by=["Rank", "Country"], ascending=True, inplace=True, ignore_index=True)
# Select and rearrange columns of interest in the DataFrame
df_three = df_three[["Country", "Rank", "Visa_Free_Destinations", "Year", "Visa_Required_Destination"]]
# Convert selected columns to integer type for appropriate data representation
df_three["Visa_Free_Destinations"] = df_three["Visa_Free_Destinations"].astype(int)
df_three["Visa_Required_Destination"] = df_three["Visa_Required_Destination"].astype(int)
df_three["Rank"] = df_three["Rank"].astype(int)
# Display the shape (dimensions) of the DataFrame df_three
print(df_three.shape)
(199, 5)
# Combine all data together
all_passport_ranking_data = pd.concat([df_one, df_two, df_three])
print(all_passport_ranking_data.shape)
all_passport_ranking_data.head()
(2985, 5)
Country | Rank | Visa_Free_Destinations | Year | Visa_Required_Destination | |
---|---|---|---|---|---|
0 | Japan | 1 | 192 | 2022 | 35 |
1 | Singapore | 1 | 192 | 2022 | 35 |
2 | Germany | 2 | 190 | 2022 | 37 |
3 | South Korea | 2 | 190 | 2022 | 37 |
4 | Finland | 3 | 189 | 2022 | 38 |
# Replace specific country names in the 'Country' column of all_passport_ranking_data DataFrame
all_passport_ranking_data["Country"].replace({
"Türkiye": "Turkey",
"Brunei": "Brunei Darussalam",
"Czechia": "Czech Republic",
"The Gambia": "Gambia",
"Vatican City": "Vatican City(Holy See)",
"Moldova": "Republic of Moldova",
"St. Kitts and Nevis": "Saint Kitts and Nevis",
"St. Lucia": "Saint Lucia",
"St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
"Palestinian Territory": "State of Palestine",
"United States": "United States of America"
}, inplace=True)
# Display the shape (dimensions) of the DataFrame all_passport_ranking_data
print(all_passport_ranking_data.shape)
all_passport_ranking_data.to_parquet("Passport Index.parquet")
# Display the first few rows of the DataFrame all_passport_ranking_data for inspection
all_passport_ranking_data.head()
(2985, 5)
Country | Rank | Visa_Free_Destinations | Year | Visa_Required_Destination | |
---|---|---|---|---|---|
0 | Japan | 1 | 192 | 2022 | 35 |
1 | Singapore | 1 | 192 | 2022 | 35 |
2 | Germany | 2 | 190 | 2022 | 37 |
3 | South Korea | 2 | 190 | 2022 | 37 |
4 | Finland | 3 | 189 | 2022 | 38 |
# importing the positions data. This help to provide the 1st, 2nd, 3rd and so on columns for this study
file_position = pd.read_csv("/Users/mj/Projects/Projects/More Projects/passport index/data/positions.csv")
print(file_position.shape)
file_position.head()
(199, 2)
Rank | Position | |
---|---|---|
0 | 1 | 1st |
1 | 2 | 2nd |
2 | 3 | 3rd |
3 | 4 | 4th |
4 | 5 | 5th |
# merging with the final data
merged_data = pd.merge(all_passport_ranking_data,file_position, how='left', on='Rank')
print(merged_data.shape)
merged_data.head()
(2985, 6)
Country | Rank | Visa_Free_Destinations | Year | Visa_Required_Destination | Position | |
---|---|---|---|---|---|---|
0 | Japan | 1 | 192 | 2022 | 35 | 1st |
1 | Singapore | 1 | 192 | 2022 | 35 | 1st |
2 | Germany | 2 | 190 | 2022 | 37 | 2nd |
3 | South Korea | 2 | 190 | 2022 | 37 | 2nd |
4 | Finland | 3 | 189 | 2022 | 38 | 3rd |
# write data to Google Big Query
table = "Country_Rank.passport index"
project_id = "cool-ship-407420"
pandas_gbq.to_gbq(merged_data,destination_table=table, project_id=project_id, if_exists="replace")
# Importing country code data
country_code_data = pd.read_csv("/Users/mj/Projects/Projects/More Projects/passport index/data/country code.csv")
# Replace specific country names in the 'country' column of country_code_data DataFrame
country_code_data["country"].replace({
"Türkiye": "Turkey",
"Brunei": "Brunei Darussalam",
"Czechia": "Czech Republic",
"The Gambia": "Gambia",
"Vatican City": "Vatican City(Holy See)",
"Moldova": "Republic of Moldova",
"St. Kitts and Nevis": "Saint Kitts and Nevis",
"St. Lucia": "Saint Lucia",
"St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
"Palestinian Territory": "State of Palestine",
"United States": "United States of America"
}, inplace=True)
# Display the shape (dimensions) of the DataFrame country_code_data
print(country_code_data.shape)
country_code_data.to_parquet("Country and Code.parquet")
country_code_data.head()
(199, 2)
code | country | |
---|---|---|
0 | af | Afghanistan |
1 | al | Albania |
2 | dz | Algeria |
3 | ad | Andorra |
4 | ao | Angola |
# Writing data to Google Big Query
#table = "Country_Rank.Country and Code"
#project_id = "cool-ship-407420"
#pandas_gbq.to_gbq(country_code_data,destination_table=table, project_id=project_id, if_exists="replace")
# Importing data for all links to passport images
df = pd.read_csv("/Users/mj/Projects/Projects/More Projects/passport index/data/passport image link.csv")
df.to_parquet("Passport Images Data.parquet")
print(df.shape)
df.head()
(199, 2)
Country Code | Passport Url | |
---|---|---|
0 | af | https://www.passportindex.org/countries/af.png |
1 | al | https://www.passportindex.org/countries/al.png |
2 | dz | https://www.passportindex.org/countries/dz.png |
3 | ad | https://www.passportindex.org/countries/ad.png |
4 | ao | https://www.passportindex.org/countries/ao.png |
# Writing data to Google Big Query
#table = "Country_Rank.Passport Images Data"
#project_id = "cool-ship-407420"
#pandas_gbq.to_gbq(df,destination_table=table, project_id=project_id, if_exists="replace")
# Importing data for all links to passport images
xcel_data = pd.read_excel("/Users/mj/Projects/Projects/More Projects/passport index/data/Country_Capital_Language.xlsx")
xcel_data.to_parquet("Country_Capital_Language.parquet")
xcel_data.head()
Country | Capital city | Language | |
---|---|---|---|
0 | Afghanistan | Kabul | Dari Persian |
1 | Albania | Tirana | Albanian |
2 | Algeria | Algiers | Arabic |
3 | Andorra | Andorra la Vella | Catalan |
4 | Angola | Luanda | Portuguese |
# Writing data to Google Big Query
#table = "Country_Rank.Country_Capital_Language"
#project_id = "cool-ship-407420"
#pandas_gbq.to_gbq(xcel_data,destination_table=table, project_id=project_id, if_exists="replace")