#!/usr/bin/env python # coding: utf-8 # # Web Scraping Into a MySQL Database # # Project Steps: # 1. Extract data using Fixer.io Currency API. # 2. Transform the data into a relevant form. # 3. Load the data into a MySQL Data-Warehouse. # 4. Data Analysis/Visualization # In[1]: # Import packages import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import pandasql as ps import pymysql import requests from bs4 import BeautifulSoup from datetime import datetime import matplotlib.dates as md sns.set_style('white') # ## 1. Extract data using Fixer.io Currency API. # # Will extract EUR/USD currency data for 2018-2020. More details of API usage at https://fixer.io/ # In[2]: # Generating a range of daily dates from 2018 to 2020 dates = pd.date_range(start='1/1/2018', end = '1/1/2021').to_pydatetime().tolist() # In[3]: # Generating a dictionary where the keys are the days and the values are the currencies rates with a EUR base data = {} for date in range(0,len(dates)): url = 'http://api.exchangeratesapi.io/v1/'+dates[date].strftime("%Y-%m-%d")+'?access_key=*************' r = requests.get(url) html = r.text soup = BeautifulSoup(html) data[dates[date]] = soup # It's a good practice to have a backup for your data, especially when relying on external services. Better safe than sorry! # In[4]: # Create backup data source backupData = data.copy() # ## 2. Transform data to relevant form # # We need to extract the US Dollar rates against the EUR (EUR/USD). Fixer.io sends data in HTML format, which is a lengthy string in Python. I've utilized string manipulation to extract USD rates for each specified date and compiled them into a dictionary with dates as keys and rates as values. # In[5]: EUR_USD_rates = {} for day in range(len(dates)): # Extracting the HTML Data d = data[dates[day]].text # Extracting the Date from the Data start_date_location = d.find('"date":"') + len('"date":"') end_date_location = d.find('","rates"') date = d[start_date_location:end_date_location] # Extracting the Rate from the Data start_rate_location = d.find('"USD":') + len('"USD":') end_rate_location = d.find(',"UYU"') rate = float(d[start_rate_location:end_rate_location]) # Creating a Dictionary where the keys are the dates and the values are the EUR/USD rates EUR_USD_rates[date] = rate # Creating back-up for data # In[6]: # Creating a Backup for the Data rates_backup_data = EUR_USD_rates.copy() # ## 3. Load the data into a MySQL Data-Warehouse. # # Before uploading the data to MySQL, I'll save it as a CSV file for future analysis. This ensures I have easy access for later evaluations. # In[45]: # Creating a Panda's Dataframe out of the rates dictionary df = pd.DataFrame.from_dict(EUR_USD_rates, orient='index', columns=['Rates']) # Load data into a CSV file df.to_csv('EUR_USD_Rates.csv') # In[10]: # Import pymysql module import pymysql # Connect to the database connection = pymysql.connect(host='localhost', user='root', password='*****', db='mysql') cursor = connection.cursor() # Creating a table called “rates” # In[11]: sql = "CREATE TABLE rates(day VARCHAR(50) NOT NULL, rate FLOAT NOT NULL)" # Executing Query cursor.execute(sql) # Commit the Query connection.commit() # In[13]: df.head() # To populate the table with my extracted data, I'll need some transformations to make it compatible with MySQL. Essentially, I'll convert the earlier-created dataframe into a tuple. Each row in this tuple will contain a 'date' column (String) and a 'rate' column (Float). # In[14]: dates = df.index.values rates = df.iloc[:,0].values data_tuple = tuple(zip(dates, rates)) # Inserting data # In[15]: for data in range(len(data_tuple)): # Create a new record sql = "INSERT INTO rates (`day`, `rate`) VALUES (%s, %s)" # Execute the query cursor.execute(sql,data_tuple[data]) # Commit the transaction connection.commit() # In[16]: # Select All Rows from the Database sql = "select * from rates" cursor.execute(sql) result = cursor.fetchall() # Print the first 10 Rows for i in result[0:10]: print(i) # ## 4. Data Analysis/Visualization # In[17]: # Loading data from the CSV file df = pd.read_csv('EUR_USD_Rates.csv') # Renaming the date column df.rename({'Unnamed: 0':'day'},axis = 1, inplace=True) # Generating simple descriptive statistics df.describe() # ### Analysis # # Over the past three years, the average USD to EUR exchange rate stands at 1.147. With a standard deviation that's just 4.2% of the mean, the data exhibits low volatility. # # The interquartile range further confirms this stability, showing that half of the values range from 1.115 to 1.176. # In[29]: df # ### Plot Trending rate # In[25]: fig, ax = plt.subplots(figsize = [22,8]) # specify the position of the major ticks at the beginning of the week ax.xaxis.set_major_locator(md.WeekdayLocator(byweekday = 1)) # specify the format of the labels as 'year-month-day' ax.xaxis.set_major_formatter(md.DateFormatter('%Y-%m-%d')) # (optional) rotate by 90° the labels in order to improve their spacing plt.setp(ax.xaxis.get_majorticklabels(), rotation = 90) # specify the position of the minor ticks at each day ax.xaxis.set_minor_locator(md.DayLocator(interval = 100)) ax.tick_params(axis = 'x', which = 'major', length = 10) ax.tick_params(axis = 'x', which = 'minor', length = 5) sns.lineplot(data=df, x = 'day', y = 'Rates', ax=ax) plt.xlabel('day',size = 16) plt.ylabel('Rates',size = 16) plt.title('Trending Rates - EUR/USD - By Day',size = 22) plt.show() # ### Box Plot by Month # # # In[36]: # Creating a Month column df['month'] = pd.DatetimeIndex(df.day).month # Creating a Year column df['year'] = pd.DatetimeIndex(df.day).year # Sample Data df # In[37]: sns.catplot(data=df.reset_index(), x = 'month', y = 'Rates', kind = 'box') plt.xlabel('Month',size = 16) plt.ylabel('Rates',size = 16) plt.title('Trending Rates - EUR/USD - By Month',size = 22) plt.show() # ### Point plot by Month # In[38]: sns.catplot(data=df.reset_index(), x = 'month', y = 'Rates', kind = 'point') plt.xlabel('Month',size = 16) plt.ylabel('Rates',size = 16) plt.title('Trending Rates - EUR/USD - By Month',size = 22) plt.show() # ### Trending Point Plot by Month/Year # # # In[39]: sns.catplot(data=df.reset_index(), x = 'month', y = 'Rates', kind = 'point',col = 'year') plt.show() # ### Density Plot by Month # In[40]: fig, ax = plt.subplots(figsize = [18,8]) sns.kdeplot(data = df, x = 'month', y = 'Rates') plt.show() # ### Kernel Density Plot — Comparison By Year # In[41]: def sql(query): return ps.sqldf(query) query1 = ''' select Rates from df where year = 2018 ''' query2 = ''' select Rates from df where year = 2019 ''' query3 = ''' select Rates from df where year = 2020 ''' df_2018 = sql(query1) df_2019 = sql(query2) df_2020 = sql(query3) # In[42]: fig, ax = plt.subplots(figsize = [18,8]) sns.kdeplot(data = df_2018,x = 'Rates',shade = True, label='2018') sns.kdeplot(data = df_2019,x = 'Rates',shade = True, label='2019') sns.kdeplot(data = df_2020,x = 'Rates',shade = True, label='2020') plt.legend() plt.show()