import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as po
po.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb
%matplotlib inline
# Default size (width, height) applied to every matplotlib figure in this notebook.
rcParams.update({'figure.figsize': (8, 5)})

# Read the aggregated ATM withdrawal data and preview its last five rows.
atm = pd.read_csv('AggregatedData.csv')
atm.tail()
ATM Name | Transaction Date | No Of Withdrawals | No Of XYZ Card Withdrawals | No Of Other Card Withdrawals | Total amount Withdrawn | Amount withdrawn XYZ Card | Amount withdrawn Other Card | Weekday | Festival Religion | Working Day | Holiday Sequence | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
11584 | Big Street ATM | 29-09-2017 | 137 | 34 | 103 | 468800 | 146200 | 322600 | FRIDAY | H | H | WHH |
11585 | Mount Road ATM | 29-09-2017 | 79 | 27 | 52 | 305100 | 172500 | 132600 | FRIDAY | H | H | WHH |
11586 | Airport ATM | 29-09-2017 | 117 | 77 | 40 | 709900 | 576800 | 133100 | FRIDAY | H | H | WHH |
11587 | KK Nagar ATM | 29-09-2017 | 76 | 48 | 28 | 408700 | 279900 | 128800 | FRIDAY | H | H | WHH |
11588 | Christ College ATM | 29-09-2017 | 143 | 61 | 82 | 700400 | 364200 | 336200 | FRIDAY | H | H | WHH |
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
atm.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11589 entries, 0 to 11588 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ATM Name 11589 non-null object 1 Transaction Date 11589 non-null object 2 No Of Withdrawals 11589 non-null int64 3 No Of XYZ Card Withdrawals 11589 non-null int64 4 No Of Other Card Withdrawals 11589 non-null int64 5 Total amount Withdrawn 11589 non-null int64 6 Amount withdrawn XYZ Card 11589 non-null int64 7 Amount withdrawn Other Card 11589 non-null int64 8 Weekday 11589 non-null object 9 Festival Religion 11589 non-null object 10 Working Day 11589 non-null object 11 Holiday Sequence 11589 non-null object dtypes: int64(6), object(6) memory usage: 1.1+ MB
Data Cleaning
# Rename the raw CSV headers to short snake_case names so the columns are
# easier to reference. The three amount columns carry a "($)" suffix; every
# later cell must use these exact names.
atm = atm.rename(columns={
    'ATM Name': 'atm_name',
    'Transaction Date': 'transaction_date',
    'No Of Withdrawals': 'no_of_withdrawals',
    'No Of XYZ Card Withdrawals': 'xyz_card_withdrawals',
    'No Of Other Card Withdrawals': 'other_card_withdrawals',
    'Total amount Withdrawn': 'total_amt_withdrawn($)',
    'Amount withdrawn XYZ Card': 'amt_withdrawn_xyz($)',
    'Amount withdrawn Other Card': 'amt_withdrawn_other($)',
    'Weekday': 'wkday',
    # NOTE(review): the source header says "Religion" but the new name says
    # "region" -- looks like a typo. Left unchanged in case later cells use it.
    'Festival Religion': 'festival_region',
    'Working Day': 'working_day',
    'Holiday Sequence': 'holiday_sequence',
})

# Parse the day-first "DD-MM-YYYY" strings straight into datetime64 values.
# Giving the exact format removes any day/month ambiguity and makes a separate
# per-row '-' -> '/' string replacement unnecessary.
atm['transaction_date'] = pd.to_datetime(atm['transaction_date'], format='%d-%m-%Y')

# Weekday names arrive upper-case (e.g. "FRIDAY"); convert them to title case.
atm['wkday'] = atm['wkday'].str.title()
import functions as fn
# Distribution plots via the project-local `functions` helper module.
# NOTE(review): fn.hist presumably takes (frame, column or list of columns, title)
# -- confirm against functions.py.
fn.hist(atm, 'no_of_withdrawals', 'Distribution of Number of Withdrawals')
fn.hist(
    atm,
    ['xyz_card_withdrawals', 'other_card_withdrawals'],
    'Distribution of Number of Withdrawals by Card Type',
)
fn.hist(
    atm,
    ['amt_withdrawn_xyz($)', 'amt_withdrawn_other($)'],
    'Amount Withdrawn by Card type',
)

# Withdrawal counts per ATM, normalised to percent.
fig = px.histogram(
    atm,
    x='atm_name',
    y='no_of_withdrawals',
    histnorm='percent',
    text_auto='.1f',
    width=800,
    title='Number of withdrawals in percent for each ATM',
)
# Values are printed on the bars, so the y tick labels are hidden.
fig.update_yaxes(showticklabels=False)

# Average total amount withdrawn per ATM (title wording fixed: "in from" -> "from").
fig = px.histogram(
    atm,
    x='atm_name',
    y='total_amt_withdrawn($)',
    histfunc='avg',
    text_auto='.4s',
    width=800,
    title='Average amount withdrawn from each ATM',
)
fig.update_yaxes(showticklabels=False)
Sweetviz is used below for a brief EDA on the dataset.
import sweetviz as sv
# Run Sweetviz over the cleaned frame; the resulting report object is rendered below.
report = sv.analyze(atm)
| | [ 0%] 00:00 -> (? left)
# Write the report to SweetvizEDA.html; open_browser=False keeps it from opening a browser.
report.show_html('SweetvizEDA.html', open_browser=False)
Report SweetvizEDA.html was generated.
# Show the report inside the notebook at 60% scale in the widescreen layout.
# NOTE(review): h is passed as a string ('1400') while w is an int (1200) -- confirm
# sweetviz accepts both forms, or make them consistent.
report.show_notebook(scale=0.6, layout='widescreen', h='1400', w=1200)
# Calendar order for the x-axis; reindexing with this list replaces the
# alphabetical order that groupby produces.
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Total every numeric column per weekday. numeric_only=True is required on
# pandas >= 2.0, which no longer drops non-numeric columns silently: summing
# the datetime `transaction_date` column would raise TypeError.
wk_grp = (
    atm.groupby('wkday')
    .sum(numeric_only=True)
    .reindex(days)
    .reset_index()
)

# Withdrawal counts per weekday, one bar per card type.
fig = px.bar(
    wk_grp,
    x='wkday',
    y=['xyz_card_withdrawals', 'other_card_withdrawals'],
    width=800,
    barmode='group',
    text_auto='.3s',
    title='Card withdrawals by Weekday for each Card type',
)
fig.update_traces(textposition='outside')
fig.update_yaxes(showticklabels=False)

# Amounts withdrawn per weekday. The amount columns were renamed with a "($)"
# suffix during cleaning, so those exact names must be used here.
fig = px.bar(
    wk_grp,
    x='wkday',
    y=['amt_withdrawn_xyz($)', 'amt_withdrawn_other($)'],
    width=800,
    barmode='group',
    text_auto='.3s',
    title='Amount withdrawn by Weekday for each Card type',
)
fig.update_traces(textposition='outside')
fig.update_yaxes(showticklabels=False)
# Total every numeric column per ATM. numeric_only=True keeps the datetime and
# string columns out of the sum, which pandas >= 2.0 would otherwise try to
# add up (raising TypeError on `transaction_date`).
atm_grp = atm.groupby('atm_name').sum(numeric_only=True).reset_index()

# Withdrawal counts per ATM, one bar per card type, values printed above the bars.
fig = px.bar(
    atm_grp,
    x='atm_name',
    y=['xyz_card_withdrawals', 'other_card_withdrawals'],
    barmode='group',
    width=900,
    text_auto='.3s',
    title='Card withdrawals from each Atm',
)
fig.update_traces(textposition='outside')
fig.update_yaxes(showticklabels=False)
from pandas_profiling import ProfileReport
# Build a profiling report of the cleaned frame, titled 'Pandas Profiling'.
profile = ProfileReport(atm, title='Pandas Profiling')
# Render the report as notebook widgets.
profile.to_widgets()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render widgets: 0%| | 0/1 [00:00<?, ?it/s]
VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…
# Render the same report as HTML embedded in the notebook output.
profile.to_notebook_iframe()
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Number of withdrawals over time, all ATMs on one chart, coloured by ATM.
px.line(
    data_frame=atm,
    x='transaction_date',
    y='no_of_withdrawals',
    color='atm_name',
    width=1000,
)

# The same series split into one small panel per ATM, two panels per row.
px.line(
    data_frame=atm,
    x='transaction_date',
    y='no_of_withdrawals',
    facet_col='atm_name',
    facet_col_wrap=2,
    height=800,
    width=700,
)
# Daily totals across all ATMs, indexed by transaction date.
# Only numeric columns are kept before summing: on pandas >= 2.0 the string
# columns (ATM name, weekday, ...) are no longer dropped automatically, and
# adding them up is meaningless. select_dtypes is used because it behaves the
# same on old and new pandas versions.
new_df = (
    atm.set_index('transaction_date')
    .select_dtypes(include='number')
    .resample('D')
    .sum()
)

# Total number of withdrawals per day.
px.line(x=new_df.index, y=new_df['no_of_withdrawals'])
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8' and later
# removed the old 'seaborn' alias, so pick whichever name this version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Daily withdrawal counts: XYZ card (blue, dashed) vs. other cards (red, dashed).
plt.figure(figsize=(10, 6))
plt.plot(new_df.index, new_df['xyz_card_withdrawals'], '--', c='b')
plt.plot(new_df.index, new_df['other_card_withdrawals'], '--', c='r')
plt.xlabel('Year')
plt.ylabel('No. of Withdrawals')
# Legend entries follow the order in which the lines were plotted.
plt.legend(['XYZ Card', 'Other Card'])
plt.show()
# Daily amount withdrawn: XYZ card (blue, dashed) vs. other cards (red, dashed).
# The amount columns were renamed with a "($)" suffix during cleaning, so they
# must be read with bracket indexing -- attribute access cannot spell that name.
plt.figure(figsize=(10, 6))
plt.plot(new_df.index, new_df['amt_withdrawn_xyz($)'], '--', c='b')
plt.plot(new_df.index, new_df['amt_withdrawn_other($)'], '--', c='r')
plt.xlabel('Year')
plt.ylabel('Amount Withdrawn')  # label typo fixed ("Withdrawan")
# Legend entries follow the order in which the lines were plotted.
plt.legend(['XYZ Card', 'Other Card'])
plt.show()
# Roll the daily totals up to month-end buckets.
monthly = new_df.resample('M').sum()

# Monthly withdrawal counts, one solid line per card type.
plt.figure(figsize=(10, 6))
for column, colour in (('xyz_card_withdrawals', 'b'), ('other_card_withdrawals', 'r')):
    plt.plot(monthly.index, monthly[column], c=colour)
plt.xlabel('Year')
plt.ylabel('No. of Withdrawals')
# Legend entries follow the order in which the lines were plotted.
plt.legend(['XYZ Card', 'Other Card'])
plt.show()
no_of_withdrawals | xyz_card_withdrawals | other_card_withdrawals | total_amt_withdrawn | amt_withdrawn_xyz | amt_withdrawn_other | |
---|---|---|---|---|---|---|
transaction_date | ||||||
2011-01-31 | 18012 | 10924 | 7088 | 64819400 | 44459200 | 20360200 |
2011-02-28 | 15637 | 9968 | 5669 | 57395900 | 40790200 | 16605700 |
2011-03-31 | 17727 | 11797 | 5930 | 66051300 | 48823000 | 17228300 |
2011-04-30 | 15784 | 9778 | 6006 | 59207400 | 40749000 | 18458400 |
2011-05-31 | 17154 | 10306 | 6848 | 65967100 | 43767100 | 22200000 |
# Monthly amount withdrawn: XYZ card (blue) vs. other cards (red).
# The amount columns were renamed with a "($)" suffix during cleaning, so they
# must be read with bracket indexing -- attribute access cannot spell that name.
plt.figure(figsize=(10, 6))
plt.plot(monthly.index, monthly['amt_withdrawn_xyz($)'], c='b')
plt.plot(monthly.index, monthly['amt_withdrawn_other($)'], c='r')
plt.xlabel('Year')
plt.ylabel('Amount Withdrawn')  # label typo fixed ("Withdrawan")
# Legend entries follow the order in which the lines were plotted.
plt.legend(['XYZ Card', 'Other Card'])
plt.show()
# Plot a 10% random sample to keep the scatter readable. A fixed random_state
# makes the figure reproducible between runs; sample() already returns a new
# frame, so no extra copy is needed.
atm1 = atm.sample(frac=0.1, random_state=42)
plt.scatter(x=atm1['xyz_card_withdrawals'], y=atm1['other_card_withdrawals'])
<matplotlib.collections.PathCollection at 0x2a708696a30>
# Pairwise relationships between the count and amount columns, coloured by ATM.
# The amount columns were renamed with a "($)" suffix during cleaning, so those
# exact names must be passed in `vars`.
g = sb.PairGrid(
    data=atm,
    vars=[
        'no_of_withdrawals',
        'xyz_card_withdrawals',
        'other_card_withdrawals',
        'total_amt_withdrawn($)',
        'amt_withdrawn_xyz($)',
        'amt_withdrawn_other($)',
    ],
    hue='atm_name',
)
g = g.map_offdiag(sb.scatterplot)  # scatter plots off the diagonal
g.map_diag(plt.hist)               # histograms on the diagonal
g.add_legend();
# Pairwise relationships without a hue: scatter plots above the diagonal and
# KDE plots on and below it.
# The amount columns were renamed with a "($)" suffix during cleaning, so those
# exact names must be passed in `vars`.
g = sb.PairGrid(
    data=atm,
    vars=[
        'no_of_withdrawals',
        'xyz_card_withdrawals',
        'other_card_withdrawals',
        'total_amt_withdrawn($)',
        'amt_withdrawn_xyz($)',
        'amt_withdrawn_other($)',
    ],
)
g.map_upper(sb.scatterplot)
g.map_lower(sb.kdeplot)
# No hue variable is set on this grid, so there are no legend entries to show
# and the former add_legend() call was dropped. The trailing semicolon keeps
# the notebook from echoing the grid's repr.
g.map_diag(sb.kdeplot);