#!/usr/bin/env python
# coding: utf-8
"""SQL Translation to Python.

A pandas cheat sheet showing the pandas equivalent of common SQL
statements, using the OurAirports open data set
(data available via https://ourairports.com/data/).

NOTE: this is a notebook export run as a script; bare expressions that
displayed a result in the notebook are kept as harmless no-ops.
"""

import pandas as pd

# Local copies may be used instead of the live data:
# airports = pd.read_csv('./data/airports.csv')
# airport_freq = pd.read_csv('./data/airport-frequencies.csv')
# runways = pd.read_csv('./data/runways.csv')
airports = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airports.csv')
airport_freq = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airport-frequencies.csv')
runways = pd.read_csv('https://davidmegginson.github.io/ourairports-data/runways.csv')

# ## Order of SQL executions
#
# | Order | Clause    | Function                                  |
# | ----- | --------- | ----------------------------------------- |
# | 1     | FROM      | Tables are joined to get the base data.   |
# | 2     | WHERE     | The base data is filtered.                |
# | 3     | GROUP BY  | The filtered base data is grouped.        |
# | 4     | HAVING    | The grouped base data is filtered.        |
# | 5     | SELECT    | The final data is returned.               |
# | 6     | ORDER BY  | The final data is sorted.                 |
# | 7     | LIMIT     | The returned data is limited to row count |

# ## SELECT, WHERE, DISTINCT, LIMIT

# select * from airports
airports

# select * from airports limit 3
airports.head(3)

# select id from airports where ident = 'KLAX'
airports[airports.ident == 'KLAX'].id
# or, using bracket (column-label) access:
airports[airports['ident'] == 'KLAX'].id

# select distinct type from airports
airports.type.unique()
airports['type'].unique()

# ## SELECT with multiple conditions

# select * from airports where iso_region = 'US-CA' and type = 'seaplane_base'
airports[(airports['iso_region'] == 'US-CA') & (airports['type'] == 'seaplane_base')]  # boolean mask

# select ident, name, municipality from airports
#   where iso_region = 'US-CA' and type = 'large_airport'
# Passing a list to __getitem__ selects multiple columns:
# https://stackoverflow.com/questions/11285613/selecting-multiple-columns-in-a-pandas-dataframe
airports[(airports['iso_region'] == 'US-CA')
         & (airports['type'] == 'large_airport')][['ident', 'name', 'municipality']]

# A list of labels returns a DataFrame; a single label returns a Series.
a = airports[(airports['iso_region'] == 'US-CA') & (airports['type'] == 'large_airport')][['ident']]
b = airports[(airports['iso_region'] == 'US-CA') & (airports['type'] == 'large_airport')]['ident']
type(a)  # pandas.core.frame.DataFrame
type(b)  # pandas.core.series.Series

# ## ORDER BY

# select * from airport_freq where airport_ident = 'KLAX' order by type
airport_freq[airport_freq['airport_ident'] == 'KLAX'].sort_values('type')
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type')

# select * from airport_freq where airport_ident = 'KLAX' order by type desc
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type', ascending=False)

# ## IN ... NOT IN

# select * from airports where type in ('heliport', 'balloonport')
airports[airports.type.isin(['heliport', 'balloonport'])]  # isin() needs a list-like
# airports[airports.type.isin('heliport', 'balloonport')]  # TypeError

# select * from airports where type not in ('heliport', 'balloonport')
airports[~airports.type.isin(['heliport', 'balloonport'])]

# ## GROUP BY, COUNT, ORDER BY

# select iso_country, type, count(*) from airports
#   group by iso_country, type order by iso_country, type
airports.groupby(['iso_country', 'type']).size()  # as usual, pass a list to df.groupby()

# select iso_country, type, count(*) from airports
#   group by iso_country, type order by iso_country, count(*) desc
type(airports.groupby(['iso_country', 'type']).size())  # pandas.core.series.Series
type(airports.groupby(['iso_country', 'type']).size().to_frame('size'))  # pandas.core.frame.DataFrame
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(['iso_country', 'size'])
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(
    ['iso_country', 'size'], ascending=[True, False])
# reset_index() turns the group keys back into regular columns:
airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(
    ['iso_country', 'size'], ascending=[True, False])
type(airports.groupby(['iso_country', 'type']))  # pandas.core.groupby.generic.DataFrameGroupBy

# ## Top N records

# First, create a dataframe of unique airports in each country
# (filtering out closed airports).
by_country = (airports[~airports.type.isin(['closed'])]
              .groupby('iso_country')['name']
              .nunique()
              .to_frame('size')
              .reset_index())
by_country.head(3)

# select iso_country from by_country order by size desc limit 10
by_country.nlargest(10, columns='size')
# alternative:
by_country.sort_values('size', ascending=False).head(10)
# by_country.sort_values('size', ascending=False).iso_country.head(10)

# select iso_country from by_country order by size desc limit 10 offset 10
by_country.nlargest(20, columns='size').tail(10)

# Followup: try to add rank per group (wait till proper data is ready):
# https://stackoverflow.com/questions/26720916/faster-way-to-rank-rows-in-subgroups-in-pandas-dataframe
# df["rank"] = df.groupby("group_ID")["value"].rank("dense", ascending=False)

# ## Aggregate functions (MIN, MAX, MEAN)

runways.head(3)

# Calculate min, max, mean, and median length of a runway:
# select max(length_ft), min(length_ft), avg(length_ft), median(length_ft) from runways
runways.agg({'length_ft': ['max', 'min', 'mean', 'median']})
runways.agg({'length_ft': ['max', 'min', 'mean', 'median']}).T  # .T is shorthand for .transpose()

# ## JOIN

# inner join:
# select airport_ident, type, description, frequency_mhz
#   from airport_freq join airports on airport_freq.airport_ref = airports.id
#   where airports.ident = 'KLAX'
airports[airports.ident == 'KLAX'][['id']]
airport_freq.merge(airports[airports.ident == 'KLAX'][['id']],
                   left_on='airport_ref',
                   right_on='id',
                   how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]

# ## UNION ALL and UNION

# select name, municipality from airports where ident = 'KLAX'
#   union all
# select name, municipality from airports where ident = 'KLGB'
airports[airports.ident == 'KLAX'][['name', 'municipality']]
airports[airports.ident == 'KLGB'][['name', 'municipality']]
# pd.concat() along the default axis=0 is UNION ALL (axis=1 would be a "join"):
pd.concat([
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
    airports[airports.ident == 'KLGB'][['name', 'municipality']],
])

# union all keeps duplicate rows:
pd.concat([
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
])
# union removes them:
pd.concat([
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
]).drop_duplicates()

# ## INSERT

# create table heroes (id integer, name text);
# insert into heroes values (1, 'Harry Potter');
# insert into heroes values (2, 'Ron Weasley');
# insert into heroes values (3, 'Hermione Granger');
df1 = pd.DataFrame([[1, 'Harry Potter'], [2, 'Ron Weasley']], columns=['id', 'name'])
# df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']})
df1
df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']})
df2
pd.concat([df1, df2]).reset_index(drop=True)  # try drop=False

# ## UPDATE

# update airports set home_link = 'http://www.lawa.org/welcomelax.aspx' where ident = 'KLAX';
airports[airports.ident == 'KLAX'][['home_link']]
type(airports[airports.ident == 'KLAX'][['home_link']])  # DataFrame (list of labels)
type(airports[airports.ident == 'KLAX']['home_link'])    # Series (single label)

# .loc  - selects subsets of rows and columns by label only
# .iloc - selects subsets of rows and columns by integer location only
# .at   - selects a single scalar value in the DataFrame by label only
# .iat  - selects a single scalar value in the DataFrame by integer location only
airports.loc[airports['ident'] == 'KLAX', 'home_link']
# airports.at[airports['ident'] == 'KLAX', 'home_link']  # must use an index label in df.at[]
airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'
airports[airports.ident == 'KLAX'][['home_link']]

# ## DELETE

airports2 = airports.copy()
lax_freq = airport_freq[airport_freq.airport_ident == 'KLAX'].drop_duplicates()
lax_freq
lax_freq2 = lax_freq.copy()  # make a copy for the second option below

# delete from lax_freq where type = 'MISC'

# option 1: assign the DataFrame to a filtered version of itself:
lax_freq = lax_freq[lax_freq.type != 'MISC']
lax_freq

# option 2: drop the matching rows by index.  Note drop() returns a NEW
# frame; the result below is displayed but not assigned, so lax_freq2
# itself is left unchanged:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
lax_freq2[lax_freq2.type == 'MISC'].index
lax_freq2.drop(lax_freq2[lax_freq2.type == 'MISC'].index)

# ## Immutability
#
# By default, most operators applied to a pandas DataFrame return a new
# object.  Some operators accept a parameter inplace=True, so you can
# work with the original dataframe instead.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
#   - inplace: bool, default False
#     Modify the DataFrame in place (do not create a new object).