#!/usr/bin/env python
# coding: utf-8
"""SQL Translation to Python.

A pandas cheat sheet showing the pandas equivalent of common SQL
statements, using the OurAirports open data set
(data available via https://ourairports.com/data/).

NOTE: this is a notebook export run as a script; bare expressions that
displayed a result in the notebook are kept as harmless no-ops.
"""

import pandas as pd

# Local copies may be used instead of the live data:
# airports = pd.read_csv('./data/airports.csv')
# airport_freq = pd.read_csv('./data/airport-frequencies.csv')
# runways = pd.read_csv('./data/runways.csv')
airports = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airports.csv')
airport_freq = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airport-frequencies.csv')
runways = pd.read_csv('https://davidmegginson.github.io/ourairports-data/runways.csv')

# ## Order of SQL executions
#
# | Order | Clause    | Function                                  |
# | ----- | --------- | ----------------------------------------- |
# | 1     | FROM      | Tables are joined to get the base data.   |
# | 2     | WHERE     | The base data is filtered.                |
# | 3     | GROUP BY  | The filtered base data is grouped.        |
# | 4     | HAVING    | The grouped base data is filtered.        |
# | 5     | SELECT    | The final data is returned.               |
# | 6     | ORDER BY  | The final data is sorted.                 |
# | 7     | LIMIT     | The returned data is limited to row count |

# ## SELECT, WHERE, DISTINCT, LIMIT

# select * from airports
airports

# select * from airports limit 3
airports.head(3)

# select id from airports where ident = 'KLAX'
airports[airports.ident == 'KLAX'].id
# or, using bracket (column-label) access:
airports[airports['ident'] == 'KLAX'].id

# select distinct type from airports
airports.type.unique()
airports['type'].unique()

# ## SELECT with multiple conditions

# select * from airports where iso_region = 'US-CA' and type = 'seaplane_base'
airports[(airports['iso_region'] == 'US-CA') & (airports['type'] == 'seaplane_base')]  # boolean mask

# select ident, name, municipality from airports
#   where iso_region = 'US-CA' and type = 'large_airport'
# Passing a list to __getitem__ selects multiple columns:
# https://stackoverflow.com/questions/11285613/selecting-multiple-columns-in-a-pandas-dataframe
airports[(airports['iso_region'] == 'US-CA')
         & (airports['type'] == 'large_airport')][['ident', 'name', 'municipality']]

# A list of labels returns a DataFrame; a single label returns a Series.
a = airports[(airports['iso_region'] == 'US-CA') & (airports['type'] == 'large_airport')][['ident']]
b = airports[(airports['iso_region'] == 'US-CA') & (airports['type'] == 'large_airport')]['ident']
type(a)  # pandas.core.frame.DataFrame
type(b)  # pandas.core.series.Series

# ## ORDER BY

# select * from airport_freq where airport_ident = 'KLAX' order by type
airport_freq[airport_freq['airport_ident'] == 'KLAX'].sort_values('type')
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type')

# select * from airport_freq where airport_ident = 'KLAX' order by type desc
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type', ascending=False)

# ## IN ... NOT IN

# select * from airports where type in ('heliport', 'balloonport')
airports[airports.type.isin(['heliport', 'balloonport'])]  # isin() needs a list-like
# airports[airports.type.isin('heliport', 'balloonport')]  # TypeError

# select * from airports where type not in ('heliport', 'balloonport')
airports[~airports.type.isin(['heliport', 'balloonport'])]

# ## GROUP BY, COUNT, ORDER BY

# select iso_country, type, count(*) from airports
#   group by iso_country, type order by iso_country, type
airports.groupby(['iso_country', 'type']).size()  # as usual, pass a list to df.groupby()

# select iso_country, type, count(*) from airports
#   group by iso_country, type order by iso_country, count(*) desc
type(airports.groupby(['iso_country', 'type']).size())  # pandas.core.series.Series
type(airports.groupby(['iso_country', 'type']).size().to_frame('size'))  # pandas.core.frame.DataFrame
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(['iso_country', 'size'])
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(
    ['iso_country', 'size'], ascending=[True, False])
# reset_index() turns the group keys back into regular columns:
airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(
    ['iso_country', 'size'], ascending=[True, False])
type(airports.groupby(['iso_country', 'type']))  # pandas.core.groupby.generic.DataFrameGroupBy

# ## Top N records

# First, create a dataframe of unique airports in each country
# (filtering out closed airports).
by_country = (airports[~airports.type.isin(['closed'])]
              .groupby('iso_country')['name']
              .nunique()
              .to_frame('size')
              .reset_index())
by_country.head(3)

# select iso_country from by_country order by size desc limit 10
by_country.nlargest(10, columns='size')
# alternative:
by_country.sort_values('size', ascending=False).head(10)
# by_country.sort_values('size', ascending=False).iso_country.head(10)

# select iso_country from by_country order by size desc limit 10 offset 10
by_country.nlargest(20, columns='size').tail(10)

# Followup: try to add rank per group (wait till proper data is ready):
# https://stackoverflow.com/questions/26720916/faster-way-to-rank-rows-in-subgroups-in-pandas-dataframe
# df["rank"] = df.groupby("group_ID")["value"].rank("dense", ascending=False)

# ## Aggregate functions (MIN, MAX, MEAN)

runways.head(3)

# Calculate min, max, mean, and median length of a runway:
# select max(length_ft), min(length_ft), avg(length_ft), median(length_ft) from runways
runways.agg({'length_ft': ['max', 'min', 'mean', 'median']})
runways.agg({'length_ft': ['max', 'min', 'mean', 'median']}).T  # .T is shorthand for .transpose()

# ## JOIN

# inner join:
# select airport_ident, type, description, frequency_mhz
#   from airport_freq join airports on airport_freq.airport_ref = airports.id
#   where airports.ident = 'KLAX'
airports[airports.ident == 'KLAX'][['id']]
airport_freq.merge(airports[airports.ident == 'KLAX'][['id']],
                   left_on='airport_ref',
                   right_on='id',
                   how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]

# ## UNION ALL and UNION

# select name, municipality from airports where ident = 'KLAX'
#   union all
# select name, municipality from airports where ident = 'KLGB'
airports[airports.ident == 'KLAX'][['name', 'municipality']]
airports[airports.ident == 'KLGB'][['name', 'municipality']]
# pd.concat() along the default axis=0 is UNION ALL (axis=1 would be a "join"):
pd.concat([
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
    airports[airports.ident == 'KLGB'][['name', 'municipality']],
])

# union all keeps duplicate rows:
pd.concat([
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
])
# union removes them:
pd.concat([
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
    airports[airports.ident == 'KLAX'][['name', 'municipality']],
]).drop_duplicates()

# ## INSERT

# create table heroes (id integer, name text);
# insert into heroes values (1, 'Harry Potter');
# insert into heroes values (2, 'Ron Weasley');
# insert into heroes values (3, 'Hermione Granger');
df1 = pd.DataFrame([[1, 'Harry Potter'], [2, 'Ron Weasley']], columns=['id', 'name'])
# df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']})
df1
df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']})
df2
pd.concat([df1, df2]).reset_index(drop=True)  # try drop=False

# ## UPDATE

# update airports set home_link = 'http://www.lawa.org/welcomelax.aspx' where ident = 'KLAX';
airports[airports.ident == 'KLAX'][['home_link']]
type(airports[airports.ident == 'KLAX'][['home_link']])  # DataFrame (list of labels)
type(airports[airports.ident == 'KLAX']['home_link'])    # Series (single label)

# .loc  - selects subsets of rows and columns by label only
# .iloc - selects subsets of rows and columns by integer location only
# .at   - selects a single scalar value in the DataFrame by label only
# .iat  - selects a single scalar value in the DataFrame by integer location only
airports.loc[airports['ident'] == 'KLAX', 'home_link']
# airports.at[airports['ident'] == 'KLAX', 'home_link']  # must use an index label in df.at[]
airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'
airports[airports.ident == 'KLAX'][['home_link']]

# ## DELETE

airports2 = airports.copy()
lax_freq = airport_freq[airport_freq.airport_ident == 'KLAX'].drop_duplicates()
lax_freq
lax_freq2 = lax_freq.copy()  # make a copy for the second option below

# delete from lax_freq where type = 'MISC'

# option 1: assign the DataFrame to a filtered version of itself:
lax_freq = lax_freq[lax_freq.type != 'MISC']
lax_freq

# option 2: drop the matching rows by index.  Note drop() returns a NEW
# frame; the result below is displayed but not assigned, so lax_freq2
# itself is left unchanged:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
lax_freq2[lax_freq2.type == 'MISC'].index
lax_freq2.drop(lax_freq2[lax_freq2.type == 'MISC'].index)

# ## Immutability
#
# By default, most operators applied to a pandas DataFrame return a new
# object.  Some operators accept a parameter inplace=True, so you can
# work with the original dataframe instead.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html
#   - inplace: bool, default False
#     Modify the DataFrame in place (do not create a new object).