# Data available via https://ourairports.com/data/
# (mirrored at https://davidmegginson.github.io/ourairports-data/).
import pandas as pd

# To work offline, download the CSVs and load local copies instead:
# airports = pd.read_csv('./data/airports.csv')
# airport_freq = pd.read_csv('./data/airport-frequencies.csv')
# runways = pd.read_csv('./data/runways.csv')
airports = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airports.csv')
airport_freq = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airport-frequencies.csv')
runways = pd.read_csv('https://davidmegginson.github.io/ourairports-data/runways.csv')

# SQL logical order of evaluation:
# Order | Clause   | Function
# ------|----------|------------------------------------------------
#   1   | FROM     | Tables are joined to get the base data.
#   2   | WHERE    | The base data is filtered.
#   3   | GROUP BY | The filtered base data is grouped.
#   4   | HAVING   | The grouped base data is filtered.
#   5   | SELECT   | The final data is returned.
#   6   | ORDER BY | The final data is sorted.
#   7   | LIMIT    | The returned data is limited to a row count.
# SQL: select * from airports
airports

# SQL: select * from airports limit 3
airports.head(3)

# SQL: select id from airports where ident = 'KLAX'
airports[airports.ident == 'KLAX'].id
# or, with string-key column access:
airports[airports['ident']=='KLAX'].id

# SQL: select distinct type from airports
airports.type.unique()
airports['type'].unique()
# SQL: select * from airports where iso_region = 'US-CA' and type = 'seaplane_base'
# Combine boolean masks with & (each condition parenthesized).
airports[(airports['iso_region']=='US-CA') & (airports['type']=='seaplane_base')]  # mask

# SQL: select ident, name, municipality from airports
#      where iso_region = 'US-CA' and type = 'large_airport'
airports[(airports['iso_region']=='US-CA') & (airports['type']=='large_airport')][['ident', 'name', 'municipality']]
# Passing a list to __getitem__ selects multiple columns:
# https://stackoverflow.com/questions/11285613/selecting-multiple-columns-in-a-pandas-dataframe
a = airports[(airports['iso_region']=='US-CA') & (airports['type']=='large_airport')][['ident']]  # list key
b = airports[(airports['iso_region']=='US-CA') & (airports['type']=='large_airport')]['ident']    # string key
type(a)  # DataFrame
type(b)  # Series
# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type
airport_freq[airport_freq['airport_ident']=='KLAX'].sort_values('type')
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type')  # attribute access, same result

# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type desc
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type', ascending=False)

# SQL: select * from airports where type in ('heliport', 'balloonport')
airports[airports.type.isin(['heliport', 'balloonport'])]  # still, you need to pass a list
# airports[airports.type.isin('heliport', 'balloonport')]  # TypeError

# SQL: select * from airports where type not in ('heliport', 'balloonport')
airports[~airports.type.isin(['heliport', 'balloonport'])]  # ~ negates the mask
# SQL: select iso_country, type, count(*) from airports
#      group by iso_country, type order by iso_country, type
airports.groupby(['iso_country', 'type']).size()  # as usual, pass a list to df.groupby()

# SQL: select iso_country, type, count(*) from airports
#      group by iso_country, type order by iso_country, count(*) desc
# .size() yields a Series; .to_frame() turns it back into a DataFrame:
type(airports.groupby(['iso_country', 'type']).size())  # pandas.core.series.Series
type(airports.groupby(['iso_country', 'type']).size().to_frame('size'))  # pandas.core.frame.DataFrame
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(['iso_country', 'size'])
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(['iso_country', 'size'], ascending=[True, False])
# reset_index() flattens the (iso_country, type) MultiIndex back into columns:
airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])
type(airports.groupby(['iso_country', 'type']))  # pandas.core.groupby.generic.DataFrameGroupBy
# First, create a dataframe of unique airports in each country,
# filtering out closed airports:
by_country = airports[~airports.type.isin(['closed'])].groupby('iso_country')['name'].nunique().to_frame('size').reset_index()
by_country.head(3)

# SQL: select * from by_country order by size desc limit 10
by_country.nlargest(10, columns='size')
# alternative:
by_country.sort_values('size', ascending=False).head(10)
# by_country.sort_values('size', ascending=False).iso_country.head(10)

# SQL: select * from by_country order by size desc limit 10 offset 10
by_country.nlargest(20, columns='size').tail(10)  # rows 11-20
# Follow-up: add a rank per group once suitable data is ready, e.g.:
# https://stackoverflow.com/questions/26720916/faster-way-to-rank-rows-in-subgroups-in-pandas-dataframe
# df["rank"] = df.groupby("group_ID")["value"].rank("dense", ascending=False)
runways.head(3)

# Calculate min, max, mean, and median length of a runway.
# SQL: select max(length_ft), min(length_ft), avg(length_ft), median(length_ft) from runways
runways.agg({'length_ft':['max', 'min', 'mean', 'median']})
runways.agg({'length_ft':['max', 'min', 'mean', 'median']}).T  # or df.transpose()
# runways.agg({'length_ft':['max', 'min', 'mean', 'median']}).transpose()
# Inner join.
# SQL: select airport_ident, type, description, frequency_mhz
#      from airport_freq join airports on airport_freq.airport_ref = airports.id
#      where airports.ident = 'KLAX'
airports[airports.ident == 'KLAX'][['id']]  # the join key we need
# Merging against the full filtered frame also works, but drags every
# airports column into the result:
# airport_freq.merge(airports[airports.ident=='KLAX'],
#                    left_on='airport_ref',
#                    right_on='id')
airport_freq.merge(airports[airports.ident == 'KLAX'][['id']],
                   left_on='airport_ref',
                   right_on='id',
                   how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]
# SQL: select name, municipality from airports where ident = 'KLAX'
#      union all
#      select name, municipality from airports where ident = 'KLGB'
# Use pd.concat() to UNION ALL two dataframes:
airports[airports.ident=='KLAX'][['name', 'municipality']]
airports[airports.ident=='KLGB'][['name', 'municipality']]
pd.concat([
    airports[airports.ident=='KLAX'][['name', 'municipality']],
    airports[airports.ident=='KLGB'][['name', 'municipality']]
])  # axis=1 would be a side-by-side "join" instead

# UNION ALL keeps duplicate rows:
pd.concat([
    airports[airports.ident=='KLAX'][['name', 'municipality']],
    airports[airports.ident=='KLAX'][['name', 'municipality']]
])  # union all

# UNION removes duplicates -- add drop_duplicates():
pd.concat([
    airports[airports.ident=='KLAX'][['name', 'municipality']],
    airports[airports.ident=='KLAX'][['name', 'municipality']]
]).drop_duplicates()  # union
# SQL: create table heroes (id integer, name text);
#      insert into heroes values (1, 'Harry Potter');
#      insert into heroes values (2, 'Ron Weasley');
#      insert into heroes values (3, 'Hermione Granger');
df1 = pd.DataFrame([[1, 'Harry Potter'], [2, 'Ron Weasley']], columns=['id', 'name'])
# equivalent: df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']})
df1
df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']})
df2
pd.concat([df1, df2]).reset_index(drop=True)  # drop=False would keep the old index as a column
# SQL: update airports set home_link = 'http://www.lawa.org/welcomelax.aspx'
#      where ident = 'KLAX';
airports[airports.ident=='KLAX'][['home_link']]
type(airports[airports.ident=='KLAX'][['home_link']])  # DataFrame
type(airports[airports.ident=='KLAX']['home_link'])    # Series
# Chained indexing cannot be assigned to reliably; use .loc instead:
# airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'
#
# Indexer cheat sheet:
#   .loc  - selects subsets of rows and columns by label only
#   .iloc - selects subsets of rows and columns by integer location only
#   .at   - selects a single scalar value in the DataFrame by label only
#   .iat  - selects a single scalar value in the DataFrame by integer location only
airports.loc[airports['ident'] == 'KLAX', 'home_link']
# airports.at[airports['ident'] == 'KLAX', 'home_link']
# (must use an index label in df.at[], not a boolean mask)
airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'
airports[airports.ident=='KLAX'][['home_link']]
airports2 = airports.copy()  # keep an untouched copy of airports
lax_freq = airport_freq[airport_freq.airport_ident=='KLAX'].drop_duplicates()
lax_freq
lax_freq2 = lax_freq.copy()  # make a copy to demo the .drop() alternative

# SQL: delete from lax_freq where type = 'MISC'
# Option 1: assign the DataFrame to a filtered version of itself:
lax_freq = lax_freq[lax_freq.type != 'MISC']
lax_freq
# Option 2: drop the matching rows by index:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
lax_freq2[lax_freq2.type == 'MISC'].index
lax_freq2.drop(lax_freq2[lax_freq2.type == 'MISC'].index)

# By default, most operators applied to a pandas dataframe return a new
# object. Some operators accept a parameter inplace=True, so you can work
# with the original dataframe instead.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html