# Data available via https://ourairports.com/data/
# (mirrored at https://davidmegginson.github.io/ourairports-data/).
import pandas as pd

# To work offline, download the CSVs and load local copies instead:
# airports = pd.read_csv('./data/airports.csv')
# airport_freq = pd.read_csv('./data/airport-frequencies.csv')
# runways = pd.read_csv('./data/runways.csv')
airports = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airports.csv')
airport_freq = pd.read_csv('https://davidmegginson.github.io/ourairports-data/airport-frequencies.csv')
runways = pd.read_csv('https://davidmegginson.github.io/ourairports-data/runways.csv')

# SQL logical order of evaluation:
# Order | Clause   | Function
# ------|----------|------------------------------------------------
#   1   | FROM     | Tables are joined to get the base data.
#   2   | WHERE    | The base data is filtered.
#   3   | GROUP BY | The filtered base data is grouped.
#   4   | HAVING   | The grouped base data is filtered.
#   5   | SELECT   | The final data is returned.
#   6   | ORDER BY | The final data is sorted.
#   7   | LIMIT    | The returned data is limited to a row count.
# SQL: select * from airports
airports

# SQL: select * from airports limit 3
airports.head(3)

# SQL: select id from airports where ident = 'KLAX'
airports[airports.ident == 'KLAX'].id
# or, with string-key column access:
airports[airports['ident']=='KLAX'].id

# SQL: select distinct type from airports
airports.type.unique()
airports['type'].unique()
# SQL: select * from airports where iso_region = 'US-CA' and type = 'seaplane_base'
# Combine boolean masks with & (each condition parenthesized).
airports[(airports['iso_region']=='US-CA') & (airports['type']=='seaplane_base')]  # mask

# SQL: select ident, name, municipality from airports
#      where iso_region = 'US-CA' and type = 'large_airport'
airports[(airports['iso_region']=='US-CA') & (airports['type']=='large_airport')][['ident', 'name', 'municipality']]
# Passing a list to __getitem__ selects multiple columns:
# https://stackoverflow.com/questions/11285613/selecting-multiple-columns-in-a-pandas-dataframe
a = airports[(airports['iso_region']=='US-CA') & (airports['type']=='large_airport')][['ident']]  # list key
b = airports[(airports['iso_region']=='US-CA') & (airports['type']=='large_airport')]['ident']    # string key
type(a)  # DataFrame
type(b)  # Series
# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type
airport_freq[airport_freq['airport_ident']=='KLAX'].sort_values('type')
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type')  # attribute access, same result

# SQL: select * from airport_freq where airport_ident = 'KLAX' order by type desc
airport_freq[airport_freq.airport_ident == 'KLAX'].sort_values('type', ascending=False)

# SQL: select * from airports where type in ('heliport', 'balloonport')
airports[airports.type.isin(['heliport', 'balloonport'])]  # still, you need to pass a list
# airports[airports.type.isin('heliport', 'balloonport')]  # TypeError

# SQL: select * from airports where type not in ('heliport', 'balloonport')
airports[~airports.type.isin(['heliport', 'balloonport'])]  # ~ negates the mask
# SQL: select iso_country, type, count(*) from airports
#      group by iso_country, type order by iso_country, type
airports.groupby(['iso_country', 'type']).size()  # as usual, pass a list to df.groupby()

# SQL: select iso_country, type, count(*) from airports
#      group by iso_country, type order by iso_country, count(*) desc
# .size() yields a Series; .to_frame() turns it back into a DataFrame:
type(airports.groupby(['iso_country', 'type']).size())  # pandas.core.series.Series
type(airports.groupby(['iso_country', 'type']).size().to_frame('size'))  # pandas.core.frame.DataFrame
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(['iso_country', 'size'])
airports.groupby(['iso_country', 'type']).size().to_frame('size').sort_values(['iso_country', 'size'], ascending=[True, False])
# reset_index() flattens the (iso_country, type) MultiIndex back into columns:
airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])
type(airports.groupby(['iso_country', 'type']))  # pandas.core.groupby.generic.DataFrameGroupBy
# First, create a dataframe of unique airports in each country,
# filtering out closed airports:
by_country = airports[~airports.type.isin(['closed'])].groupby('iso_country')['name'].nunique().to_frame('size').reset_index()
by_country.head(3)

# SQL: select * from by_country order by size desc limit 10
by_country.nlargest(10, columns='size')
# alternative:
by_country.sort_values('size', ascending=False).head(10)
# by_country.sort_values('size', ascending=False).iso_country.head(10)

# SQL: select * from by_country order by size desc limit 10 offset 10
by_country.nlargest(20, columns='size').tail(10)  # rows 11-20
# Follow-up: add a rank per group once suitable data is ready, e.g.:
# https://stackoverflow.com/questions/26720916/faster-way-to-rank-rows-in-subgroups-in-pandas-dataframe
# df["rank"] = df.groupby("group_ID")["value"].rank("dense", ascending=False)
runways.head(3)

# Calculate min, max, mean, and median length of a runway.
# SQL: select max(length_ft), min(length_ft), avg(length_ft), median(length_ft) from runways
runways.agg({'length_ft':['max', 'min', 'mean', 'median']})
runways.agg({'length_ft':['max', 'min', 'mean', 'median']}).T  # or df.transpose()
# runways.agg({'length_ft':['max', 'min', 'mean', 'median']}).transpose()
# Inner join.
# SQL: select airport_ident, type, description, frequency_mhz
#      from airport_freq join airports on airport_freq.airport_ref = airports.id
#      where airports.ident = 'KLAX'
airports[airports.ident == 'KLAX'][['id']]  # the join key we need
# Merging against the full filtered frame also works, but drags every
# airports column into the result:
# airport_freq.merge(airports[airports.ident=='KLAX'],
#                    left_on='airport_ref',
#                    right_on='id')
airport_freq.merge(airports[airports.ident == 'KLAX'][['id']],
                   left_on='airport_ref',
                   right_on='id',
                   how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]
# SQL: select name, municipality from airports where ident = 'KLAX'
#      union all
#      select name, municipality from airports where ident = 'KLGB'
# Use pd.concat() to UNION ALL two dataframes:
airports[airports.ident=='KLAX'][['name', 'municipality']]
airports[airports.ident=='KLGB'][['name', 'municipality']]
pd.concat([
    airports[airports.ident=='KLAX'][['name', 'municipality']],
    airports[airports.ident=='KLGB'][['name', 'municipality']]
])  # axis=1 would be a side-by-side "join" instead

# UNION ALL keeps duplicate rows:
pd.concat([
    airports[airports.ident=='KLAX'][['name', 'municipality']],
    airports[airports.ident=='KLAX'][['name', 'municipality']]
])  # union all

# UNION removes duplicates -- add drop_duplicates():
pd.concat([
    airports[airports.ident=='KLAX'][['name', 'municipality']],
    airports[airports.ident=='KLAX'][['name', 'municipality']]
]).drop_duplicates()  # union
# SQL: create table heroes (id integer, name text);
#      insert into heroes values (1, 'Harry Potter');
#      insert into heroes values (2, 'Ron Weasley');
#      insert into heroes values (3, 'Hermione Granger');
df1 = pd.DataFrame([[1, 'Harry Potter'], [2, 'Ron Weasley']], columns=['id', 'name'])
# equivalent: df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']})
df1
df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']})
df2
pd.concat([df1, df2]).reset_index(drop=True)  # drop=False would keep the old index as a column
# SQL: update airports set home_link = 'http://www.lawa.org/welcomelax.aspx'
#      where ident = 'KLAX';
airports[airports.ident=='KLAX'][['home_link']]
type(airports[airports.ident=='KLAX'][['home_link']])  # DataFrame
type(airports[airports.ident=='KLAX']['home_link'])    # Series
# Chained indexing cannot be assigned to reliably; use .loc instead:
# airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'
#
# Indexer cheat sheet:
#   .loc  - selects subsets of rows and columns by label only
#   .iloc - selects subsets of rows and columns by integer location only
#   .at   - selects a single scalar value in the DataFrame by label only
#   .iat  - selects a single scalar value in the DataFrame by integer location only
airports.loc[airports['ident'] == 'KLAX', 'home_link']
# airports.at[airports['ident'] == 'KLAX', 'home_link']
# (must use an index label in df.at[], not a boolean mask)
airports.loc[airports['ident'] == 'KLAX', 'home_link'] = 'http://www.lawa.org/welcomelax.aspx'
airports[airports.ident=='KLAX'][['home_link']]
airports2 = airports.copy()  # keep an untouched copy of airports
lax_freq = airport_freq[airport_freq.airport_ident=='KLAX'].drop_duplicates()
lax_freq
lax_freq2 = lax_freq.copy()  # make a copy to demo the .drop() alternative

# SQL: delete from lax_freq where type = 'MISC'
# Option 1: assign the DataFrame to a filtered version of itself:
lax_freq = lax_freq[lax_freq.type != 'MISC']
lax_freq
# Option 2: drop the matching rows by index:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
lax_freq2[lax_freq2.type == 'MISC'].index
lax_freq2.drop(lax_freq2[lax_freq2.type == 'MISC'].index)

# By default, most operators applied to a pandas dataframe return a new
# object. Some operators accept a parameter inplace=True, so you can work
# with the original dataframe instead.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html