#!/usr/bin/env python # coding: utf-8 # In[1]: import os, math, subprocess import pandas as pd import numpy as np import matplotlib.pyplot as plt # some settings for displaying Pandas results pd.set_option('display.width', 2000) pd.set_option('display.max_rows', 500) pd.set_option('display.max_columns', 500) pd.set_option('display.precision', 4) pd.set_option('display.max_colwidth', -1) # In[2]: # load data data_path = "home-credit-default-risk/application_train.csv" pdf_data = pd.read_csv(data_path) # # SELECT, WHERE, DISTINCT, LIMIT # In[3]: print(""" SELECT * FROM pdf_data LIMIT 3 """) pdf_data.head(3) # In[4]: print(""" SELECT NAME_CONTRACT_TYPE FROM pdf_data WHERE CODE_GENDER = 'M' """) pdf_data[pdf_data["CODE_GENDER"] == 'M'][["NAME_CONTRACT_TYPE"]].head() # In[5]: print(""" SELECT DISTINCT NAME_CONTRACT_TYPE FROM pdf_data """) pdf_data["NAME_CONTRACT_TYPE"].unique() # # SELECT with multiple conditions # In[6]: print(""" SELECT NAME_INCOME_TYPE, CODE_GENDER, AMT_INCOME_TOTAL FROM pdf_data WHERE CODE_GENDER = 'M' AND AMT_INCOME_TOTAL > 200000.0 """) condition = (pdf_data["CODE_GENDER"] == 'M') & (pdf_data["AMT_INCOME_TOTAL"] > 200000.0) pdf_data[condition][["NAME_INCOME_TYPE", "CODE_GENDER", "AMT_INCOME_TOTAL"]].head() # # ORDER BY # In[7]: print(""" SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL FROM pdf_data ORDER BY AMT_INCOME_TOTAL """) pdf_data[["NAME_INCOME_TYPE", "AMT_INCOME_TOTAL"]].sort_values("AMT_INCOME_TOTAL").head() # In[8]: print(""" SELECT NAME_INCOME_TYPE, AMT_INCOME_TOTAL FROM pdf_data ORDER BY AMT_INCOME_TOTAL DESC """) pdf_data[["NAME_INCOME_TYPE", "AMT_INCOME_TOTAL"]].sort_values("AMT_INCOME_TOTAL", ascending=False).head() # # IN… NOT IN # In[9]: print(""" SELECT * FROM pdf_data WHERE SK_ID_CURR IN (100002, 100010, 100011) """) condition = pdf_data["SK_ID_CURR"].isin([100002, 100010, 100011]) pdf_data[condition].head() # In[10]: print(""" SELECT * FROM pdf_data WHERE SK_ID_CURR NOT IN (100002, 100010, 100011) """) condition = ~pdf_data["SK_ID_CURR"].isin([100002, 100010, 100011]) pdf_data[condition].head() # # GROUP BY, COUNT, ORDER BY # In[11]: print(""" SELECT CODE_GENDER, COUNT(TARGET) FROM pdf_data GROUP BY CODE_GENDER ORDER BY NUM_TARGET """) pdf_data.groupby("CODE_GENDER").agg({"TARGET": "count"}).sort_values("TARGET") # # Top N records # In[12]: print(""" SELECT AMT_INCOME_TOTAL FROM pdf_data ORDER BY NUM_TARGET DESC LIMIT 5 """) pdf_data.nlargest(5, columns="AMT_INCOME_TOTAL") # In[13]: print(""" SELECT AMT_INCOME_TOTAL FROM pdf_data ORDER BY NUM_TARGET DESC LIMIT 10 OFFSET 5 """) pdf_data.nlargest(10, columns="AMT_INCOME_TOTAL").tail(5) # # Aggregate functions (MIN, MAX, MEAN) # In[14]: print(""" SELECT MAX(AMT_INCOME_TOTAL), MIN(AMT_INCOME_TOTAL), MEAN(AMT_INCOME_TOTAL) FROM pdf_data """) pdf_data.agg({"AMT_INCOME_TOTAL": ["max", "min", "mean"]}).transpose() # # JOIN # # # - (INNER) JOIN: Returns records that have matching values in both tables # - LEFT (OUTER) JOIN: Return all records from the left table, and the matched records from the right table # - RIGHT (OUTER) JOIN: Return all records from the right table, and the matched records from the left table # - FULL (OUTER) JOIN: Return all records when there is a match in either left or right table # # In[15]: print(""" SELECT * FROM pdf1 INNER JOIN pdf2 ON pdf1.SK_ID_CURR = pdf2.SK_ID_CURR """) pdf1 = pdf_data[["SK_ID_CURR", "AMT_INCOME_TOTAL"]] pdf2 = pdf_data[["SK_ID_CURR", "CODE_GENDER", "FLAG_OWN_CAR"]] pdf1.merge(pdf2, on="SK_ID_CURR", how="inner").head() # # UNION ALL and UNION # In[16]: print(""" SELECT * FROM pdf1 UNION ALL SELECT * FROM pdf2 """) pdf1 = pdf_data[["CODE_GENDER", "FLAG_OWN_CAR"]] pdf2 = pdf_data[["CODE_GENDER", "FLAG_OWN_CAR"]] print("Union all:", pd.concat([pdf1, pdf2]).shape) print("Union:", pd.concat([pdf1, pdf2]).drop_duplicates().shape) # # INSERT # # ``` # INSERT INTO table_name (column1, column2, column3, ...) # VALUES (value1, value2, value3, ...); # ``` # In[17]: df1 = pd.DataFrame({'id': [1, 2], 'name': ['Harry Potter', 'Ron Weasley']}) df2 = pd.DataFrame({'id': [3], 'name': ['Hermione Granger']}) df3 = pd.concat([df1, df2]).reset_index(drop=True) df3 # # UPDATE # # ``` # UPDATE table_name # SET column1 = value1, column2 = value2, ... # WHERE condition; # ``` # In[18]: df3.loc[df3["id"] == 2, "name"] = "Ron" df3 # # DELETE # # ``` # DELETE FROM table_name WHERE condition; # ``` # In[19]: df3.drop(df3[df3["name"] == "Ron"].index, inplace=True) df3