#!/usr/bin/env python
# coding: utf-8

# # Q-Q Plots

# ## Preparation

# In[1]:


import pandas as pd
from scipy.stats import norm, skewnorm, laplace, uniform

from lets_plot import *
LetsPlot.setup_html()


# In[2]:


def plot_matrix(plots=None, width=400, height=300, columns=2):
    """Arrange plots on a fixed-cell grid and show the result.

    Plots are placed left to right, top to bottom: the plot at position
    ``i`` lands in row ``i // columns`` and column ``i % columns``, and each
    cell is ``width`` x ``height``.

    Args:
        plots: Iterable of plots to lay out. ``None`` (the default) is
            treated as "no plots".
        width: Width of one grid cell.
        height: Height of one grid cell.
        columns: Number of cells per row; expected to be a positive integer.

    Returns:
        Whatever ``GGBunch.show()`` returns.
    """
    # A ``None`` sentinel replaces the former mutable ``[]`` default.
    if plots is None:
        plots = ()
    # NOTE(review): GGBunch appears to be deprecated in recent lets-plot
    # releases in favour of gggrid()/ggbunch() - confirm against the
    # lets-plot version this notebook targets.
    bunch = GGBunch()
    for index, plot in enumerate(plots):
        # divmod keeps the arithmetic in integers (no float division).
        row, column = divmod(index, columns)
        bunch.add_plot(plot, column * width, row * height, width, height)
    return bunch.show()


# In[3]:


# Load the dataset and keep the column names used below in variables.
df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv")
print(df.shape)
hwy = "hwy"
cty = "cty"
drv = "drv"
df.head()


# ## Two types of Q-Q plots

# ### `geom_qq()` and `geom_qq_line()` functions.

# In[4]:


(
    ggplot(df, aes(sample=hwy))
    + geom_qq(size=5, color="#3d3d3d", alpha=.3)
    + geom_qq_line(size=1)
    + ggtitle(
        "Distribution of highway miles per gallon",
        "Comparison of sample quantiles with normal distribution quantiles",
    )
)


# ### `geom_qq2()` and `geom_qq2_line()` functions.

# In[5]:


(
    ggplot(df, aes(x=cty, y=hwy))
    + geom_qq2(size=5, color="#3d3d3d", alpha=.3)
    + geom_qq2_line(size=1)
    + ggtitle(
        "City miles vs. highway miles (per gallon)",
        "Comparison of quantiles of two sample distributions",
    )
)


# ## Quick Q-Q plot: the `qq_plot()` function
#
# The 'bistro' module provides a Q-Q plot in which the points and lines of
# both types are combined into a single function with some convenient defaults.
# In[6]:


from lets_plot.bistro.qq import qq_plot


# In[7]:


(
    qq_plot(df, sample=hwy)
    + ggtitle(
        "Distribution of highway miles per gallon",
        "Comparison of sample quantiles with normal distribution quantiles",
    )
)


# ## Deviations investigation

# In[8]:


# Four synthetic samples of equal size, all drawn with the same seed.
n = 1_000
_draw = {"size": n, "random_state": 42}
norm_df = pd.DataFrame({"sample": norm.rvs(**_draw)})
skewed_df = pd.DataFrame({"sample": skewnorm.rvs(7, **_draw)})
neg_kurtosis_df = pd.DataFrame({"sample": uniform.rvs(**_draw)})
pos_kurtosis_df = pd.DataFrame({"sample": laplace.rvs(**_draw)})


# In[9]:


def _histogram_and_qq(base, histogram_title, qq_title):
    """Return a (histogram, Q-Q plot) pair built on top of ``base``.

    Both plots read the "sample" column of the data attached to ``base``.
    """
    histogram = base + geom_histogram(aes(x="sample")) + ggtitle(histogram_title)
    qq = (
        base
        + geom_qq(aes(sample="sample"))
        + geom_qq_line(aes(sample="sample"))
        + ggtitle(qq_title)
    )
    return histogram, qq


p1 = ggplot(norm_df)
p11, p12 = _histogram_and_qq(p1, "Normal: histogram", "Normal: Q-Q plot")

p2 = ggplot(skewed_df)
p21, p22 = _histogram_and_qq(p2, "Skewed: histogram", "Skewed: Q-Q plot")

p3 = ggplot(neg_kurtosis_df)
p31, p32 = _histogram_and_qq(p3, "-Kurtosis: histogram", "-Kurtosis: Q-Q plot")

p4 = ggplot(pos_kurtosis_df)
p41, p42 = _histogram_and_qq(p4, "+Kurtosis: histogram", "+Kurtosis: Q-Q plot")

# One row per sample: histogram on the left, Q-Q plot on the right.
plot_matrix([p11, p12, p21, p22, p31, p32, p41, p42])


# ## Choose a distribution
#
# The `distribution` parameter of the `qq_plot()` function.
# In[10]:


# The same sample compared against four different theoretical distributions.
# NOTE(review): `quantiles=[.1, .9]` presumably selects the pair of quantiles
# used to fit the reference line - confirm in the lets-plot qq_plot() docs.
p1 = (
    qq_plot(df, hwy, distribution="norm", quantiles=[.1, .9])
    + ggtitle("Normal distribution")
)
p2 = (
    qq_plot(df, hwy, distribution="uniform", quantiles=[.1, .9])
    + ggtitle("Uniform distribution")
)
# Title fixed: it previously read "Student's t-distribution distribution".
p3 = (
    qq_plot(df, hwy, distribution="t", quantiles=[.1, .9])
    + ggtitle("Student's t-distribution")
)
p4 = (
    qq_plot(df, hwy, distribution="exp", quantiles=[.1, .9])
    + ggtitle("Exponential distribution")
)

plot_matrix([p1, p2, p3, p4])


# ## Q-Q stats with other geometries

# In[11]:


# The "qq2" and "qq2_line" stats are used here with line and point
# geometries, with one facet per value of the `drv` column.
(
    ggplot(df, aes(x=cty, y=hwy, color=drv))
    + geom_line(stat="qq2")
    + geom_point(stat="qq2", shape=15)
    + geom_line(stat="qq2_line", color='#636363', linetype=5)
    + facet_grid(x=drv, scales="free")
    + xlab("cty quantiles")
    + ylab("hwy quantiles")
)