#!/usr/bin/env python
# coding: utf-8

# # Q-Q Plots

# ## Preparation

# In[1]:


import pandas as pd
from scipy.stats import norm, skewnorm, laplace, uniform

from lets_plot import *
# One-time Lets-Plot setup for HTML output; presumably this is what lets the
# plots below render inline in the notebook -- confirm against the Lets-Plot docs.
LetsPlot.setup_html()


# In[2]:


def plot_matrix(plots=None, width=400, height=300, columns=2):
    """Arrange plots on a regular grid inside one ``GGBunch`` and show it.

    Plots are placed left to right, then top to bottom. Every cell has the
    same size, and the top-left corner of the cell at (row, column) is
    ``(column * width, row * height)``.

    Args:
        plots: Iterable of plots in display order. ``None`` (the default)
            or an empty iterable produces an empty bunch.
        width: Width of each grid cell.
        height: Height of each grid cell.
        columns: Number of plots per row; expected to be a positive integer.

    Returns:
        Whatever ``GGBunch.show()`` returns.
    """
    # Avoid a mutable default argument: a fresh empty sequence per call.
    if plots is None:
        plots = ()

    bunch = GGBunch()
    for index, plot in enumerate(plots):
        # Exact integer arithmetic for the grid position of the index-th plot.
        row, column = divmod(index, columns)
        bunch.add_plot(plot, column * width, row * height, width, height)
    return bunch.show()


# In[3]:


# Load the mpg dataset (CSV hosted in the Lets-Plot docs repository).
df = pd.read_csv(
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv"
)
print(df.shape)

# Column names referenced by the plots below.
hwy, cty, drv = "hwy", "cty", "drv"

df.head()


# ## Two types of Q-Q plots

# ### `geom_qq()` and `geom_qq_line()` functions.

# In[4]:


# Q-Q points for the `hwy` sample plus a Q-Q line drawn over them.
(
    ggplot(df, aes(sample=hwy))
    + geom_qq(size=5, color="#3d3d3d", alpha=.3)
    + geom_qq_line(size=1)
    + ggtitle(
        "Distribution of highway miles per gallon",
        "Comparison of sample quantiles with normal distribution quantiles",
    )
)


# ### `geom_qq2()` and `geom_qq2_line()` functions.

# In[5]:


# Two-sample Q-Q plot: `cty` on x, `hwy` on y, with the matching Q-Q line.
(
    ggplot(df, aes(x=cty, y=hwy))
    + geom_qq2(size=5, color="#3d3d3d", alpha=.3)
    + geom_qq2_line(size=1)
    + ggtitle(
        "City miles vs. highway miles (per gallon)",
        "Comparison of quantiles of two sample distributions",
    )
)


# ## Quick Q-Q plot:  the `qq_plot()` function
# 
# The `bistro` module provides `qq_plot()`, a single function that combines the points and lines of both Q-Q plot types and applies some convenient defaults.

# In[6]:


from lets_plot.bistro.qq import qq_plot


# In[7]:


# Same `hwy` sample as above, this time drawn with the one-call `qq_plot()`.
(
    qq_plot(df, sample=hwy)
    + ggtitle(
        "Distribution of highway miles per gallon",
        "Comparison of sample quantiles with normal distribution quantiles",
    )
)


# ## Deviations investigation

# In[8]:


# Number of observations drawn for every synthetic sample.
n = 1_000

# Each frame has a single "sample" column; the fixed random_state keeps
# the draws reproducible between runs.
norm_df = pd.DataFrame(
    data={"sample": norm.rvs(size=n, random_state=42)},
)
skewed_df = pd.DataFrame(
    data={"sample": skewnorm.rvs(7, size=n, random_state=42)},
)
neg_kurtosis_df = pd.DataFrame(
    data={"sample": uniform.rvs(size=n, random_state=42)},
)
pos_kurtosis_df = pd.DataFrame(
    data={"sample": laplace.rvs(size=n, random_state=42)},
)


# In[9]:


# For every synthetic sample build a histogram and a Q-Q plot (points plus
# line) on top of a shared base plot, then show all eight in a grid.

# Normal sample.
p1 = ggplot(norm_df)
p11 = p1 + geom_histogram(aes(x="sample")) + ggtitle("Normal: histogram")
p12 = (
    p1
    + geom_qq(aes(sample="sample"))
    + geom_qq_line(aes(sample="sample"))
    + ggtitle("Normal: Q-Q plot")
)

# Skewed sample.
p2 = ggplot(skewed_df)
p21 = p2 + geom_histogram(aes(x="sample")) + ggtitle("Skewed: histogram")
p22 = (
    p2
    + geom_qq(aes(sample="sample"))
    + geom_qq_line(aes(sample="sample"))
    + ggtitle("Skewed: Q-Q plot")
)

# Negative-kurtosis sample.
p3 = ggplot(neg_kurtosis_df)
p31 = p3 + geom_histogram(aes(x="sample")) + ggtitle("-Kurtosis: histogram")
p32 = (
    p3
    + geom_qq(aes(sample="sample"))
    + geom_qq_line(aes(sample="sample"))
    + ggtitle("-Kurtosis: Q-Q plot")
)

# Positive-kurtosis sample.
p4 = ggplot(pos_kurtosis_df)
p41 = p4 + geom_histogram(aes(x="sample")) + ggtitle("+Kurtosis: histogram")
p42 = (
    p4
    + geom_qq(aes(sample="sample"))
    + geom_qq_line(aes(sample="sample"))
    + ggtitle("+Kurtosis: Q-Q plot")
)

# Two columns (the plot_matrix default): histogram on the left, Q-Q plot on the right.
plot_matrix([p11, p12, p21, p22, p31, p32, p41, p42])


# ## Choose a distribution
# 
# The `distribution` parameter of the `qq_plot()` function.

# In[10]:


# Draw the same `hwy` sample against four different values of the
# `distribution` parameter; `quantiles=[.1, .9]` is passed to every call so
# the four plots differ only in the chosen distribution.
p1 = (
    qq_plot(df, hwy, distribution="norm", quantiles=[.1, .9])
    + ggtitle("Normal distribution")
)
p2 = (
    qq_plot(df, hwy, distribution="uniform", quantiles=[.1, .9])
    + ggtitle("Uniform distribution")
)
p3 = (
    qq_plot(df, hwy, distribution="t", quantiles=[.1, .9])
    # Title fixed: the original read "Student's t-distribution distribution".
    + ggtitle("Student's t-distribution")
)
p4 = (
    qq_plot(df, hwy, distribution="exp", quantiles=[.1, .9])
    + ggtitle("Exponential distribution")
)

# Show the four plots in a grid using the plot_matrix defaults.
plot_matrix([p1, p2, p3, p4])


# ## Q-Q stats with other geometries

# In[11]:


# Use the "qq2" / "qq2_line" stats with ordinary line and point geoms,
# colored by `drv` and faceted by `drv` along x.
(
    ggplot(df, aes(x=cty, y=hwy, color=drv))
    + geom_line(stat="qq2")
    + geom_point(stat="qq2", shape=15)
    + geom_line(stat="qq2_line", color='#636363', linetype=5)
    + facet_grid(x=drv, scales="free")
    + xlab("cty quantiles")
    + ylab("hwy quantiles")
)