import pandas as pd
from scipy.stats import norm, skewnorm, laplace, uniform
from lets_plot import *
LetsPlot.setup_html()
def plot_matrix(plots=[], width=400, height=300, columns=2):
bunch = GGBunch()
for i in range(len(plots)):
row = int(i / columns)
column = i % columns
bunch.add_plot(plots[i], column * width, row * height, width, height)
return bunch.show()
df = pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv")
print(df.shape)
hwy = "hwy"
cty = "cty"
drv = "drv"
df.head()
(234, 12)
Unnamed: 0 | manufacturer | model | displ | year | cyl | trans | drv | cty | hwy | fl | class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | audi | a4 | 1.8 | 1999 | 4 | auto(l5) | f | 18 | 29 | p | compact |
1 | 2 | audi | a4 | 1.8 | 1999 | 4 | manual(m5) | f | 21 | 29 | p | compact |
2 | 3 | audi | a4 | 2.0 | 2008 | 4 | manual(m6) | f | 20 | 31 | p | compact |
3 | 4 | audi | a4 | 2.0 | 2008 | 4 | auto(av) | f | 21 | 30 | p | compact |
4 | 5 | audi | a4 | 2.8 | 1999 | 6 | auto(l5) | f | 16 | 26 | p | compact |
geom_qq()
and geom_qq_line()
functions.¶ggplot(df, aes(sample=hwy)) + \
geom_qq(size=5, color="#3d3d3d", alpha=.3) + \
geom_qq_line(size=1) + \
ggtitle("Distribution of highway miles per gallon", \
"Comparison of sample quantiles with normal distribution quantiles")
geom_qq2()
and geom_qq2_line()
functions.¶ggplot(df, aes(x=cty, y=hwy)) + \
geom_qq2(size=5, color="#3d3d3d", alpha=.3) + \
geom_qq2_line(size=1) + \
ggtitle("City miles vs. highway miles (per gallon)", \
"Comparison of quantiles of two sample distributions")
qq_plot()
function¶In the 'bistro' module there is a Q-Q plot in which points and lines of both types combined to the one function with some convenient defaults.
from lets_plot.bistro.qq import qq_plot
qq_plot(df, sample=hwy) + \
ggtitle("Distribution of highway miles per gallon", \
"Comparison of sample quantiles with normal distribution quantiles")
n = 1_000
norm_df = pd.DataFrame({"sample": norm.rvs(size=n, random_state=42)})
skewed_df = pd.DataFrame({"sample": skewnorm.rvs(7, size=n, random_state=42)})
neg_kurtosis_df = pd.DataFrame({"sample": uniform.rvs(size=n, random_state=42)})
pos_kurtosis_df = pd.DataFrame({"sample": laplace.rvs(size=n, random_state=42)})
p1 = ggplot(norm_df)
p11 = p1 + geom_histogram(aes(x="sample")) + ggtitle("Normal: histogram")
p12 = p1 + geom_qq(aes(sample="sample")) + geom_qq_line(aes(sample="sample")) + ggtitle("Normal: Q-Q plot")
p2 = ggplot(skewed_df)
p21 = p2 + geom_histogram(aes(x="sample")) + ggtitle("Skewed: histogram")
p22 = p2 + geom_qq(aes(sample="sample")) + geom_qq_line(aes(sample="sample")) + ggtitle("Skewed: Q-Q plot")
p3 = ggplot(neg_kurtosis_df)
p31 = p3 + geom_histogram(aes(x="sample")) + ggtitle("-Kurtosis: histogram")
p32 = p3 + geom_qq(aes(sample="sample")) + geom_qq_line(aes(sample="sample")) + ggtitle("-Kurtosis: Q-Q plot")
p4 = ggplot(pos_kurtosis_df)
p41 = p4 + geom_histogram(aes(x="sample")) + ggtitle("+Kurtosis: histogram")
p42 = p4 + geom_qq(aes(sample="sample")) + geom_qq_line(aes(sample="sample")) + ggtitle("+Kurtosis: Q-Q plot")
plot_matrix([p11, p12, p21, p22, p31, p32, p41, p42])
The distribution
parameter of the qq_plot()
function.
p1 = qq_plot(df, hwy, distribution="norm", quantiles=[.1, .9]) + \
ggtitle("Normal distribution")
p2 = qq_plot(df, hwy, distribution="uniform", quantiles=[.1, .9]) + \
ggtitle("Uniform distribution")
p3 = qq_plot(df, hwy, distribution="t", quantiles=[.1, .9]) + \
ggtitle("Student's t-distribution distribution")
p4 = qq_plot(df, hwy, distribution="exp", quantiles=[.1, .9]) + \
ggtitle("Exponential distribution")
plot_matrix([p1, p2, p3, p4])
ggplot(df, aes(x=cty, y=hwy, color=drv)) + \
geom_line(stat="qq2") + \
geom_point(stat="qq2", shape=15) + \
geom_line(stat="qq2_line", color='#636363', linetype=5) + \
facet_grid(x=drv, scales="free") + \
xlab("cty quantiles") + ylab("hwy quantiles")