import pandas as pd
from lets_plot import *
# Initialise lets-plot's HTML output before any plot is built.
LetsPlot.setup_html()

# Google Play Store sample dataset hosted in the lets-plot docs repository.
DATA_URL = (
    "https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/googleplaystore.csv"
)
df = pd.read_csv(DATA_URL)

# Show the raw dimensions and a short preview before any cleaning is applied.
print(df.shape)
df.head(3)
(10841, 13)
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
def size_to_bytes(size):
    """Convert a human-readable size such as ``'19M'`` or ``'201k'`` to bytes.

    Args:
        size: Size text. A ``k`` marks kibibytes (x1024) and an ``m`` marks
            mebibytes (x1024*1024), case-insensitively; a bare integer string
            is taken as a byte count. Non-string values are converted with
            ``str()`` first.

    Returns:
        The size in bytes as an ``int`` (fractions are truncated), or ``-1``
        when the size is unknown: an empty string, ``'Varies with device'``,
        or a missing value rendered as ``'nan'``.

    Raises:
        ValueError: If the text is neither a known sentinel nor a number with
            an optional ``k``/``m`` unit.
    """
    text = str(size).strip().lower()

    # A missing value becomes the literal 'nan' once the column is cast to
    # str, so it must be treated like the other "unknown size" markers
    # instead of falling through to int('nan').
    if text in ('', 'nan', 'varies with device'):
        return -1

    # 'k' is tested before 'm', and everything before the first unit letter
    # is the numeric part.
    for unit, factor in (('k', 1024), ('m', 1024 * 1024)):
        if unit in text:
            number = text.partition(unit)[0]
            return int(float(number) * factor)

    return int(text)
# Drop rows that cannot be cleaned: those with a missing Type, and those whose
# Reviews value contains an 'M' and therefore cannot be cast to int.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[df['Type'].notna()]
df = df[~df['Reviews'].astype(str).str.contains('M', regex=False)].copy()

# Convert the textual columns to numeric types.
df['Reviews'] = df['Reviews'].astype(int)
df['Size'] = df['Size'].astype(str).apply(size_to_bytes).astype(int)
# Installs: remove the thousands separators and the trailing '+'.
df['Installs'] = (
    df['Installs'].astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype(int)
)
# Price: remove the currency sign.
df['Price'] = df['Price'].astype(str).str.replace('$', '', regex=False).astype(float)

print(df.shape)
df.head(3)
(10839, 13)
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19922944 | 10000 | Free | 0.0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14680064 | 500000 | Free | 0.0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 9122611 | 5000000 | Free | 0.0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
# Mean number of installations per category, as a two-column frame.
cat_df = df.groupby('Category')['Installs'].mean().reset_index()

# Bar chart of the per-category means, bars ordered by the mean itself.
(
    ggplot()
    + geom_bar(
        aes(x=as_discrete('Category', order_by='Installs'), y='Installs', fill='Category'),
        data=cat_df,
        stat='identity',
        # Pick as many items as there are rows (presumably so that no bar is
        # removed by sampling).
        sampling=sampling_pick(len(cat_df)),
    )
    + scale_fill_brewer(type='qual', palette='Dark2')
    + xlab('category')
    + ylab('mean installations')
    + ggsize(600, 450)
    + ggtitle('Installations by Category')
    + theme(panel_grid_major_x='blank', legend_position='none')
)
Here we can see that some categories are much more popular than others.
# Mean number of installations per genre, as a two-column frame.
gen_df = df.groupby('Genres')['Installs'].mean().reset_index()

# Bar chart of the per-genre means. The x axis title, labels and ticks are
# blanked in the theme, so the genre is exposed through the tooltip.
(
    ggplot()
    + geom_bar(
        aes(x=as_discrete('Genres', order_by='Installs'), y='Installs', fill='Genres'),
        data=gen_df,
        stat='identity',
        sampling=sampling_pick(len(gen_df)),
        tooltips=(
            layer_tooltips()
            .line('genre|@Genres')
            .format('@Installs', '.0f')
            .line('mean installations|@Installs')
        ),
    )
    + scale_fill_brewer(type='qual', palette='Dark2')
    + ylab('mean installations')
    + ggsize(600, 300)
    + ggtitle('Installations by Genre')
    + theme(
        panel_grid_major_x='blank',
        legend_position='none',
        axis_title_x='blank',
        axis_text_x='blank',
        axis_ticks_x='blank',
    )
)
We see a big gap in popularity between different genres.
# 2D histogram of rating against installations; fill encodes the bin count.
(
    ggplot()
    + geom_bin2d(
        aes(x='Installs', y='Rating', fill='..count..'),
        data=df,
        color='white',
        size=1,
    )
    + scale_fill_gradient(low='#e0ecf4', high='#8856a7')
    + scale_x_log10(name='installations')
    + ylim(1, 5)
    + ylab('rating')
    + ggsize(600, 300)
    + ggtitle('Connection Between Installations and Rating')
)
The rating and the number of installations are more or less positively correlated. At the very least, apps rated below 3 are rarely popular.
# Jittered scatter of installations against reviews with a LOESS curve per
# app Type. Both layers map Type to the custom 'paint_a' aesthetic: the points
# use it for fill, the curves for colour, and one brewer scale serves both.
(
    ggplot()
    + geom_jitter(
        aes(x='Reviews', y='Installs', paint_a='Type'),
        data=df,
        shape=21,
        color='black',
        alpha=.1,
        fill_by='paint_a',
        seed=42,
    )
    + geom_smooth(
        aes(x='Reviews', y='Installs', group='Type', paint_a='Type'),
        data=df,
        method='loess',
        deg=2,
        color_by='paint_a',
    )
    + scale_x_log10(name='reviews')
    + scale_y_log10(name='installations')
    + scale_brewer('paint_a', palette='Set2')
    + ggsize(600, 450)
    + ggtitle('Connection Between Installations and Reviews')
)
The plot shows that the number of installations and the number of reviews are very strongly correlated, so they carry practically the same information.
The smoothing curves are far enough apart that free applications are better analyzed separately from paid ones.
# 2D histogram of app size against reviews; fill encodes the bin count.
# Size holds -1 for apps whose size is unknown (the sentinel returned by
# size_to_bytes). That value has no logarithm and is not a real size, so only
# rows with a positive Size are plotted on the log-scaled axis.
(
    ggplot()
    + geom_bin2d(
        aes(x='Reviews', y='Size', fill='..count..'),
        data=df[df.Size > 0],
        color='white',
        size=1,
    )
    + scale_fill_gradient(low='#e5f5f9', high='#2ca25f')
    + scale_x_log10(name='reviews')
    + scale_y_log10(name='size')
    + ggsize(600, 300)
    + ggtitle('Connection Between Reviews and Size')
)
It looks like we might not be interested in apps that are smaller than 1 MB. For the others, there is only a minor correlation.
# 2D histogram of price against reviews, restricted to paid apps; fill encodes
# the bin count.
(
    ggplot()
    + geom_bin2d(
        aes(x='Reviews', y='Price', fill='..count..'),
        data=df[df.Type == 'Paid'],
        color='white',
        size=1,
    )
    + scale_fill_gradient(low='#ffeda0', high='#f03b20')
    + scale_x_log10(name='reviews')
    + scale_y_log10(name='price')
    + ggsize(600, 300)
    + ggtitle('Connection Between Price and Reviews')
)
I see nothing but chaos here. Anyway, paid apps are not very common; the rest are free of charge and may rely on other sources of monetization.