This project is about free Android and iOS apps whose revenue comes from in-app ads. The number of users of an app therefore determines our revenue from it. The goal of the project is to analyze the available data to help developers understand which types of apps are likely to attract more users.
After analyzing the data, we defined the concept of a profitable app profile. The key finding is that adding interactive elements (like podcasts), and even gamification, within the app can really increase our chances of maximizing profit.
For more details, please refer to the full analysis below.
def open_dataset(file_name, header=False):
    """Read a UTF-8 encoded CSV file into a list of rows.

    Args:
        file_name: Path of the CSV file to read.
        header: When true, the first row is dropped from the result;
            when false (the default), every row of the file is returned.

    Returns:
        A list of rows, each row being a list of strings.
    """
    from csv import reader

    # The context manager closes the file even if parsing fails.
    # newline='' is what the csv module expects, so that line breaks inside
    # quoted fields are interpreted by the reader itself.
    with open(file_name, encoding='utf8', newline='') as opened_file:
        data_set = list(reader(opened_file))

    return data_set[1:] if header else data_set
# Load both raw CSV files. `header` is left at its default (False), so the
# first row of each file is kept as element 0 of the resulting list.
android_apps = open_dataset('googleplaystore.csv')
ios_apps = open_dataset('AppleStore.csv')
Here is a link to the documentation for the dataset with Android apps: Android_apps_doc. And the same for iOS: IOS_apps_doc.
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a dataset and, optionally, its dimensions.

    Args:
        dataset: List of rows (each row itself a sequence).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows
            and the number of columns (measured on the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with IndexError.
        print('Number of columns:', len(dataset[0]) if len(dataset) else 0)
# Preview the first two rows of each dataset together with its dimensions.
explore_data(android_apps, 0, 2, True)
explore_data(ios_apps, 0, 2, True)
# Show the row at index 10473, remove it, and print the new row count.
# NOTE(review): presumably this row is malformed — confirm from the printed row.
# The deletion is positional, so running these statements again would remove
# whichever row has moved into index 10473.
print(android_apps[10473])
del android_apps[10473]
print(len(android_apps))
def duplicates_checking(data_set, app_name_id_column=0):
    """Print how many rows repeat an already-seen app name, with examples.

    Args:
        data_set: List of rows (each row a list).
        app_name_id_column: Index of the column holding the app name.

    The first occurrence of a name counts as unique; every later occurrence
    is recorded as a duplicate. Nothing is returned; the count and the first
    five duplicate names are printed.
    """
    # A set gives constant-time membership tests; the original list lookup
    # made the whole scan quadratic in the number of rows.
    seen_names = set()
    duplicate_apps = []
    for app in data_set:
        app_name = app[app_name_id_column]
        if app_name in seen_names:
            duplicate_apps.append(app_name)
        else:
            seen_names.add(app_name)

    print('Numbers of duplicate apps:', len(duplicate_apps))
    print('\n')
    print('Examples of duplicate apps:', duplicate_apps[:5])
# Report repeated app names: column 0 is used for the Android data and
# column 1 for the iOS data.
duplicates_checking(android_apps)
print('\n')
duplicates_checking(ios_apps, 1)
# Print every Android row whose column 0 is exactly 'Facebook'.
for app in android_apps:
if app[0] == 'Facebook':
print(app)
print('\n')
# Print every iOS row whose column 1 is exactly 'Mannequin Challenge'.
for app in ios_apps:
if app[1] == 'Mannequin Challenge':
print(app)
def reviews_max(data_set, header=True, app_name_column=0, reviews_column=3):
    """Map each app name to the largest review value found among its rows.

    Args:
        data_set: List of rows (each row a list).
        header: When true (the default), the first row is treated as a
            header and skipped.
        app_name_column: Index of the column holding the app name.
        reviews_column: Index of the column holding the review value; it is
            converted with float().

    Returns:
        A dict of app name -> highest review value (float).

    Raises:
        ValueError: If a review value cannot be converted to float.
    """
    # Both branches of the original ran the same loop; only the starting
    # row differs, so select the rows once and loop once.
    rows = data_set[1:] if header else data_set

    total_reviews = {}
    for app in rows:
        name = app[app_name_column]
        reviews = float(app[reviews_column])
        if name not in total_reviews or total_reviews[name] < reviews:
            total_reviews[name] = reviews
    return total_reviews
# Build, for each store, the per-app maximum of the review column.
# The Android call uses the defaults (name in column 0, reviews in column 3);
# the iOS call uses column 1 for the name and column 8 for the review value.
# NOTE(review): confirm column 8 is the intended review-count field of the iOS file.
android_reviews = reviews_max(android_apps)
ios_reviews = reviews_max(ios_apps, True, 1, 8)
# Checking for data in criterion dictionaries.
print(android_reviews['Facebook'])
print(len(android_reviews))
print(10842-1181-1) # apps_amount - duplicate_apps - header
print(ios_reviews['Mannequin Challenge'])
print(len(ios_reviews))
# presumably the same apps_amount - duplicate_apps - header check for iOS — confirm the counts
print(7198-2-1)
def dataset_cleaning(data_set, criterion_dictionary, app_name_column=0, reviews_column=3, header=True):
    """Keep one row per app: the first row matching its criterion value.

    Args:
        data_set: List of rows (each row a list).
        criterion_dictionary: Dict of app name -> review value that the kept
            row must have (compared after float() conversion).
        app_name_column: Index of the column holding the app name.
        reviews_column: Index of the column holding the review value.
        header: When true (the default), the first row is skipped.

    Returns:
        A tuple ``(dataset_cleaned, already_added)``: the list of kept rows
        and the list of their app names, both in order of appearance.

    Raises:
        KeyError: If a not-yet-kept app name is missing from
            ``criterion_dictionary``.
        ValueError: If a review value cannot be converted to float.
    """
    # Both branches of the original ran the same loop; only the starting
    # row differs.
    rows = data_set[1:] if header else data_set

    dataset_cleaned = []
    already_added = []
    # Set mirror of already_added for constant-time membership tests; the
    # list itself is kept because it is part of the return value.
    added_lookup = set()
    for app in rows:
        name = app[app_name_column]
        reviews = float(app[reviews_column])
        if name not in added_lookup and reviews == criterion_dictionary[name]:
            dataset_cleaned.append(app)
            already_added.append(name)
            added_lookup.add(name)
    return dataset_cleaned, already_added
# De-duplicate both datasets: for each app keep the row whose review value
# equals the maximum stored in the matching criterion dictionary.
# The iOS call passes column 1 as the name and column 8 as the review value.
android_cleaned, android_added = dataset_cleaning(android_apps, android_reviews)
print('Expected length for android_cleaned is 9659 rows, actual length is:', len(android_cleaned))
print('\n')
ios_cleaned, ios_added = dataset_cleaning(ios_apps, ios_reviews, 1, 8)
print('Expected length for ios_cleaned is 7195 rows, actual length is:', len(ios_cleaned))
def english_app_check(app_name):
    """Return False when a name contains three or more non-ASCII characters.

    Args:
        app_name: The app name to inspect.

    Returns:
        True if fewer than three characters have a code point above 127,
        otherwise False.
    """
    non_ascii_count = 0
    for character in app_name:
        if ord(character) > 127:
            non_ascii_count += 1
            # Test the limit right after counting. The original tested it at
            # the start of the *next* iteration, so a name whose third
            # non-ASCII character was also its last one slipped through.
            if non_ascii_count >= 3:
                return False
    return True
# Spot-check the language filter on a few sample names: plain ASCII, a name
# made mostly of non-ASCII characters, and two names with a single
# non-ASCII symbol.
print(english_app_check('Instagram'))
print(english_app_check('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_app_check('Docs To Go™ Free Office Suite'))
print(english_app_check('Instachat 😜'))
def only_english_apps(data_set, app_name_column=0):
    """Partition a dataset by the result of english_app_check on each name.

    Args:
        data_set: List of rows (each row a list).
        app_name_column: Index of the column holding the app name.

    Returns:
        A tuple ``(rows, names)``: the rows whose name passed the check, and
        the names (not the rows) of those that failed it.
    """
    kept_rows, rejected_names = [], []
    for row in data_set:
        title = row[app_name_column]
        if not english_app_check(title):
            rejected_names.append(title)
            continue
        kept_rows.append(row)
    return kept_rows, rejected_names
# Apply the language filter to both de-duplicated datasets. The app name is
# read from column 0 for Android (the default) and, as written, also from
# column 0 for iOS.
# NOTE(review): other iOS calls in this file use column 1 for the app name —
# confirm whether the iOS call below should pass 1 as well.
android_eng, android_non_eng = only_english_apps(android_cleaned)
print('Amount of Android English apps is: ', len(android_eng))
print('Some of non-English apps: ', android_non_eng[:5])
print('\n')
ios_eng, ios_non_eng = only_english_apps(ios_cleaned)
print('Amount of IOS English apps is: ', len(ios_eng))
print('Some of non-English apps: ', ios_non_eng[:5])
def isolate_free_apps(data_set, app_price_column=7):
    """Return the rows whose price field is the string '0.0' or '0'.

    Args:
        data_set: List of rows (each row a list).
        app_price_column: Index of the column holding the price.

    Returns:
        A new list with the matching rows, in their original order.
    """
    zero_prices = ('0.0', '0')
    return [row for row in data_set if row[app_price_column] in zero_prices]
# Keep only the rows whose price field is '0.0' or '0': column 7 (the
# default) for Android, column 4 for iOS. Each printed figure is the number
# of rows the filter removed.
android_free = isolate_free_apps(android_eng)
print('Amount of isolated non-free Android apps: ', len(android_eng) - len(android_free))
print('\n')
ios_free = isolate_free_apps(ios_eng, 4)
print('Amount of isolated non-free IOS apps: ', len(ios_eng) - len(ios_free))
We need to choose the right way of analysing both datasets in line with our main project goal. Here we have to talk about our validation strategy, which has three steps:
Therefore, we need to find app profiles that are successful in both markets. Let's begin the analysis by determining the most common genres for each market.
def freq_table(data_set, column_number):
    """Build a percentage frequency table for one column.

    Args:
        data_set: List of rows (each row a list).
        column_number: Index of the column to tabulate.

    Returns:
        A dict mapping each distinct column value to its share of all rows,
        as a percentage rounded to two decimals. Keys appear in order of
        first occurrence.
    """
    row_count = len(data_set)

    counts = {}
    for row in data_set:
        value = row[column_number]
        counts[value] = counts.get(value, 0) + 1

    return {
        value: round(count / row_count * 100, 2)
        for value, count in counts.items()
    }
def display_table(dataset, index):
    """Print the frequency table of one column, largest share first.

    Args:
        dataset: List of rows (each row a list).
        index: Index of the column to tabulate with freq_table.

    Each line is printed as ``value : percentage``. Entries are ordered by
    descending (percentage, value) pairs.
    """
    percentages = freq_table(dataset, index)
    ranked = [(share, label) for label, share in percentages.items()]
    ranked.sort(reverse=True)
    for share, label in ranked:
        print(label, ':', share)
# Print percentage frequency tables, largest share first, for Android
# columns 1 and 9 and for iOS column 11.
display_table (android_free, 1)
print('\n')
display_table (android_free, 9)
print('\n')
display_table (ios_free, 11)
# For every distinct value of iOS column 11, average column 5 over the rows
# that carry that value and print the average with the row count.
# NOTE(review): the output labels column 5 as a "rating" — confirm which
# field of the iOS file column 5 actually holds.
ios_genres = freq_table(ios_free, 11)
for genre in ios_genres:
# Accumulators are reset for each genre.
total = 0
len_genre = 0
for app in ios_free:
# `name` holds the row's column-11 value (compared with the genre), not the app name.
name = app[11]
if name == genre:
rating = float(app[5])
total += rating
len_genre += 1
print(genre, "with an average rating:", round(total / len_genre, 1), "and number of apps:", len_genre)
print('\n')
#Lets explore what is an average rating of all ios free apps.
# The mean is taken over column 5 of every row in ios_free and printed
# rounded to two decimals. An empty ios_free would raise ZeroDivisionError.
total = 0
for app in ios_free:
rating = float(app[5])
total += rating
average_rating = total / len(ios_free)
print(round(average_rating, 2))
According to the output, the average rating of the ios_free apps is 19759. Relative to this, the apps whose user rating is higher look good as an app profile recommendation. Here we have Reference, Social Networking and Music apps as the most attractive for development. Specifically, we need to detail which apps are included in these genres, so that we can get an understanding of the recommended app profile.
def top5_genre(data_set, genre_name, app_column=1, genre_column=11, rating_column=5, android_check=False):
    """Print the five highest-valued apps of one genre.

    Args:
        data_set: List of rows (each row a list). The rows are not modified.
        genre_name: Genre (or category) value to select.
        app_column: Index of the column holding the app name.
        genre_column: Index of the column compared with ``genre_name``.
        rating_column: Index of the column holding the value to rank by; it
            is converted with float().
        android_check: When true, commas and plus signs are stripped from
            the value before conversion (e.g. '1,000+' -> 1000.0).

    If a name occurs more than once within the genre, the last row wins.
    Entries are ranked by descending (value, name) pairs.

    Raises:
        ValueError: If the value of a selected row cannot be converted.
    """
    genre_apps = {}
    for app in data_set:
        # Only rows of the requested genre are parsed, so an unparsable
        # value in an unrelated row no longer aborts the report.
        if app[genre_column] != genre_name:
            continue
        raw_value = app[rating_column]
        if android_check:
            # Clean a local copy. The original wrote the cleaned string
            # back into the caller's row, silently altering the dataset.
            raw_value = raw_value.replace(',', '').replace('+', '')
        genre_apps[app[app_column]] = float(raw_value)

    ranked = sorted(
        ((value, name) for name, value in genre_apps.items()),
        reverse=True,
    )

    print('Top-5 in', genre_name, 'genre:')
    for value, name in ranked[:5]:
        print('App', name, 'with rating', value)
    print('\n')
# Print the top five apps of three iOS genres, using the function defaults
# (name in column 1, genre in column 11, ranking value in column 5).
top5_genre(ios_free, 'Reference')
top5_genre(ios_free, 'Social Networking')
top5_genre(ios_free, 'Music')
As we can see, there is an interesting pattern inside the highest-rated genres. Among the top five apps within the Reference genre we see a few apps that have a religious context. Within the Music genre we see applications that allow us to choose music by our own interests, and the best of those apps allow users to create flexible playlists. Finally, we have very popular social networks among the Social Networking apps. Notice that this list includes the two biggest messengers and Pinterest, the biggest service for picture sharing.
So the most profitable app profile can be an app that has all of the explored features from the prime genres. This app might be based on some historical book or some international magazine that gathers a big community of people, and include a chat feature. The app might also have its own ready-made music playlists created specially for the community.
# Print column 5 of the first free Android row — presumably to inspect the
# raw format of that field before converting it to a number.
print(android_free[0][5])
For this, we are going to strip the invalid characters (commas and the plus sign) so that the values become readable as numbers.
# For every distinct value of Android column 1, average the cleaned column-5
# value over the rows of that category and print it with the row count.
android_genres = freq_table(android_free, 1)
for category in android_genres:
# Accumulators are reset for each category.
total = 0
len_category = 0
for app in android_free:
category_app = app[1]
if category_app == category:
len_category += 1
installs = app[5]
# Strip thousands separators and the trailing plus sign so float() can parse the value.
installs = installs.replace(',', '')
installs = installs.replace('+', '')
total += float(installs)
print(category, "with an average installs:", round(total / len_category, 1), "and total number of apps:", len_category)
print('\n')
#Lets explore what is an average amount of installs for android free apps.
# The mean is taken over the cleaned column-5 value of every row in
# android_free. An empty android_free would raise ZeroDivisionError.
total = 0
for app in android_free:
installs = app[5]
# Strip thousands separators and the trailing plus sign so float() can parse the value.
installs = installs.replace(',', '')
installs = installs.replace('+', '')
installs = float(installs)
total += installs
# NOTE: despite its name, average_rating holds the mean of the install values here.
average_rating = total / len(android_free)
print(round(average_rating, 2))
As we can see above, there are a few extremely popular categories within the Android marketplace. For instance, we have the Communication category with above 38 million installs, next to it is the Video_Players category with almost 25 million installs, and in third place we have the Social category with nearly 23 million installs. Let's look at them specifically.
# Print the top five apps of three Android categories: name in column 0,
# category in column 1, ranking value in column 5. android_check=True makes
# top5_genre strip ',' and '+' from the column-5 value before converting it.
top5_genre(android_free, 'COMMUNICATION', 0, 1, 5, True)
top5_genre(android_free, 'VIDEO_PLAYERS', 0, 1, 5, True)
top5_genre(android_free, 'SOCIAL', 0, 1, 5, True)
According to the output, we can see a relation between the popular apps in both datasets. Our main idea about a composite application is confirmed. It gathers the features of certain media and text content, chatting and posting at the individual user level. We can also note that within the Android market a large segment of apps belongs to the Video_Players genre. Users need some interactive content, and this observation may be included in the final app profile.
Therefore, through all the steps we made, we arrive at a concept of our profitable app profile. The key finding is that adding interactive elements (like podcasts), and even gamification, within the app can really increase our chances of maximizing profit. It must also have extensive communication functionality, with a chat or forum included. This gives us pretty good flexibility and evidently multiplies our competitiveness on the store. Thank you for your time; I hope it was useful. Have a nice day!