# Profitable App Profiles for the App Store and Google Play Markets
from collections import Counter
from csv import reader
### The Google Play data set ###
# Read inside a context manager so the file handle is closed as soon as the
# rows are loaded. The encoding is pinned to UTF-8 so the read does not depend
# on the platform default (app names in this data include non-ASCII
# characters), and newline='' is what the csv module asks for on files handed
# to reader().
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
android_header = android[0]  # first row holds the column names
android = android[1:]        # remaining rows are the apps

### The App Store data set ###
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

print(android_header)
#print(android[:2])
print("\n")
print(ios_header)
#print(ios[:2])
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set and, optionally, its dimensions.

    Args:
        dataset: List of lists (one inner list per row).
        start: Integer index of the first row to print.
        end: Integer index one past the last row to print.
        rows_and_columns: Boolean, default False. When True, the total number
            of rows and the number of columns are printed after the slice.
    """
    for row in dataset[start:end]:
        print(row)
        print("\n")  # Adds a new (empty) line after each row

    if rows_and_columns:
        print("Number of rows:", len(dataset))
        # The column count is read from the first row; an empty data set has
        # no first row, so report 0 instead of raising IndexError.
        print("Number of columns:", len(dataset[0]) if dataset else 0)
# Show the first two rows and the dimensions of each data set,
# separated by a dashed line.
explore_data(android, 0, 2, rows_and_columns=True)
print("-" * 10)
explore_data(ios, 0, 2, rows_and_columns=True)
# Data Cleaning Process
# Find errors in the data by comparing each row's length with its header's.
# enumerate() supplies the position of the row actually being inspected;
# list.index(row) would return the first row that compares equal, which is the
# wrong number whenever a row is duplicated, and rescans the list each time.
print("Errors in android:")
for row_number, row in enumerate(android):
    if len(row) != len(android_header):
        print(row)
        print("Error row #:", row_number)
print("\n")
print("Errors in ios:")
for row_number, row in enumerate(ios):
    if len(row) != len(ios_header):
        print(row)
        print("Error row #:", row_number)
# Found from kaggle.com - Wrong rating for entry 10472 (same as found earlier).
# This entry has a missing 'Rating' and a column shift happened for the
# following columns.
print(android_header,"\n")
explore_data(android,10471,10473)
# Delete the incorrect row only while it is still malformed. Without this
# guard, running the step a second time would silently remove the valid row
# that moved into position 10472 after the first deletion.
if len(android[10472]) != len(android_header):
    del android[10472]
explore_data(android,10471,10473)
# Find duplicate entries, keyed on the first column of each row.
# The *_unique_apps lists keep first-seen order and the *_duplicate_apps lists
# collect every repeat occurrence. Membership is tested against a set so each
# lookup is O(1); testing against the growing list made the scan quadratic.
android_duplicate_apps = []
android_unique_apps = []
ios_duplicate_apps = []
ios_unique_apps = []

android_seen = set()
for app in android:
    name = app[0]
    if name in android_seen:
        android_duplicate_apps.append(name)
    else:
        android_seen.add(name)
        android_unique_apps.append(name)

# NOTE(review): other parts of this file read the ios app name from column 1
# (e.g. the English filter), while this check keys on column 0 - confirm which
# column is intended here.
ios_seen = set()
for app in ios:
    name = app[0]
    if name in ios_seen:
        ios_duplicate_apps.append(name)
    else:
        ios_seen.add(name)
        ios_unique_apps.append(name)

print("Number of android duplicate apps:", len(android_duplicate_apps))
#print("Examples of duplicate apps:", duplicate_apps[:15])
#print("\n")
print("Number of android unique apps:", len(android_unique_apps))
print("\n")
print("Number of ios duplicate apps:", len(ios_duplicate_apps))
print("Number of ios unique apps:", len(ios_unique_apps))
# First step of removing duplicates: map every Android app name to the
# highest review count (column 3) seen for it. The mapping is used afterwards
# to keep a single row per app.
android_reviews_max = {}
ios_reviews_max = {}

for app in android:
    name = app[0]
    n_reviews = float(app[3])
    # Record the count for a new name, or replace a smaller stored count.
    if name not in android_reviews_max or android_reviews_max[name] < n_reviews:
        android_reviews_max[name] = n_reviews

# The dictionary should hold one key per unique app name.
print("Android expected length:", len(android) - len(android_duplicate_apps))
print("Android actual length:", len(android_reviews_max))
#print("\n")
#print("ios expected length:", len(ios) - len(ios_duplicate_apps))
#print("ios actual length:", len(ios_reviews_max))
# Remove the duplicate rows and create a new clean list.
# A row is kept when its review count equals the maximum recorded for its app
# name and no row for that name has been kept yet (several duplicates can
# share the same maximum). already_added is a set so the "kept yet?" test is
# O(1) instead of a scan over a growing list.
android_clean = []
already_added = set()
for app in android:
    name = app[0]
    n_reviews = float(app[3])
    if (android_reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.add(name)
explore_data(android_clean,0,2,True)
# Examples of non-English app names that need to be removed
for row_number in (813, 6731):
    print(ios[row_number][1])
print("\n")
for row_number in (4412, 7940):
    print(android_clean[row_number][0])
def is_english(string, max_non_ascii=3):
    """Return True when `string` has at most `max_non_ascii` non-ASCII characters.

    A small allowance is tolerated so that names containing a few emoji or
    symbols are still treated as English.

    Args:
        string: The text to check (e.g. an app name).
        max_non_ascii: Largest number of characters with a code point above
            127 that is still accepted. Defaults to 3.

    Returns:
        False if more than `max_non_ascii` characters fall outside the ASCII
        range (code points 0-127), True otherwise.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= max_non_ascii
# Names with a few symbols or emoji should still count as English
for sample_name in ('Docs To Go™ Free Office Suite', 'Instachat 😜'):
    print(is_english(sample_name))
# Keep only the apps whose name passes is_english().
# The name is read from column 0 for Android rows and column 1 for ios rows.
android_english = [app for app in android_clean if is_english(app[0])]
ios_english = [app for app in ios if is_english(app[1])]

print("Android apps:")
explore_data(android_english, 0, 2, rows_and_columns=True)
print("\n")
print("ios apps:")
explore_data(ios_english, 0, 2, rows_and_columns=True)
# Keep only the free apps.
# The price is compared as the exact text stored in the row: "0" in column 7
# for Android and "0.0" in column 4 for ios.
# NOTE(review): a free price written any other way (e.g. "0.00") would be
# dropped by these exact-match tests - confirm against the data files.
android_final = [app for app in android_english if app[7] == "0"]
ios_final = [app for app in ios_english if app[4] == "0.0"]

print("Number of free Android apps:", len(android_final))
print("\n")
print("Number of free ios apps:", len(ios_final))
# Determine the kind of apps that attract most users, using a frequency table
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a data set.

    Args:
        dataset: Iterable of rows, each row indexable by `index`.
        index: Position of the column whose values are counted.

    Returns:
        A dict mapping each distinct value found in the column to the
        percentage (0-100) of rows that hold it, in first-seen order.
        An empty data set yields an empty dict.
    """
    counts = Counter(row[index] for row in dataset)
    # Derive the row total from the counts so one-shot iterables also work.
    total = sum(counts.values())
    return {value: (count / total) * 100 for value, count in counts.items()}
def display_table(dataset, index):
    """Print the percentage frequency table of a column, highest share first.

    Args:
        dataset: The rows to summarise (passed through to freq_table).
        index: Position of the column to summarise.

    Each line is printed as ``value : percentage``.
    """
    percentages = freq_table(dataset, index)
    # Pair each percentage with its value so tuple ordering sorts on the
    # percentage first; reverse = True puts the highest percentage on top.
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse = True,
    )
    for share, value in ranked:
        print(value, ":", share)
# Percentage tables, in order:
#   ios apps by "prime_genre" (column 11)
#   Android apps by "Category" (column 1)
#   Android apps by "Genres" (column 9)
for dataset, column in ((ios_final, 11), (android_final, 1), (android_final, 9)):
    display_table(dataset, column)
# Sorting a list: ascending first, then descending
a_list = [50, 100, 20]
for descending in (False, True):
    print(sorted(a_list, reverse = descending))
print("\n")
# Sorting a list of tuples (Doesn't work correctly with a normal dictionary)
freq_table_as_tuple = [(50, "Genre_1"), (100, "Genre_2"), (20, "Genre_3")]
for descending in (False, True):
    print(sorted(freq_table_as_tuple, reverse = descending))
# A loop inside of a loop: the same pattern is used to calculate the
# average number of user ratings for each genre.
some_strings = ["First", "Second"]
some_integers = [1, 2, 3, 4, 5]
string_integer_list = []
for label in some_strings:
    print(label)
    string_integer_list.append(label)
    # The inner loop runs in full for every item of the outer loop.
    for number in some_integers:
        print(number)
        string_integer_list.append(number)
print(string_integer_list)
# Most popular apps by genre on the App Store:
# average number of user ratings (column 5) per genre (column 11).
genres_ios = freq_table(ios_final, 11)
#avg_app_genre = []
for genre in genres_ios:
    total = 0
    len_genre = 0
    for app in ios_final:
        if app[11] != genre:
            continue  # row belongs to another genre
        total += float(app[5])
        len_genre += 1
    avg_n_ratings = total / len_genre
    print(genre, ":", avg_n_ratings)
    #avg_app_genre.append(str(avg_n_ratings) + " : " + genre)
#print(sorted(avg_app_genre, reverse = True))
# Breakdown of "Navigation" apps, then of "Reference" apps:
# one line per app with its name and number of ratings.
for genre in ("Navigation", "Reference"):
    for app in ios_final:
        if app[11] == genre:
            print(app[1], ":", app[5])
# Number of installs for Android store (percentage table of column 5)
display_table(android_final, 5)

# Average number of installs per category (column 1)
categories_android = freq_table(android_final, 1)
cat_list = []
for category in categories_android:
    total = 0
    len_category = 0
    for app in android_final:
        if app[1] != category:
            continue  # row belongs to another category
        # Strip the "," and "+" characters so the text can be read as a float
        n_installs = app[5].replace(",", "").replace("+", "")
        total += float(n_installs)
        len_category += 1
    avg_n_installs = total / len_category
    print(category, ":", avg_n_installs)
# Breakdown of "COMMUNICATION" apps in the three largest install brackets
top_install_brackets = ("1,000,000,000+", "500,000,000+", "100,000,000+")
for app in android_final:
    if app[1] == "COMMUNICATION" and app[5] in top_install_brackets:
        print(app[0], ":", app[5])
# Average installs for "COMMUNICATION" apps, leaving out the apps that have
# 100 million installs or more.
under_100_m = []
for app in android_final:
    n_installs = app[5]
    # Need to remove the , and + from the string to convert them to a float
    n_installs = n_installs.replace(",", "")
    n_installs = n_installs.replace("+", "")
    if (app[1] == "COMMUNICATION") and (float(n_installs) < 100000000):
        under_100_m.append(float(n_installs))
# Print the average: as a bare expression its value is discarded when the
# file runs as a script.
print(sum(under_100_m) / len(under_100_m))
# Breakdown of "BOOKS_AND_REFERENCE" apps: every app with its install bracket
for app in android_final:
    if app[1] == "BOOKS_AND_REFERENCE":
        print(app[0], ":", app[5])

# Same category, limited to the three largest install brackets
largest_brackets = ("1,000,000,000+", "500,000,000+", "100,000,000+")
for app in android_final:
    if app[1] == "BOOKS_AND_REFERENCE" and app[5] in largest_brackets:
        print(app[0], ":", app[5])

# Same category, limited to the brackets from 1 million to 50 million installs
mid_brackets = ("1,000,000+", "5,000,000+", "10,000,000+", "50,000,000+")
for app in android_final:
    if app[1] == "BOOKS_AND_REFERENCE" and app[5] in mid_brackets:
        print(app[0], ":", app[5])
# In this project, we analyzed data about the App Store and Google Play mobile apps with the goal of recommending an app profile that can be profitable for both markets.
# We concluded that taking a popular book (perhaps a more recent book) and turning it into an app could be profitable for both the Google Play and the App Store markets. The markets are already full of libraries, so we need to add some special features besides the raw version of the book. This might include daily quotes from the book, an audio version of the book, quizzes on the book, a forum where people can discuss the book, etc.