#!/usr/bin/env python
# coding: utf-8

# ## Guided Project:
# Profitable App Profiles for the App Store and Google Play Markets

# In[1]:

from csv import reader

### The Google Play data set ###
# The file is opened in a `with` block so the handle is closed as soon as the
# rows are in memory. newline='' is what the csv module asks for, and the
# explicit encoding avoids depending on the platform default.
# NOTE(review): assumes both CSV files are UTF-8 encoded -- confirm.
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
android_header = android[0]  # first row holds the column names
android = android[1:]        # remaining rows are the apps

### The App Store data set ###
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]


# In[2]:

print(android_header)
#print(android[:2])
print("\n")
print(ios_header)
#print(ios[:2])


# In[3]:

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set and, optionally, its dimensions.

    dataset          -- list of lists (one inner list per row)
    start, end       -- integers delimiting the slice ``dataset[start:end]``
    rows_and_columns -- when True, also print the number of rows and the
                        number of columns (taken from the first row)
    """
    for row in dataset[start:end]:
        print(row)
        print("\n")  # Adds a new (empty) line after each row

    if rows_and_columns:
        print("Number of rows:", len(dataset))
        # An empty data set has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print("Number of columns:", len(dataset[0]) if dataset else 0)


explore_data(android, 0, 2, True)
print("----------")
explore_data(ios, 0, 2, True)


# **Data Cleaning Process**
#
# - Detect inaccurate data, and correct or remove it.
# - Detect duplicate data, and remove the duplicates.
# - Remove non-English apps.
# - Remove apps that aren't free.
# In[4]:

# Find errors in the data by comparing each row's length with the header's.
# enumerate() supplies the row number directly; list.index() would rescan the
# list for every hit and would report the first *equal* row rather than the
# current one.
print("Errors in android:")
for row_number, row in enumerate(android):
    if len(row) != len(android_header):
        print(row)
        print("Error row #:", row_number)

print("\n")
print("Errors in ios:")
for row_number, row in enumerate(ios):
    if len(row) != len(ios_header):
        print(row)
        print("Error row #:", row_number)


# In[5]:

# Found from kaggle.com - Wrong rating for entry 10472 (same row as found by
# the length check above). This entry has a missing 'Rating' value, which
# shifted every following column.
bad_row_index = 10472
print(android_header, "\n")
explore_data(android, 10471, 10473)
# Delete the row only while it is still the malformed one. Without this guard,
# running the cell a second time (or against a different file) would silently
# drop a valid app.
if len(android) > bad_row_index and len(android[bad_row_index]) != len(android_header):
    del android[bad_row_index]  # Deleting the incorrect row
explore_data(android, 10471, 10473)


# In[6]:

def _split_duplicates(rows, key_index):
    """Return (duplicates, uniques) for the values found at ``key_index``.

    ``uniques`` lists each distinct value once, in first-seen order;
    ``duplicates`` lists the value again for every repeated occurrence.
    """
    seen = set()  # set membership keeps the scan linear in the number of rows
    uniques = []
    duplicates = []
    for row in rows:
        key = row[key_index]
        if key in seen:
            duplicates.append(key)
        else:
            seen.add(key)
            uniques.append(key)
    return duplicates, uniques


# Find duplicate entries. Android rows are keyed on column 0 (the app name).
# NOTE(review): iOS rows are also keyed on column 0, while the rest of this
# file reads iOS app names from column 1 -- column 0 is presumably an id;
# confirm that de-duplicating on it is intended.
android_duplicate_apps, android_unique_apps = _split_duplicates(android, 0)
ios_duplicate_apps, ios_unique_apps = _split_duplicates(ios, 0)

print("Number of android duplicate apps:", len(android_duplicate_apps))
#print("Examples of duplicate apps:", duplicate_apps[:15])
#print("\n")
print("Number of android unique apps:", len(android_unique_apps))
print("\n")
print("Number of ios duplicate apps:", len(ios_duplicate_apps))
print("Number of ios unique apps:", len(ios_unique_apps))


# In[7]:

# Removing the duplicate entries, based on the number of reviews (column 3).
# Each dictionary key is a unique app name and the corresponding
# dictionary value is the highest number of reviews of that app.
# Use the information stored in the dictionary to create a new
# data set, which will have only one entry per app.
android_reviews_max = {}
ios_reviews_max = {}

# Record, for every Android app name, the largest review count seen.
for app in android:
    name = app[0]
    n_reviews = float(app[3])
    if name not in android_reviews_max or android_reviews_max[name] < n_reviews:
        android_reviews_max[name] = n_reviews

print("Android expected length:", len(android) - len(android_duplicate_apps))
print("Android actual length:", len(android_reviews_max))
#print("\n")
#print("ios expected length:", len(ios) - len(ios_duplicate_apps))
#print("ios actual length:", len(ios_reviews_max))


# In[8]:

# Remove the duplicate rows and create a new clean list
android_clean = []
already_added = []
_already_added_lookup = set()  # constant-time companion to `already_added`

for app in android:
    name = app[0]
    n_reviews = float(app[3])
    # Keep the row carrying the maximum review count, and only the first such
    # row when several rows tie on that maximum.
    if android_reviews_max[name] == n_reviews and name not in _already_added_lookup:
        android_clean.append(app)
        already_added.append(name)
        _already_added_lookup.add(name)

explore_data(android_clean, 0, 2, True)


# In[9]:

# Examples of non-English apps that need to be removed
print(ios[813][1])
print(ios[6731][1])
print("\n")
print(android_clean[4412][0])
print(android_clean[7940][0])


# In[10]:

def is_english(string, max_non_ascii=3):
    """Return True when ``string`` looks like an English app name.

    A name is accepted when it contains at most ``max_non_ascii`` characters
    outside the ASCII range (code points 0-127). Allowing a few such
    characters keeps English names that include emoji or symbols such as
    the trademark sign.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= max_non_ascii


print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))


# In[11]:

# Filter out the non-English apps (Android name: column 0, iOS name: column 1)
android_english = [app for app in android_clean if is_english(app[0])]
ios_english = [app for app in ios if is_english(app[1])]

print("Android apps:")
explore_data(android_english, 0, 2, True)
print("\n")
print("ios apps:")
explore_data(ios_english, 0, 2, True)


# In[12]:

# Remove all paid apps. Prices are compared as the raw strings stored in the
# data: "0" in Android column 7 and "0.0" in iOS column 4.
android_final = [app for app in android_english if app[7] == "0"]
ios_final = [app for app in ios_english if app[4] == "0.0"]

print("Number of free Android apps:", len(android_final))
print("\n")
print("Number of free ios apps:", len(ios_final))


# In[13]:

# Determine the kind of apps that attract most users, using a frequency table
def freq_table(dataset, index):
    """Return a dict mapping each value in column ``index`` to its share (%).

    Keys appear in the order the values are first met in ``dataset``; an
    empty data set yields an empty dict.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    total = sum(counts.values())  # equals the number of rows counted
    return {value: (count / total) * 100 for value, count in counts.items()}


def display_table(dataset, index):
    """Print the frequency table of column ``index``, highest share first.

    Entries are sorted as (percentage, value) tuples in descending order, so
    equal percentages are ordered by value, also descending.
    """
    table = freq_table(dataset, index)
    table_sorted = sorted(((table[key], key) for key in table), reverse=True)
    for percentage, key in table_sorted:
        print(key, ":", percentage)


# In[14]:

# List for ios Apps, % by "prime_genre"
display_table(ios_final, 11)


# In[15]:

# List for Android Apps, % by "Category"
display_table(android_final, 1)


# In[16]:

# List for Android Apps, % by "Genres"
display_table(android_final, 9)


# In[17]:

# Sorting a list
a_list = [50, 100, 20]
print(sorted(a_list))
print(sorted(a_list, reverse = True))
print("\n")

# Sorting a list of tuples (sorting a plain dictionary would only sort its
# keys, so the frequencies are paired with their labels first)
freq_table_as_tuple = [(50, "Genre_1"), (100, "Genre_2"), (20, "Genre_3")]
print(sorted(freq_table_as_tuple))
print(sorted(freq_table_as_tuple, reverse = True))


# In[35]:

# Use a loop inside of a loop
# to calculate the average number of user ratings for each genre
some_strings = ["First", "Second"]
some_integers = [1, 2, 3, 4, 5]
string_integer_list = []

for string in some_strings:
    print(string)
    string_integer_list.append(string)
    for integer in some_integers:
        print(integer)
        string_integer_list.append(integer)

print(string_integer_list)


# In[18]:

# Most Popular Apps by Genre on the App Store
#Calculate the avg number of
# user ratings per app genre

def _installs_to_float(raw_installs):
    """Convert an install bucket such as "1,000,000+" to a float."""
    # The "," and "+" characters have to go before float() can parse the text.
    return float(raw_installs.replace(",", "").replace("+", ""))


def _mean_per_group(dataset, group_index, value_of):
    """Return {group: mean of value_of(row)} for the groups in ``group_index``.

    The data is read once; values are added in row order, so each mean equals
    what a separate scan per group would produce.
    """
    totals = {}
    counts = {}
    for row in dataset:
        group = row[group_index]
        totals[group] = totals.get(group, 0) + value_of(row)
        counts[group] = counts.get(group, 0) + 1
    return {group: totals[group] / counts[group] for group in totals}


genres_ios = freq_table(ios_final, 11)
#avg_app_genre = []
# Column 11 holds the genre and column 5 the number of user ratings.
avg_ratings_by_genre = _mean_per_group(ios_final, 11, lambda app: float(app[5]))
for genre in genres_ios:
    print(genre, ":", avg_ratings_by_genre[genre])
    #avg_app_genre.append(str(avg_n_ratings) + " : " + genre)
#print(sorted(avg_app_genre, reverse = True))


# In[19]:

# Breakdown of "Navigation" Apps
for app in ios_final:
    if app[11] == "Navigation":
        print(app[1], ":", app[5])  # name and number of ratings


# In[20]:

# Breakdown of "Reference" Apps
for app in ios_final:
    if app[11] == "Reference":
        print(app[1], ":", app[5])  # name and number of ratings


# In[21]:

# Number of installs for Android store
display_table(android_final, 5)


# In[22]:

# Calculate the average number of installs per category (column 1)
categories_android = freq_table(android_final, 1)
cat_list = []
avg_installs_by_category = _mean_per_group(
    android_final, 1, lambda app: _installs_to_float(app[5])
)
for category in categories_android:
    print(category, ":", avg_installs_by_category[category])


# In[23]:

# Breakdown of "COMMUNICATION" apps in the three largest install buckets
for app in android_final:
    if app[1] == "COMMUNICATION" and app[5] in ("1,000,000,000+", "500,000,000+", "100,000,000+"):
        print(app[0], ":", app[5])


# In[26]:

# Avg for "COMMUNICATION" with apps removed that have over 100 million installs
under_100_m = []
for app in android_final:
    if app[1] != "COMMUNICATION":
        continue
    n_installs = _installs_to_float(app[5])
    if n_installs < 100000000:
        under_100_m.append(n_installs)

# A bare expression only shows its value inside a notebook; as a script the
# average has to be printed. An empty list is reported instead of dividing
# by zero.
if under_100_m:
    print(sum(under_100_m) / len(under_100_m))
else:
    print("No COMMUNICATION apps under 100 million installs")


# In[27]:

# Breakdown of "BOOKS_AND_REFERENCE" apps
for app in android_final:
    if app[1] == "BOOKS_AND_REFERENCE":
        print(app[0], ":", app[5])


# In[28]:

# Breakdown of "BOOKS_AND_REFERENCE" apps in the three largest install buckets
for app in android_final:
    if app[1] == "BOOKS_AND_REFERENCE" and app[5] in ("1,000,000,000+", "500,000,000+", "100,000,000+"):
        print(app[0], ":", app[5])


# In[30]:

# Breakdown of "BOOKS_AND_REFERENCE" apps in the 1,000,000+ to 50,000,000+
# install buckets
for app in android_final:
    if app[1] == "BOOKS_AND_REFERENCE" and app[5] in ("1,000,000+", "5,000,000+", "10,000,000+", "50,000,000+"):
        print(app[0], ":", app[5])


# In this project, we analyzed data about the App Store and Google Play mobile
# apps with the goal of recommending an app profile that can be profitable for
# both markets.
#
# We concluded that taking a popular book (perhaps a more recent book) and
# turning it into an app could be profitable for both the Google Play and the
# App Store markets. The markets are already full of libraries, so we need to
# add some special features besides the raw version of the book. This might
# include daily quotes from the book, an audio version of the book, quizzes on
# the book, a forum where people can discuss the book, etc.