#!/usr/bin/env python
# coding: utf-8

# # App Profile Profitability Across iOS and Android
#
# As a company focused on developing free mobile apps, in-app ads are a key income source. Driving downloads of created applications drives revenue based on increased views and interactions with these in-app ads. Therefore, it is key to identify app profiles which deliver the most profitability in this sense.
#
# The goal of this project is to do just that: identify which app profiles will consistently deliver the most ad traffic, and, therefore, revenue.

# In[1]:


from csv import reader

###Google Play Data###
# NOTE(review): assumes the CSV export is UTF-8 encoded -- confirm against the
# data files. `newline=''` is what the csv module recommends for file objects,
# and the `with` block guarantees the handle is closed after reading.
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
android_header = android[0]
android = android[1:]

###iOS Store Data###
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]


# In[2]:


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the data set size.

    Args:
        dataset: List of rows, where each row is itself a list of values.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns (taken from the first row).

    Returns:
        None. All output goes to stdout.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)


print(android_header)
print('\n')
explore_data(android, 0, 3, True)


# In[3]:


print(ios_header)
print('\n')
explore_data(ios, 0, 3, True)


# The android data set returned 10,841 entries with 13 columns. The columns that appear most useful are 'App', 'Category', 'Reviews', 'Installs', 'Type', 'Price', and 'Genre'.
#
# The ios data set returned 7,197 entries with 16 columns. The columns in this group to focus on appear to be 'track_name', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', and 'prime_genre'. These column names are not as intuitive, so descriptions can be found [here](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps/home).
# In[4]:


print(android[10472])  # bad data
print('\n')
print(android_header)  # header for column count
print('\n')
print(android[0])  # good row example for comparison


# In[5]:


print(len(android))
# Only delete the row while it is still malformed, so re-running this cell
# cannot remove a valid row that has shifted into index 10472.
# NOTE(review): presumes the bad row has a different column count than the
# header (the comparison printed above) -- confirm against the data.
if len(android[10472]) != len(android_header):
    del android[10472]
print(len(android))


# The row above was excluded because it contains erroneous data. Upon further review, several rows in the Google Play data are duplicates, as seen when a pass is made through the data looking for 'Instagram'.

# In[6]:


for app in android:
    name = app[0]
    if name == 'Instagram':
        print(app)


# In total, 1181 instances of duplicated data.

# In[7]:


duplicate_apps = []
unique_apps = []
seen_names = set()  # O(1) membership checks; unique_apps keeps first-seen order

for app in android:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps: ', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])


# The plan to exclude duplicates will be tied to total number of reviews. In theory, a higher review count means the data is more recent. Using this logic, only the row with the highest review count will be included in the final data set.
#
# This will involve creating a dictionary that contains each app name paired with its highest review count; followed by using that dictionary to create a working data set with one entry per app associated with the highest listed review count.

# In[8]:


reviews_max = {}

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # Keep the largest review count seen so far for each app name.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews


# In[9]:


# Derive the expected size from the duplicates actually found rather than a
# hard-coded count, so the check stays valid if the data changes.
print('Expected length:', len(android) - len(duplicate_apps))
print('Actual length:', len(reviews_max))


# In[10]:


android_clean = []
already_added = set()

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # Several rows can share the same (maximum) review count, so also track
    # which names were already kept to guarantee one row per app.
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.add(name)


# In[11]:


explore_data(android_clean, 0, 3, True)


# There are no duplicates in the ios data set.
#
# Our next step will be to remove all non-english apps, given that our company only writes apps for an english speaking audience. To do this we will check text strings against ASCII data (0-127 are english characters).

# In[12]:


def is_english(string):
    """Return True only if every character of *string* is ASCII (code <= 127).

    First, strict version; it is replaced by a more tolerant definition below.
    """
    for character in string:
        if ord(character) > 127:
            return False
    return True


print(is_english('Instagram'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))


# Quick check of the function is fine, but some english apps utilize special characters. These apps would be excluded using this function as seen below.

# In[13]:


print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))
print(ord('™'))
print(ord('😜'))


# To minimize the loss of data, only apps with more than 3 characters outside of the ASCII range (0-127) will be excluded.

# In[14]:


def is_english(string):
    """Return True if *string* has at most 3 characters outside ASCII (0-127).

    Tolerating a few non-ASCII characters keeps names that merely contain
    symbols such as a trademark sign or an emoji.
    """
    non_ascii = 0
    for character in string:
        if ord(character) > 127:
            non_ascii += 1
    return non_ascii <= 3


print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))


# In[15]:


# The app name is column 0 in the Android rows and column 1 in the iOS rows.
android_english = [app for app in android_clean if is_english(app[0])]
ios_english = [app for app in ios if is_english(app[1])]

# explore_data prints its own output and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)


# The data has now been scrubbed of all inaccurate, duplicated, and non-english apps. It has left data sets of 9,614 rows for Android, and 6,183 rows for ios.
#
# The company only creates free apps, so the last step is to extract all the free apps.

# In[16]:


# Prices are kept as the raw CSV strings: free apps are '0' in the Android
# data (column 7) and '0.0' in the iOS data (column 4).
android_final = [app for app in android_english if app[7] == '0']
ios_final = [app for app in ios_english if app[4] == '0.0']

print(len(android_final))
print(len(ios_final))


# The data is now scrubbed down to only include free and english apps.
# It also excludes all inaccurate and duplicated rows.
#
# We are left with 8864 Android rows, and 3222 ios rows. This constitutes enough data to begin our analysis.

# ## Analysis
#
# Our goal is to determine which app types attract the most revenue, because revenue is highly influenced by number of app users.
#
# To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:
#
# 1. Build a minimal Android version of the app, and add it to Google Play.
# 2. If the app has a good response from users, we develop it further.
# 3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.
#
# The end goal is for apps on both platforms, so it is important to find app profiles that are successful across both operating systems.
#
# To begin, the most common genres for each market need to be identified. To accomplish this, frequency tables for certain columns will be created for each data set.

# In[17]:


def freq_table(dataset, index):
    """Build a percentage frequency table for one column of *dataset*.

    Args:
        dataset: Iterable of rows (indexable sequences).
        index: Position of the column to tabulate within each row.

    Returns:
        dict mapping each distinct column value to the percentage (0-100) of
        rows holding that value, in order of first appearance. An empty
        data set yields an empty dict.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    total = sum(counts.values())
    return {key: (count / total) * 100 for key, count in counts.items()}


def display_table(dataset, index):
    """Print the frequency table of a column, highest percentage first.

    Args:
        dataset: Iterable of rows (indexable sequences).
        index: Position of the column to tabulate within each row.

    Returns:
        None. One "value : percentage" line is printed per distinct value.
    """
    table = freq_table(dataset, index)
    # Sort on (percentage, value) tuples so ties on percentage are broken by
    # the value itself, in descending order.
    table_sorted = sorted(
        ((percentage, key) for key, percentage in table.items()),
        reverse=True,
    )
    for percentage, key in table_sorted:
        print(key, ':', percentage)


# In[18]:


display_table(ios_final, -5)


# Upon review of the above ios data it is seen that Games are the most common genre, with Entertainment as the runner-up. There is a steep drop-off seen after games that should be noted. The majority of apps are designed for entertainment. This data alone points to avoiding Games as the genre is heavily saturated.
# This could also point to a high adoption rate, but not a high utilization rate. It would be difficult for an app to really stand out without massive overhead increases for design.

# In[19]:


display_table(android_final, 1)


# Android apps skew in the opposite direction with more apps focused on practicality in lieu of entertainment when Category data is viewed. Family is the highest rated category with Games as runner-up. It should be noted that apps in the Family category are primarily apps for younger children. The focus on practicality is further illustrated when the percentages are listed for the Genre data below.

# In[20]:


display_table(android_final, -4)


# The difference between Category and Genre comes to the forefront when granularity is considered. The big picture is all that needs to be considered, so only Category will be considered going forward.
#
# The next step is to get an idea of which app profiles have the most users. In Android we can just compare the Installs, but ios is a little different. Install data isn't tracked, so user ratings will be substituted using average number of ratings per genre in place of Installs.

# In[21]:


genres_ios = freq_table(ios_final, -5)

# Group the rating counts by genre in a single pass over the data instead of
# re-scanning every app once per genre.
ratings_by_genre = {}
for app in ios_final:
    ratings_by_genre.setdefault(app[-5], []).append(float(app[5]))

for genre in genres_ios:
    genre_ratings = ratings_by_genre[genre]
    avg_n_ratings = sum(genre_ratings) / len(genre_ratings)
    print(genre, ':', avg_n_ratings)


# Navigation has the highest average number of ratings, but this can be attributed to included navigation services or third party apps like WAZE which have a high user count as seen below.

# In[22]:


for app in ios_final:
    if app[-5] == 'Navigation':
        # Print each app's own rating count (column 5). The previous version
        # printed avg_n_ratings, a leftover from the loop above that is the
        # same value for every app listed.
        print(app[1], ':', app[5])


# Music and Social Networking fall victim to this same phenomenon due to apps for music like Spotify and Pandora; or Facebook and Instagram for social networking.
#
# This pattern makes Navigation, Music, and Social Networking look like more popular genres than they are in actuality.
#
# The Reference category presents this same dilemma because, although the genre has 74,942 ratings, the majority are for two apps: The Bible and Dictionary.com. This is illustrated below as well.

# In[23]:


for app in ios_final:
    if app[-5] == 'Reference':
        # Print each app's own rating count (column 5). The previous version
        # printed avg_n_ratings, a stale value left over from an earlier cell
        # that is identical for every app listed.
        print(app[1], ':', app[5])


# Despite this, there might be some potential here. The Bible and the dictionary are two key reference materials for many users, so they are often utilized by those that would download them. Perhaps another reference book could be added with enhanced features. The app could send daily quotes, include the audio book, or even feature quizzes that match the user to characters.
#
# Factoring in the assumptions listed above about games, i.e. over-saturation of the genre and therefore recreational apps, this might be a good fit pending comparison to the Android data.

# In[24]:


display_table(android_final, 5)


# The Android Installs are good data, but a little imprecise. The numbers will be left alone and extra characters excluded. This means an app rated at 500+ will be assumed to be 500 even if it could be up to 999. Again, it is just the big picture being considered.

# In[25]:


categories_android = freq_table(android_final, 1)

# Group the install counts by category in a single pass over the data instead
# of re-scanning every app once per category.
installs_by_category = {}
for app in android_final:
    # Installs look like '1,000,000+'; strip the '+' and ',' before converting.
    n_installs = float(app[5].replace('+', '').replace(',', ''))
    installs_by_category.setdefault(app[1], []).append(n_installs)

for category in categories_android:
    category_installs = installs_by_category[category]
    avg_n_installs = sum(category_installs) / len(category_installs)
    print(category, ':', avg_n_installs)


# Communication has the highest number of installs, but this is skewed by a few apps with over a billion installs as seen below.
# In[26]:


# Install buckets of 100,000,000 and above, as they appear in the raw data.
installs_over_100_m = ('1,000,000,000+', '500,000,000+', '100,000,000+')

for app in android_final:
    if app[1] == 'COMMUNICATION' and app[5] in installs_over_100_m:
        print(app[0], ':', app[5])


# The average installs drop by roughly 10 times when all Communication apps with over 100,000,000 installs are excluded, per below.

# In[27]:


under_100_m = []

for app in android_final:
    # Installs look like '1,000,000+'; strip the '+' and ',' before converting.
    n_installs = float(app[5].replace('+', '').replace(',', ''))
    if (app[1] == 'COMMUNICATION') and (n_installs < 100000000):
        under_100_m.append(n_installs)

# A bare expression is only displayed inside a notebook; print it so the
# result is also visible when this file runs as a script. The guard avoids a
# ZeroDivisionError when no app matches.
if under_100_m:
    print(sum(under_100_m) / len(under_100_m))
else:
    print('No COMMUNICATION apps under 100,000,000 installs')


# This trend follows fairly closely with the Genres mentioned from ios, especially Games and Social. It is worth noting that many of the apps on Android have some big players that would be hard to compete against, i.e. Google. Google has many apps across the categories with the most installs.
#
# Books and Reference seem quite popular on Android, but it is worth looking at more closely given its promise on the ios platform.

# In[28]:


for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE':
        print(app[0], ':', app[5])


# This category houses more than just the Books genre in ios. There are multiple catalogs, apps, and reference materials for subjects like coding, religion, genealogy, and ebook readers.
#
# Again, the trend seems to be a few highly installed apps are distorting our view.

# In[29]:


for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE' and app[5] in installs_over_100_m:
        print(app[0], ':', app[5])


# Accounting for these apps, there seems to be some room left in the middle of the pack. The list of apps with 1,000,000 to 100,000,000 installs is below.
# In[30]:


# Install buckets from 1,000,000 up to (but not including) 100,000,000, as
# they appear in the raw data.
mid_range_installs = ('1,000,000+', '5,000,000+', '10,000,000+', '50,000,000+')

for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE' and app[5] in mid_range_installs:
        print(app[0], ':', app[5])


# This portion of the category is full of references in the form of dictionaries and libraries, along with software for reading ebooks. There are several apps for the Quran, which indicates there is potential to turn an existing book into a good reference app. This market is flooded with simple library apps, so whatever book is developed must have new features (quizzes, audio, quote generator, forums) to stand out further. It is also recommended that the