#!/usr/bin/env python
# coding: utf-8

# # Best money-making apps in the Google Play Market and App Store
#
#
#
# # Introduction
# Many people use smartphones nowadays.
# Companies creating apps have to be proactive and inventive to
# impress clients.
#
# We work as data scientists for a company that builds free-to-use apps. Our main revenue consists of in-app ads.
#
# The goal of this project is to analyze datasets from the App Store and the Google Play Market and find app profiles that are:
#
# * Attractive for users
# * Free of charge
# * For an English-speaking audience
#
# Dataset containing around 10,000 Android apps from Google Play: [Link](https://dq-content.s3.amazonaws.com/350/googleplaystore.csv)
#
# Dataset containing around 7,000 iOS apps from the App Store: [Link](https://dq-content.s3.amazonaws.com/350/AppleStore.csv)
#
# # Explore datasets

# In[1]:


# Open the two data sets and turn both into lists of lists.
from csv import reader

# App Store data set.
# `newline=''` is what the csv module asks for on files handed to reader(),
# and the `with` block closes the handle as soon as the rows are loaded.
with open('AppleStore.csv', encoding='utf8', newline='') as AppleFile:
    apple_apps = list(reader(AppleFile))
apple_header = apple_apps[0]  # Header of the App Store dataset
apple = apple_apps[1:]        # App Store dataset without the header

# Google Play data set.
with open('googleplaystore.csv', encoding='utf8', newline='') as GoogleFile:
    google_apps = list(reader(GoogleFile))
google_header = google_apps[0]  # Header of the Google Play Market dataset
google = google_apps[1:]        # Google Play Market dataset without the header


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` in a readable way.

    Each row is printed followed by a blank line.

    Args:
        dataset: List of rows (a list of lists).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the number of rows and the
            number of columns (measured on the first row).
    """
    for row in dataset[start:end]:
        print(row, '\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with IndexError.
        print('Number of columns', len(dataset[0]) if dataset else 0)


# Print out several rows of the `Google Play Store` data set to get the general information.
#
#

# In[2]:


print(google_header)
explore_data(google, 1, 6, True)


# Now let's take a look at the `App Store` data set:

# In[3]:


print(apple_header)
explore_data(apple, 1, 6, True)


# # Data Cleaning
# Before entering into the analysis we clean the data, which includes:
#
# * deleting wrong or incorrect data
# * removing duplicate data
# * modifying data (if needed) to reach the goal of the analysis
#
#
#
# ## 1. Remove wrong data
# The `Google Play` dataset has a [discussion](https://www.kaggle.com/lava18/google-play-store-apps/discussion) section which [contains](https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015) a report about row 10472.
# The problem is a missing value in the column 'Category'.
#
# To make sure that the other rows in `Google Play` have the same length we execute the following check.

# In[4]:


errors_g = []
for i, row in enumerate(google):
    # A row whose length differs from the header's is malformed.
    if len(row) != len(google_header):
        print('Row ', i, ' contains errors.')
        print(row)
        errors_g.append(i)  # Remember the row number of the faulty entry

# Delete from the highest index down: removing a row shifts every later row
# one position up, which would invalidate the remaining recorded indexes.
for e in reversed(errors_g):
    del google[e]
    print('Row ', e, ' deleted')


# Perform the length check for the `App Store` dataset as well:

# In[5]:


errors_a = []
for i, row in enumerate(apple):
    # A row whose length differs from the header's is malformed.
    if len(row) != len(apple_header):
        print('Row ', i, ' contains errors.')
        print(row)
        errors_a.append(i)  # Remember the row number of the faulty entry

# The indexes in errors_a refer to `apple`, so the rows must be removed from
# `apple` (highest index first, for the same reason as above).
for e in reversed(errors_a):
    del apple[e]
    print('Row ', e, ' deleted')


# ## 2. Remove Duplicate Entries
#
#
# The following examination of the `Google Play Store` dataset reveals that it contains duplicate data.
#
# For instance:

# In[6]:


for item in google:
    name = item[0]
    if name == 'KakaoTalk: Free Calls & Text':
        print(item)


# Let's count the number of duplicate and unique apps in the `Google Play Store` dataset:

# In[7]:


duplicate = []
unique = []
# Membership is tested against a set so each lookup is O(1); `unique` is kept
# as a list so the first-seen order of app names is preserved.
seen_names = set()

for item in google:  # Loop over the Google Play Store dataset
    name = item[0]
    if name in seen_names:
        duplicate.append(name)  # Name was seen before: record it as a duplicate
    else:
        seen_names.add(name)
        unique.append(name)

print('Number of duplicate apps: ', len(duplicate))
print()
print('Examples of duplicate apps: ', duplicate[0:17])


# Duplicate entries of apps should be deleted to avoid multi-counting. To choose a criterion for removal let's examine rows with duplicate data. The main difference is located in column 4, which holds the number of reviews.
#
# The higher the number of reviews, the more recent the data is.
# Instead of removing rows randomly we keep only the row with the highest review
# number.
#
# Following this procedure for 'Google Allo', the entry with the highest review count, 347086, remains; every other entry will be removed.
#
# ['Google Allo', 'COMMUNICATION', '4.3', **'346982'**, 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Communication', 'January 23, 2018', 'Varies with device', '4.1 and up']
#
# ['Google Allo', 'COMMUNICATION', '4.3', **'346980'**, 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Communication', 'January 23, 2018', 'Varies with device', '4.1 and up']
#
# ['Google Allo', 'COMMUNICATION', '4.3', **'347086'**, 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Communication', 'January 23, 2018', 'Varies with device', '4.1 and up']
#
#
#
#
# Using a dictionary we create a new list with the dataset. The new dataset will contain only one entry per app, and for each app the entry with the highest review count.

# In[8]:


# To get rid of duplicated data we create a dictionary {reviews_max} containing
# unique apps in the format- name : highest number of reviews.
# NOTE(review): the text of this cell was lost between `reviews_max[name]` and
# `> 127`. The statements from the comparison below up to the `ascii`
# definition are reconstructed from the surrounding comments (name -> highest
# number of reviews; `android_clean` = unique Android apps, used in In[11]).
# Confirm against the original notebook.
reviews_max = {}
for app in google:
    name = app[0]
    n_reviews = float(app[3])
    # Keep the largest review count seen so far for each app name.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

# Build the de-duplicated dataset: for every app keep the single row whose
# review count equals the maximum recorded in reviews_max.
android_clean = []
already_added = set()  # Guards against several rows sharing the same maximum
for app in google:
    name = app[0]
    n_reviews = float(app[3])
    if reviews_max[name] == n_reviews and name not in already_added:
        android_clean.append(app)
        already_added.add(name)


# NOTE(review): a section heading and the cell marker In[9] appear to have
# been lost here together with the code above (sections 2 and 4 exist in the
# file) — TODO restore from the original notebook.
#
# The function below treats a name as English unless it holds more than three
# characters with a code point > 127.
def ascii(any_string):
    """Return True if ``any_string`` has at most 3 characters with code > 127.

    Code points 0-127 cover English letters and special symbols such as
    !, #, ?, @. Up to three characters outside that range are tolerated.

    The name shadows the built-in ``ascii``; it is kept because the cells
    below call it under this name.

    Args:
        any_string: The text to inspect (an app name).

    Returns:
        False when more than three characters have a code point above 127,
        True otherwise (including for the empty string).
    """
    non_ascii = sum(1 for char in any_string if ord(char) > 127)
    return non_ascii <= 3


# In[10]:


# Test the ascii function
print(ascii('Flame - درب عقلك يوميا'))
print(ascii('বাংলা টিভি প্রো BD Bangla TV'))
print(ascii('Cъновник BG'))
print(ascii('Instachat 😜'))
print(ascii('Bonjour 2017 Abidjan CI ❤❤❤❤❤'))


# In[11]:


# Separate English-based apps and save them in the lists eng_google and eng_apple
eng_google = []
eng_apple = []

for app in android_clean:  # android_clean: unique Android apps, one row per app
    name = app[0]
    if ascii(name):  # At most 3 symbols outside the range 0-127 ===> keep the app in eng_google
        eng_google.append(app)

for app in apple:  # Loop over apple directly; it is used here without a de-duplication step
    name = app[1]
    if ascii(name):  # At most 3 symbols outside the range 0-127 ===> keep the app in eng_apple
        eng_apple.append(app)

print('Number of English apps in the Google Play Market dataset: ', len(eng_google))
print('Number of English apps in the App Store dataset: ', len(eng_apple))


# ## 4. Isolation of Free Apps
#
# To attract more users our goal is to focus on apps that are free to download and use. As mentioned in the introduction, our main source of income is in-app ads. In this section, we separate the free apps and save them, leaving the non-free apps out.
# In[12]:


# Prices are compared as the raw strings stored in each dataset:
# column 7 of a Google Play row and column 4 of an App Store row.
free_google = [row for row in eng_google if row[7] == '0']
free_apple = [row for row in eng_apple if row[4] == '0.0']

print('Quantity of free English apps in the Google Play Market dataset:', len(free_google))
print('Quantity of free English apps in the App Store dataset:', len(free_apple))


# # Plan for analysis
#
# So far we have cleaned the data to prepare it for analysis.
# Before proceeding with the analysis we choose a strategy to reach the goal and avoid large costs.
#
# We will stick to the following plan:
#
# 1. Find an app profile that seems to be attractive for users in both `Google Play` and the `App Store`.
#
# 2. Create a version of the app and place it in the `Google Play Store`. In case of positive feedback we will develop the app further.
#
# 3. In case of positive profit, create a version of the app for the `App Store`.
#
#
#
#
#
#
#
# ## 1. The Most Common Genres for each market
#
# The `App Store` dataset has a column `genre` (column index 11 below).
# The `Google Play Market` dataset has columns `category` and `genre` (column indexes 1 and 9 below).
#
# We will start by determining the most common genres or categories for each market.
#

# In[13]:


def freq_table(dataset, index):
    """Return the percentage share of every value found in one column.

    Args:
        dataset: List of rows (a list of lists).
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct value of the column to its share of the
        rows, in percent, rounded to two decimals.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    n_rows = len(dataset)
    return {
        value: round(count / n_rows * 100, 2)
        for value, count in counts.items()
    }


def display_table(dataset, index):
    """Print the frequency table of a column, largest share first.

    Entries are ordered as (percentage, value) tuples in descending order, so
    equal percentages are ordered by value, also descending.

    Args:
        dataset: List of rows (a list of lists).
        index: Position of the column to tabulate.
    """
    shares = freq_table(dataset, index)
    ranked = [(share, value) for value, share in shares.items()]
    ranked.sort(reverse=True)
    for share, value in ranked:
        print(value, ':', share)


# In[14]:


# List of GENRES in the App Store sorted by percentages
display_table(free_apple, 11)


# The most common genre among English, free apps of the `App Store` is `Games`. The next most common is `Entertainment`.
# The least common are `Catalogs` and `Medical`.
#
# The general impression is that most of the apps are created for entertainment (games, social networking, video, shopping, etc.), but this does not imply that apps created for fun have the greatest number of users.
#
# 58% of all apps are related to gaming in particular.
# Based only on the analysis of common genres it is hard to recommend an app profile.

# In[15]:


# List of CATEGORIES in the Google Play Market sorted by percentages
display_table(free_google, 1)


# The most common category of the `Google Play Market` is `Family`. The next most common is `Game`.
# The least common are `Beauty` and `Comics`.
#
# If we explore `Family` thoroughly, we can see that the category includes mostly game apps for kids.
# It means that in reality games have a share of 28.63%.
#
# The general impression is that apps designed for entertainment purposes are popular, but they are in balance with apps for practical purposes.
#
#

# In[16]:


# List of GENRES in the Google Play Market sorted by percentages
display_table(free_google, 9)


# The most popular genre in the `Google Play Market` is `Tools` and the next most popular is `Entertainment`.
#
# It is very difficult to find the difference between `Category` and `Genre`. It seems that some categories and genres duplicate each other. `Genre` has more subdivisions than `Category`. Right now we are trying to get the general picture and will not use `Categories` in the analysis.
# NOTE(review): the download analysis in cell In[24] groups by column 1, which cell In[15] labels as the category column — confirm which column this sentence means to drop.
#
#
# **Summary:**
#
# Note: Only English apps are in the scope of our analysis.
#
# 1. The `App Store` is dominated by apps designed for entertainment. Genres such as `Games`, `Entertainment` and `Photo & Video` have the largest share.
#
# 2. The `Google Play Market` has a more even landscape; practical and fun apps are balanced. Apps for gaming are still in the majority.
#
# ## 2. Most Popular Apps by reviews in the App Store
#
# We have the distribution of apps by genres and categories; now we determine the types of apps with the most users.
#
# The `Google Play Market` has the column `installs`, so it is possible to calculate the average number of downloads for each genre.
#
# In the `App Store` the column `installs` is missing, so we will take the total number of user ratings as a substitute.
# The total number is in the column `rating_count_tot`.
#
#

# In[17]:


def freq_table_genre(dataset, index):
    """Return how many rows carry each value of one column.

    Args:
        dataset: List of rows (a list of lists).
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct value of the column to its row count,
        with keys in first-seen order.
    """
    freq_dict = {}
    for app in dataset:
        item = app[index]
        freq_dict[item] = freq_dict.get(item, 0) + 1
    return freq_dict


# prime_genre holds- genre : number of apps, computed on the free English
# App Store apps (free_apple), column 11.
prime_genre = freq_table_genre(free_apple, 11)

# Sum the user-rating counts (column 5) per genre in a single pass over the
# dataset instead of rescanning every row once per genre.
rating_totals = {}
for item in free_apple:
    genre_app = item[11]
    rating_totals[genre_app] = rating_totals.get(genre_app, 0) + float(item[5])

# list_ratings holds- genre : average number of user ratings, rounded to two
# decimals through the '.2f' format.
list_ratings = {}
for genre in prime_genre:
    average = format(rating_totals[genre] / prime_genre[genre], '.2f')
    list_ratings[genre] = float(average)

# Sort list_ratings in descending order
table_display = []
for key in list_ratings:
    key_val_as_tuple = (list_ratings[key], key)
    table_display.append(key_val_as_tuple)

table_sorted = sorted(table_display, reverse=True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])


#
# The `Navigation` genre has the highest rating. Meanwhile the genre rating seems to be influenced by the quite high ratings of the following apps:
#
# * Waze - GPS Navigation, Maps & Real-time Traffic : 345046
# * Google Maps - Navigation & Transit : 154911
#
# Excluding the two apps above from the list will help us to see that `Navigation` is not very interesting for users.

# In[18]:


navigation_outliers = {
    'Waze - GPS Navigation, Maps & Real-time Traffic',
    'Google Maps - Navigation & Transit',
}
length = 0
total = 0
for j in free_apple:
    name = j[1]
    genre = j[11]
    if genre == 'Navigation' and name not in navigation_outliers:
        length += 1
        total += float(j[5])
average = total / length
print('Navigation rating:', average)


# `Reference` is the genre with the next highest rating.
# The following apps skew the rating of the genre because of their own high ratings:
# * Bible : 985920
# * Dictionary.com Dictionary & Thesaurus : 200047
# * Dictionary.com Dictionary & Thesaurus for iPad : 54175

# In[19]:


# Average number of user ratings (column 5) for `Reference`, leaving out the
# three apps listed above.
reference_outliers = (
    'Bible',
    'Dictionary.com Dictionary & Thesaurus',
    'Dictionary.com Dictionary & Thesaurus for iPad',
)
length = 0
total = 0
for j in free_apple:
    if j[11] != 'Reference' or j[1] in reference_outliers:
        continue
    length += 1
    total += float(j[5])
average = total / length
print('Reference rating:', average)


#
# At first sight the result of `Social Networking` is skewed by headliners such as Facebook, Pinterest, WhatsApp etc.
# If we remove the ratings of several "top apps" the rating of `Social Networking` remains rather high. That allows us to conclude that `Social Networking` is popular.
#

# In[20]:


# Same computation for `Social Networking` without its best-known apps.
social_outliers = (
    'Facebook',
    'Pinterest',
    'WhatsApp',
    'Skype for iPhone',
    'Messenger',
    'Kik',
)
length = 0
total = 0
for j in free_apple:
    if j[11] != 'Social Networking' or j[1] in social_outliers:
        continue
    length += 1
    total += float(j[5])
average = total / length
print('Social Networking rating:', average)


# Let's see what happens to `Food & Drink` and `Shopping` if we remove some high-rated apps.
# In[21]:


# Average number of user ratings (column 5) for `Food & Drink` without the
# two apps named below.
food_outliers = {'Starbucks', "Domino's Pizza USA"}
length = 0
total = 0
for j in free_apple:
    name = j[1]
    genre = j[11]
    if genre == 'Food & Drink' and name not in food_outliers:
        length += 1
        total += float(j[5])
average = total / length
print('Food & Drink rating:', average)


# In[22]:


# Same computation for `Shopping`.
# The original filter listed 'Wish - Shopping Made Fun' twice and spelled the
# Groupon name with a trailing space. Names are stripped before the comparison
# so Groupon is excluded whether or not the dataset value carries that
# trailing space.
# NOTE(review): confirm the exact Groupon track name in AppleStore.csv.
shopping_outliers = {
    'Groupon - Deals, Coupons & Discount Shopping App',
    'Wish - Shopping Made Fun',
}
length = 0
total = 0
for j in free_apple:
    name = j[1]
    genre = j[11]
    if genre == 'Shopping' and name.strip() not in shopping_outliers:
        length += 1
        total += float(j[5])
average = total / length
print('Shopping rating:', average)


# In[23]:


# This helps to explore the apps belonging to a certain genre and their total rating count (rating_count_tot)
for j in free_apple:
    genre = j[11]
    app = j[1]
    rating_count_tot = j[5]
    if genre == 'Photo & Video':
        print(app, ' : ', rating_count_tot)


# **Outputs**
#
# The following genres are skewed by strong market players:
# * `Navigation` - Waze, Google Maps.
# * `Reference` - Bible, Dictionary.com Dictionary & Thesaurus.
# * `Photo & Video` - Instagram, Snapchat, YouTube.
# * `Book` - Kindle, Audible.
# * `Food & Drink` - Starbucks, Domino's Pizza USA.
#
# `Social Networking` is influenced by Facebook, Pinterest, Skype, WhatsApp, Kik, but even without them the genre seems to be popular.
# Creating an app in such a genre means business competition with the leaders.
#
#
# In spite of the impact of Pandora, Spotify and Shazam on the `Music` ratings there are plenty of other relatively popular apps.
# We should take into consideration that people might not spend much time browsing in `Music` apps, but rather keep them in the background while listening to music.
#
# The same pattern holds for `Weather`, which has relatively high ratings. People do not tend to spend time in weather apps.
#
#
# `Games` dominate the App Store in terms of the number of apps. If we explore `Games`, we'll see there are plenty of
# apps with very high ratings and with average ratings.
# Going in this direction demands a follow-up analysis of the most popular genres of games.
#
#
# `Food & Drink`, `Finance`, `Travel` - require additional activities, for instance: opening a restaurant, getting some experience in
# cooking, hiring finance professionals, etc.
#
# `Shopping` seems to have potential. This genre still has a quite high rating even if we remove some of the highest-rated apps.
#
#
# ## 3. Most Popular Apps by downloads in the Google Play
#
# There is data about the number of downloads for the `Google Play Market`.
# Since this data is open-ended we do not know the exact number of installs.
# For example the value `10,000+` includes all values greater than 10,000.
# We are going to use those values as they are and consider `10,000+` as `10,000` or `200,000+` as `200,000`.
#

# In[24]:


# prime_category is the frequency table (category : number of apps) of the Google Play Market
prime_category = freq_table_genre(free_google, 1)

# Sum the downloads per category in a single pass, so the dataset is not
# rescanned (and every install string re-parsed) once per category.
install_totals = {}
for item in free_google:
    cat_app = item[1]     # category of the app
    installs = item[5]    # number of downloads, e.g. '10,000,000+'
    installs = installs.replace('+', '')   # removing '+' from the downloads value
    installs = installs.replace(',', '')   # removing ',' from the downloads value
    install_totals[cat_app] = install_totals.get(cat_app, 0) + float(installs)

# inter_list contains categories as keys and the average number of downloads
# per category as values
inter_list = {}
for category in prime_category:
    inter_list[category] = install_totals[category] / prime_category[category]


def display_table_cat(dataset):
    """Print a ``key : value`` line for every dict entry, largest value first.

    Entries are ordered as (value, key) tuples in descending order, so equal
    values are ordered by key, also descending.

    Args:
        dataset: Dict mapping a label to a sortable value.
    """
    table_display = [(dataset[key], key) for key in dataset]
    for entry in sorted(table_display, reverse=True):
        print(entry[1], ':', entry[0])


# Show inter_list in descending order
display_table_cat(inter_list)


# In[28]:


# Helps to explore the content of a category
for app in free_google:
    if app[1] == 'BUSINESS':
        print(app[0], ' : ', app[5])


# It is interesting to investigate what happens to the downloads indicator if we remove some high-rated
# apps from account.

# In[27]:


# The category is fixed for the whole cell, so it is set once before the loop;
# this also keeps `index` defined for the final print when free_google is empty.
index = 'COMMUNICATION'
reduced = []
for app in free_google:
    cat = app[1]
    installs = app[5]
    installs = installs.replace(',', '')
    installs = installs.replace('+', '')
    n_installs = float(installs)  # parse once, use for both the test and the list
    # Leave out all apps at or above 100,000,000 downloads
    if (cat == index) and (n_installs < 100000000):
        reduced.append(n_installs)

print(index, 'updated rating: ', sum(reduced) / len(reduced))


# **Outputs**
#
# The distribution of applications by categories in the `Google Play Market` is different compared to the `App Store`.
#
# For instance:
# * 1) `COMMUNICATION` and `SOCIAL` are two different categories in the `Google Play Market`.
# In the `App Store` we have the `Social Networking` genre.
#
# * 2) The `App Store` has the `Photo & Video` genre, while the `Google Play Market` has the `PHOTOGRAPHY` and `VIDEO_PLAYERS` categories.
#
# It should be taken into account that there are many categories dominated by a few giants.
# For example:
# * `SOCIAL` - Facebook, Google+.
# * `VIDEO_PLAYERS` - YouTube, Motorola Gallery.
# * `TOOLS` - Google, Account Manager.
#
# We are looking for categories of apps that:
# * 1) Users spend plenty of time in.
# * 2) Have a relatively high number of downloads after removing the figures of some top-rated apps.
#
# Here is the list of such categories:
# * `COMMUNICATION`
# * `VIDEO_PLAYERS`
# * `SOCIAL`
# * `GAME`
# * `SHOPPING`
#
#
#
#
#
#
# # Conclusions and results
#
# In this project, we analyzed apps in the Google Play Market and the App Store to find app profiles that are attractive for users in both markets.
#
# The distribution of applications by genre and category may create some obstacles to the analysis in principle. We assume that the distribution is correct from the beginning.
#
# The following categories have potential for new applications:
# * `Social Networking`
# * `Games`
# * `Shopping`
#
#
# It is clear that the market is full of competitive apps in principle.
# Creating an app in these categories may mean competition and the need to attract specialized developers.