#!/usr/bin/env python
# coding: utf-8

# # Best money-making apps in the Google Play Market and AppStore 
# 
# 
# 

# # Introduction
# Many people use smartphones nowadays.
# Companies creating apps have to be proactive and inventive to 
# impress clients.
# 
# We work as data scientists for a company that builds free-to-use apps. Our main revenue comes from in-app ads.
# 
# The goal of this project is to analyze datasets from App Store and Google Play Market and find app profiles that are:
# 
# * Attractive for users
# * Free of charge
# * For English speaking audience 
# 
# Dataset containing around 10.000 Android apps from the Google Play:[Link](https://dq-content.s3.amazonaws.com/350/googleplaystore.csv)
# 
# Dataset containing around 7.000 iOS apps from the App Store:[Link](https://dq-content.s3.amazonaws.com/350/AppleStore.csv)
# 

# # Explore datasets

# In[1]:


# Open two data sets
# Turn both into lists of lists
from csv import reader

# AppStore data set
# The `with` block closes the file handle as soon as all rows have been read.
with open('AppleStore.csv', encoding='utf8') as AppleFile:
    apple_apps = list(reader(AppleFile))

apple_header = apple_apps[0]  # Header of App Store dataset
apple = apple_apps[1:]        # App Store dataset without header

# Google Play data set
with open('googleplaystore.csv', encoding='utf8') as GoogleFile:
    google_apps = list(reader(GoogleFile))


google_header = google_apps[0]  # Header of Google Play Market dataset
google = google_apps[1:]        # Google Play Market dataset without header


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of *dataset*, one row per line followed by a blank line.

    Args:
        dataset: Sequence of rows (each row itself a sequence).
        start: Index of the first row to print (inclusive).
        end: Index at which printing stops (exclusive).
        rows_and_columns: When true, also print the total number of rows
            and the number of columns of the first row.
    """
    for row in dataset[start:end]:
        print(row, '\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns', n_columns)


# Print out several rows of `Google Play Store` data set to get the general information.
# 
# 

# In[2]:


# Column names first, then rows 1-5 together with the dataset dimensions.
print(google_header)
explore_data(google, start=1, end=6, rows_and_columns=True)


# Now let's take a look at `App Store` data set:

# In[3]:


# Column names first, then rows 1-5 together with the dataset dimensions.
print(apple_header)
explore_data(apple, start=1, end=6, rows_and_columns=True)


# # Data Cleaning
#  Before entering into the analysis we do data cleaning including:
# 
# * deleting wrong or incorrect data
# * removing duplicate data
# * modifying data (if needed) to reach the goal of analysis
# 
# 
# 
# 
# 

# ## 1.Remove wrong data
# The `Google Play` dataset has a [discussion](https://www.kaggle.com/lava18/google-play-store-apps/discussion) section which [contains](https://www.kaggle.com/lava18/google-play-store-apps/discussion/66015) a report about an error in row 10472.
# The problem is a missing value in the 'Category' column.
# 
# To make sure that all rows in `Google Play` have the same length as the header, we run the following check.
# 
# 

# In[4]:


# Record the index of every Google Play row whose number of fields differs
# from the number of header columns.
errors_g = []
for i, row in enumerate(google):
    if len(row) != len(google_header):
        print('Row ', i, ' contains errors.')
        print(row)
        errors_g.append(i)

# Delete from the highest index down: removing a row shifts every later row
# one position to the left, so deleting in ascending order would remove the
# wrong rows as soon as more than one index is recorded.
for e in reversed(errors_g):
    del google[e]

for e in errors_g:
    print('Row ', e, ' deleted')


# Perform the same length check for the `App Store` dataset:

# In[5]:


# Record the index of every App Store row whose number of fields differs
# from the number of header columns.
errors_a = []
for i, row in enumerate(apple):
    if len(row) != len(apple_header):
        print('Row ', i, ' contains errors.')
        print(row)
        errors_a.append(i)

# The indices were collected from `apple`, so the rows must be removed from
# `apple` (not from `google`). Delete from the highest index down so that
# earlier deletions do not shift the rows still waiting to be removed.
for e in reversed(errors_a):
    del apple[e]

for e in errors_a:
    print('Row ', e, ' deleted')


# ## 2. Remove Duplicate Entries
# 
# 
# Following examination of `Google Play Store` dataset reveals that  it contains duplicate data.  
# 
# For instance:

# In[6]:


# Print every Google Play row carrying this app name to illustrate duplicates.
for item in google:
    if item[0] != 'KakaoTalk: Free Calls & Text':
        continue
    print(item)


# Let's count the quantity of duplicate and unique apps in` Google Play Store` dataset:

# In[7]:


duplicate = []      # Names seen a second (or later) time, one entry per repeat
unique = []         # Names in order of first appearance
seen_names = set()  # Same names as `unique`, kept as a set for O(1) lookups

for item in google:  # Walk over the Google Play Store dataset
    name = item[0]
    if name in seen_names:
        duplicate.append(name)
    else:
        seen_names.add(name)
        unique.append(name)

print('Number of duplicate apps: ', len(duplicate))
print()
print('Examples of duplicate apps: ', duplicate[0:17])


# Duplicate entries of apps should be deleted to avoid counting the same app several times. To choose a criterion for removal, let's examine rows with duplicate data. The main difference is in column 4, which holds the number of reviews.
# 
# The higher the number of reviews, the more recent the data is.   
# Instead of removing rows randomly, we keep only the row with the highest number 
# of reviews.
# 
# Following this procedure for 'Google Allo' the entry with the highest review 347086 remains, any other will be removed.
# 
# ['Google Allo', 'COMMUNICATION', '4.3', **'346982'**, 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Communication', 'January 23, 2018', 'Varies with device', '4.1 and up']
# 
# ['Google Allo', 'COMMUNICATION', '4.3', **'346980'**, 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Communication', 'January 23, 2018', 'Varies with device', '4.1 and up']
# 
# ['Google Allo', 'COMMUNICATION', '4.3', **'347086'**, 'Varies with device', '10,000,000+', 'Free', '0', 'Everyone', 'Communication', 'January 23, 2018', 'Varies with device', '4.1 and up']
# 
# 

# 
# 
# Using dictionary we create new list with dataset. New dataset will contain only one entry per app and for each app highest review.

# In[8]:


# To get rid of duplicated data we first build the dictionary `reviews_max`
# in the format   app name : highest number of reviews seen for that name.
reviews_max = {}
for app in google:
    name = app[0]
    n_reviews = float(app[3])

    # Store the first value seen, then only replace it with a larger one.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews


# `android_clean` keeps exactly one row per app: the first row whose review
# count equals the maximum recorded for that name.
android_clean = []
already_added = []   # Names already kept, in insertion order
added_names = set()  # Same names as a set, so the membership test is O(1)

for app in google:
    name = app[0]
    n_reviews = float(app[3])

    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

print(len(android_clean))


# ## 3. Non-english apps removal.
# 
# Our company uses English to develop apps. Thus in scope of this analysis we keep only apps for English-speaking audience. 
# Exploring our datasets reveals that some apps are designed for non-English users. We remove those apps.

# In[9]:


def ascii(any_string):
    """Tell whether *any_string* has at most three characters above code point 127.

    Characters with code points 0-127 cover English letters, digits and
    common punctuation. Up to three characters outside that range are
    tolerated; more than three make the function return False.

    NOTE: this name shadows the built-in ``ascii``; it is kept because the
    rest of the script calls the helper under this name.
    """
    outside_range = sum(1 for symbol in any_string if ord(symbol) > 127)
    return outside_range <= 3
            

# In[10]:


# Try the ascii helper on a handful of sample app names.
sample_names = (
    'Flame - درب عقلك يوميا',
    'বাংলা টিভি প্রো BD Bangla TV',
    'Cъновник BG',
    'Instachat 😜',
    'Bonjour 2017 Abidjan CI ❤❤❤❤❤',
)
for sample_name in sample_names:
    print(ascii(sample_name))


# In[11]:


# Separate English-named apps into the lists eng_google and eng_apple.
# An app is kept when ascii() accepts its name, i.e. the name has no more
# than three characters outside the 0-127 range.

# Google Play: start from android_clean (one row per app); name is column 0.
eng_google = [app for app in android_clean if ascii(app[0])]

# App Store: start from the full apple list; name is column 1.
eng_apple = [app for app in apple if ascii(app[1])]

print('Number of English apps in the Google Play Market dataset: ', len(eng_google))
print('Number of English apps in the App Store dataset: ', len(eng_apple))


# ## 4. Isolation of Free Apps
# 
#  To attract more users our goal is to focus on free to download and use apps. As mentioned in the introduction - our main source of income is in-built ads. In this section, we separate free apps and save them, while deleting non-free apps. 

# In[12]:


# Keep only the rows whose price field marks the app as free.
# Google Play stores the price in column 7 and uses '0' for free apps.
free_google = [app for app in eng_google if app[7] == '0']

# The App Store stores the price in column 4 and uses '0.0' for free apps.
free_apple = [app for app in eng_apple if app[4] == '0.0']


print('Quantity of free English apps in the Google Play Market dataset:', len(free_google))
print('Quantity of free English apps in the App Store dataset:', len(free_apple))


# # Plan for analysis
# 
# So far we cleaned the data to prepare it for analysis.
# Before proceeding with analysis we choose a strategy to reach the goal and avoid large costs.
# 
# We will stick to the following plan:
# 
# 1. Find an app profile that seems to be attractive for users in both `Google Play` and `App Store`.
# 
# 2. Create version of the app and place it in the `Google Play Store`. In case of positive feedback we will develop app          further.
# 
# 3. In case of positive profit, create version of the app for the `App Store`. 
# 
# 
# 
# 
# 
# 
# 

# ## 1. The Most Common Genres for each market
# 
# The `Google Play Market` dataset has a column `genre`.
# The `App Store` dataset has columns `category` and `genre`.
# 
# We will start by defining the most common genres or categories for each market.
# 

# In[13]:


def freq_table(dataset, index):
    """Return a percentage frequency table for one column of *dataset*.

    Args:
        dataset: Sequence of rows.
        index: Position of the column to tabulate within each row.

    Returns:
        A dict mapping each distinct column value to its share of the rows,
        as a percentage rounded to two decimals. Keys appear in order of
        first occurrence.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    # Turn the absolute counts into percentages of the whole dataset.
    return {
        value: round(count / len(dataset) * 100, 2)
        for value, count in counts.items()
    }


def display_table(dataset, index):
    """Print the percentage frequency table of a column, largest share first.

    The table comes from freq_table(dataset, index). Entries are ordered by
    (percentage, value) in descending order and printed as ``value : percentage``.
    Nothing is returned.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((share, label) for label, share in percentages.items()),
        reverse=True,
    )
    for share, label in ranked:
        print(label, ':', share)
        
  
# In[14]:


# Percentage of free English App Store apps per genre (column 11), largest first
display_table(dataset=free_apple, index=11)


# The most common genre among English, free apps of the `App Store` is `Games`. The next common is `Entertainment`. 
# The least common is `Catalogs` and `Medical`.
# 
# General impression is that most of the apps are created for entertainment ( games, social networking, video, shopping, etc.), but this does not imply that apps created for fun have the greatest amount of users.
# 
# 58% of all apps are related to Gaming in particular.
# Based only on the analysis of common genres is hard to recommend an app profile. 

# In[15]:


# Percentage of free English Google Play apps per category (column 1), largest first
display_table(dataset=free_google, index=1)


# The most common category of `Google Play Market` is `Family`. The next common is `Game`.
# The least common are `Beauty` and `Comics`.
# 
# If we explore `Family` thoroughly, we can see that the category mostly consists of game apps for kids.
# It means that in reality games have a share of 28.63%.
# 
# General impression is that apps designed for entertainment purposes are popular, but they are in balance with apps for practical purposes.
# 
# 

# In[16]:


# Percentage of free English Google Play apps per genre (column 9), largest first
display_table(dataset=free_google, index=9)


# The most popular genre in the `Google Play Market` is `Tools` and the next popular is `Entertainment`.
# 
# It is very difficult to find the difference between `Category` and `Genre`. It seems that some categories and genres duplicate each other. `Genre` has more subdivisions than `Category`. Right now we are looking for the general picture, so the rest of the analysis works with the `Category` column and does not use `Genres`.
# 
# 

# **Summary:**
# 
# Note: Only English apps are in scope of our analysis.
# 
# 1. `App Store` is dominated by apps designed for entertainment. Genres such as `Games`, `Entertainment` and `Photo & Video` have the largest share.
# 
# 2. `Google Play Market` has a more even landscape: practical and fun apps are balanced. Apps for gaming are still in the majority.
# 

# ## 2. Most Popular Apps by reviews in the App Store
# 
# We got the apps distribution by genres and categories, now we determine types of apps with the most users.
# 
# `Google Play Market` has the column `installs`, so it is possible to calculate average number of downloads for each genre.
# 
# In the `App Store` dataset the `installs` column is missing, so we take the total number of user ratings as a substitute.
# This total is stored in the `rating_count_tot` column.
# 
# 

# In[17]:


def freq_table_genre(dataset, index):
    """Count how many rows of *dataset* carry each distinct value in column *index*.

    Returns a dict mapping value -> absolute number of rows, with keys in
    order of first occurrence.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1
    return counts


# prime_genre maps   genre : number of free English apps in the App Store
prime_genre = freq_table_genre(free_apple, 11)


# Add up the rating counts (column 5) of every genre in one pass over the
# dataset instead of rescanning all apps once per genre. Rows are visited in
# the same order as before, so each per-genre sum is accumulated identically.
rating_totals = dict.fromkeys(prime_genre, 0)
for item in free_apple:
    rating_totals[item[11]] += float(item[5])


# list_ratings maps   genre : average number of user ratings (two decimals)
list_ratings = {}
for genre, len_genre in prime_genre.items():
    average = format(rating_totals[genre] / len_genre, '.2f')
    list_ratings[genre] = float(average)


# Show list_ratings in descending order of the average
table_display = [(list_ratings[key], key) for key in list_ratings]

table_sorted = sorted(table_display, reverse=True)
for entry in table_sorted:
    print(entry[1], ':', entry[0])
       
        
# 
# The `Navigation` genre has the highest rating. However, the genre's rating seems to be driven by the very high ratings of the following apps:
# 
# * Waze - GPS Navigation, Maps & Real-time Traffic  :  345046
# * Google Maps - Navigation & Transit  :  154911
# 
# Excluding two apps above from the list will help us to see that `Navigation` is not very interesting for users.

# In[18]:


# Average rating count of `Navigation` apps once the two dominant apps are left out.
excluded = (
    'Waze - GPS Navigation, Maps & Real-time Traffic',
    'Google Maps - Navigation & Transit',
)
length = 0
total = 0
for j in free_apple:
    # Skip other genres and the excluded top apps.
    if j[11] != 'Navigation' or j[1] in excluded:
        continue
    length += 1
    total += float(j[5])
average = total / length
print('Navigation rating:', average)


# `Reference` is the next highest rating genre. Following apps skew the rating of genre because of their-own high rating: 
# * Bible  :  985920
# * Dictionary.com Dictionary & Thesaurus  :  200047
# * Dictionary.com Dictionary & Thesaurus for iPad  :  54175

# In[19]:


# Average rating count of `Reference` apps once the three dominant apps are left out.
excluded = (
    'Bible',
    'Dictionary.com Dictionary & Thesaurus',
    'Dictionary.com Dictionary & Thesaurus for iPad',
)
length = 0
total = 0
for j in free_apple:
    # Skip other genres and the excluded top apps.
    if j[11] != 'Reference' or j[1] in excluded:
        continue
    length += 1
    total += float(j[5])
average = total / length
print('Reference rating:', average)


# 
# At first sight the result of `Social Networking` is skewed by such a headliners as Facebook, Pinterest, WhatsApp etc.
# If we remove ratings of several "top apps" the rating of `Social Networking` remains rather high. That allows us to draw a conclusion that  `Social Networking`  is popular.
# 

# In[20]:


# Average rating count of `Social Networking` apps without the best-known apps.
excluded = (
    'Facebook',
    'Pinterest',
    'WhatsApp',
    'Skype for iPhone',
    'Messenger',
    'Kik',
)
length = 0
total = 0
for j in free_apple:
    # Skip other genres and the excluded top apps.
    if j[11] != 'Social Networking' or j[1] in excluded:
        continue
    length += 1
    total += float(j[5])
average = total / length
print('Social Networking rating:', average)


# Let's see what happens to  `Food & Drink` and `Shopping` if we remove some high-rated apps.

# In[21]:


# Average rating count of `Food & Drink` apps once the two dominant apps are left out.
excluded = ('Starbucks', 'Domino\'s Pizza USA')
length = 0
total = 0
for j in free_apple:
    # Skip other genres and the excluded top apps.
    if j[11] != 'Food & Drink' or j[1] in excluded:
        continue
    length += 1
    total += float(j[5])
average = total / length
print('Food & Drink rating:', average)


# In[22]:


# Average rating count of `Shopping` apps once selected top apps are left out.
# NOTE(review): the previous condition listed 'Wish - Shopping Made Fun' twice,
# so only two distinct names are excluded; a third app may have been intended.
# The trailing space in the Groupon name is kept exactly as written - confirm
# it matches the dataset.
excluded = (
    'Groupon - Deals, Coupons & Discount Shopping App ',
    'Wish - Shopping Made Fun',
)
length = 0
total = 0
for j in free_apple:
    # Skip other genres and the excluded top apps.
    if j[11] != 'Shopping' or j[1] in excluded:
        continue
    length += 1
    total += float(j[5])
average = total / length
print('Shopping rating:', average)


# In[23]:


# Helper cell for exploring one genre: prints each app of the genre together
# with its total rating count (rating_count_tot, column 5).
for j in free_apple:
    if j[11] == 'Photo & Video':
        print(j[1], ' : ', j[5])


# **Outputs**
# 
# Following genres are skewed by strong market players:
# * `Navigation` - Waze, Google Map.
# * `Reference` - Bible, Dictionary.com Dictionary & Thesaurus.
# * `Photo & Video`- Instagram, Snapchat, YouTube.
# * `Book` - Kindle, Audible.
# * `Food & Drink`- Starbucks, Domino's Pizza USA.
# 
# `Social Networking` is influenced by Facebook, Pinterest, Skype, WhatsApp, Kik, but even without them genre seems to be popular.
# Creating an app in such a genre means business competition with the leaders. 
# 
# 
# In spite of Pandora, Spotify and Shazam impact on  `Music` ratings there are plenty of other relatively popular apps.
# We should take into consideration that people might not spend much time surfing in the `Music` apps, rather have them in the background while listening to the music.
# 
# Same pattern for `Weather` which has relatively high ratings. People do not tend to spend time in weather apps.
# 
# 
# `Games` dominate the App Store in terms of number of apps. If we explore `Games`, we'll see there are plenty of 
# apps with very high ratings as well as apps with average ratings. 
# Going in this direction requires further analysis of the most popular game genres.
# 
# 
# `Food & Drink`, `Finance`, `Travel`- require additional activities, for instance: open a restaurant, get some experience in
# cooking, hiring finance professionals, etc.
# 
# `Shopping` seems to have potential. This genre still has a quite high rating even if we remove some highest-rated apps.
# 
# 

# ## 3. Most Popular Apps by downloads in the Google Play
# 
# There is data about numbers of downloads for the `Google Play Market`.
# Since this data is open-ended we do not know exact number of installs.
# For example category `10.000+` includes all values greater than 10.000.
# We are going to use those values and consider `10.000+` as `10.000` or `200.000+` as `200.000`.
# 

# In[24]:


# prime_category maps   category : number of free English apps in Google Play
prime_category = freq_table_genre(free_google, 1)

# Add up the installs (column 5) of every category in one pass over the
# dataset instead of rescanning all apps once per category. Rows are visited
# in the same order as before, so each per-category sum is unchanged.
install_totals = dict.fromkeys(prime_category, 0)
for item in free_google:
    # Values look like '10,000+': drop '+' and ',' before converting.
    installs = item[5].replace('+', '').replace(',', '')
    install_totals[item[1]] += float(installs)

# inter_list maps   category : average number of downloads per app
inter_list = {}
for category, len_category in prime_category.items():
    inter_list[category] = install_totals[category] / len_category


def display_table_cat(dataset):
    """Print the entries of the dict *dataset* as ``key : value``, largest value first.

    Entries are ordered by (value, key) in descending order, so equal values
    are ordered by key, also descending. Nothing is returned.
    """
    ranked = sorted(
        ((value, key) for key, value in dataset.items()),
        reverse=True,
    )
    for value, key in ranked:
        print(key, ':', value)
        
# Print the average installs per category, highest average first
display_table_cat(dataset=inter_list)


# In[28]:


# Helper cell for exploring one category: prints each app of the category
# together with its installs value (column 5).
business_apps = (row for row in free_google if row[1] == 'BUSINESS')
for app in business_apps:
    print(app[0], ' : ', app[5])


# It is interesting to investigate what happens to the downloads indicator if we remove some high-rated apps from account.

# In[27]:


# Category under inspection. Defined once, before the loop, so it exists for
# the final print even when the dataset has no rows.
index = 'COMMUNICATION'

# Installs of every app in the category that stays below the cut-off.
reduced = []

for app in free_google:
    if app[1] != index:
        continue

    # Values look like '10,000+': drop ',' and '+' and convert once.
    installs = float(app[5].replace(',', '').replace('+', ''))

    if installs < 100000000:  # Leave out all apps at or above this number of downloads
        reduced.append(installs)

print(index, 'updated rating: ', sum(reduced) / len(reduced))


# **Outputs**
# 
# Distribution of applications by categories in the `Google Play Market` is different compared to `App Store`.
# 
# For instance:
# * 1)`COMMUNICATION` and `SOCIAL` are two different categories in `Google Play Market`.
# In `App Store` we have `Social Networking` genre.
# 
# * 2)`Photo & Video` genre in `App Store` and in `Google Play Market` there are `PHOTOGRAPHY` and `VIDEO_PLAYERS` categories.
# 
# It should be taken into account that there are many categories dominated by few giants.
# For example:
# * `SOCIAL`- Facebook, Google+.
# * `VIDEO_PLAYERS`- YouTube, Motorola Gallery.
# * `TOOLS` - Google,Account Manager.
# 
# We are looking for categories apps that:
# * 1) Users spend plenty of time in.
# * 2) Have relatively high number of downloads after removing ratings of some top-rated apps.
# 
# There are list of such categories:
# * `COMMUNICATION`
# * `VIDEO_PLAYERS`
# * `SOCIAL`
# * `GAME`
# * `SHOPPING`
# 
# 
# 
# 
# 
# 

# # Conclusions and results
# 
# In this project, we analyzed apps in the Google Play Market and App Store to find app profiles that are attractive for users in both markets.
# 
# The distribution of applications by genre and category may create some obstacles to analysis in principle. We assume that the distribution is correct from the beginning.
# 
# The following categories have the potential to create applications:
# * `Social Networking`
# * `Games`
# * `Shopping`
# 
# 
# It is clear that the market is full of competitive apps in principle.
# Creating an app in these categories may mean competition and attracting specialized developers.