Mobile App Data

This project is about identifying profitable profiles for the App Store and Google Play Markets

In [1]:
### Google Play data set ### 
opened_file = open('googleplaystore.csv')
from csv import reader
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]

### App Store data set ###
opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
ios = list(read_file)
ios_header =ios[0]
ios = ios[2:]

def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

print(android_header)
print('\n')
explore_data(android, 0, 3, True)
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13
In [2]:
print(android[10472])  # incorrect row
print('\n')
print(android_header)  # header
print('\n')
print(android[0])      # correct row

del(android[10472])
print(android[10472])  # deleted row
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']
In [3]:
### Google Play data set has duplicates, so let's get rid of em! ###
for app in android:
    name = app[0]
    if name == 'Instagram':
        print(app)
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
In [4]:
### I'm not going to remove duplicates randomly. 
### Rather, I'm going to delete all except the first one found ###
seen_duplicates = []
unique_apps = []

for app in android:
    name == app[0]
    if name in unique_apps:
        seen_duplicates.append(name)
    else:
        unique_apps.append(name)
        
print('Number of duplicates:', len(seen_duplicates))
print('\n')
print('Number of unique', len(unique_apps))
print('Examples of duplicate apps:', seen_duplicates[:15] )
Number of duplicates: 10839


Number of unique 1
Examples of duplicate apps: ['iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology']
In [5]:
### Removing duplicate entries and store
### separate lists for new cleaned data set and 
### just app names for detecting duplicates ###

reviews_max = {}
for app in android[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if (name in reviews_max) and (reviews_max[name] < n_reviews):
        reviews_max[name] = n_reviews
    if (name not in reviews_max):
        reviews_max[name] = n_reviews
print('Expected length:', len(android) - 1181)
print('Actual length:', len(reviews_max))
        
android_clean = []
already_added = []

for app in android[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(app)
        already_added.append(name)
Expected length: 9659
Actual length: 9658
In [6]:
### exploring android_clean data set to ensure it displays as expected ###
explore_data(android_clean, 0, 3, True)
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 9658
Number of columns: 13
In [7]:
## adding function that takes a string and determines if there is any 
## character that doesn't belong to the set of common English characters 
## if there are more than 3 chars that fall outside the ASCII range (0-127)
## it is determined to be non-english
def english_only(language):
    count = 0
    for char in language:
        if(ord(char) > 127):
            count += 1;
    
    if count > 3:
        return False;
    else:
        return True;

print(english_only('Instagram'))
print(english_only('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_only('Docs To Go™ Free Office Suite'))
print(english_only('Instachat 😜'))
True
False
True
True
In [8]:
## separate Android and iOS apps and find out how many of each we have ###
android_english = []
ios_english = []

for app in android_clean:
    name = app[0]
    if english_only(name):
        android_english.append(app)

for app in ios:
    name = app[1];
    if english_only(name):
        ios_english.append(app)
        
explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 9613
Number of columns: 13


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 6182
Number of columns: 16
In [9]:
## isolating the free android and iOS apps ###
for app in android_clean:
    name = app[0]
    if(english_only):
        android_english.append(name)

for app in ios:
    name = app[1];
    if(english_only):
        ios_english.append(name)
In [10]:
### We want to find and app profile that fits both the App Store and Google Play because by analyzing apps that are successful on both, we have a way to measure the threshold of entry 
### in order to be/remain competitive on those platforms ###

explore_data(android_english, 0, 3, True)
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


Number of rows: 19271
Number of columns: 13
In [11]:
### function to generate frequency tables to show percentages ###

def freq_table(dataset, index):
    table = {}
    total = 0
    
    for row in dataset:
        total += 1
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
    
    table_percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        table_percentages[key] = percentage
    
    return table_percentages

### function to display the percentages in desc ###
def display_table(dataset, index, label):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    print("Column: " + "***" + label.upper() + "***")
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
    
In [12]:
display_table(ios, -5, "ios")
Column: ***IOS***
Games : 53.66870483602001
Entertainment : 7.434685936631462
Education : 6.2951639799888826
Photo & Video : 4.849916620344636
Utilities : 3.446359088382435
Health & Fitness : 2.501389660922735
Productivity : 2.473596442468038
Social Networking : 2.3068371317398557
Lifestyle : 2.001111728738188
Music : 1.9177320733740968
Shopping : 1.6953863257365203
Sports : 1.584213451917732
Book : 1.556420233463035
Finance : 1.4452473596442468
Travel : 1.1256253474152307
News : 1.0422456920511394
Weather : 1.000555864369094
Reference : 0.8893829905503057
Food & Drink : 0.8754863813229572
Business : 0.792106725958866
Navigation : 0.6392440244580322
Medical : 0.3196220122290161
Catalogs : 0.13896609227348528
In [13]:
display_table(android_clean, 1, "Category")
Column: ***CATEGORY***
FAMILY : 19.403603230482503
GAME : 9.79498861047836
TOOLS : 8.583557672395942
BUSINESS : 4.348726444398427
MEDICAL : 4.089873679850901
PERSONALIZATION : 3.8931455787947815
PRODUCTIVITY : 3.8724373576309796
LIFESTYLE : 3.8206668047214745
FINANCE : 3.57216815075585
SPORTS : 3.36508593911783
COMMUNICATION : 3.26154483329882
HEALTH_AND_FITNESS : 2.9819838475874922
PHOTOGRAPHY : 2.909505073514185
NEWS_AND_MAGAZINES : 2.629944087802858
SOCIAL : 2.4746324290743424
BOOKS_AND_REFERENCE : 2.2986125491820255
TRAVEL_AND_LOCAL : 2.2675502174363222
SHOPPING : 2.091530337544005
DATING : 1.7601987989231724
VIDEO_PLAYERS : 1.6980741354317663
MAPS_AND_NAVIGATION : 1.356388486229033
FOOD_AND_DRINK : 1.1596603851729135
EDUCATION : 1.1078898322634085
ENTERTAINMENT : 0.9008076206253882
AUTO_AND_VEHICLES : 0.8800993994615862
LIBRARIES_AND_DEMO : 0.8697452888796852
WEATHER : 0.8179747359701802
HOUSE_AND_HOME : 0.755850072478774
EVENTS : 0.662663077241665
PARENTING : 0.6212466349140608
ART_AND_DESIGN : 0.6212466349140608
COMICS : 0.5798301925864567
BEAUTY : 0.5487678608407538
In [14]:
display_table(android_clean, -4, "Genres")
Column: ***GENRES***
Tools : 8.57320356181404
Entertainment : 5.808656036446469
Education : 5.280596396769518
Business : 4.348726444398427
Medical : 4.089873679850901
Personalization : 3.8931455787947815
Productivity : 3.8724373576309796
Lifestyle : 3.8103126941395735
Finance : 3.57216815075585
Sports : 3.427210602609236
Communication : 3.26154483329882
Action : 3.0958790639884035
Health & Fitness : 2.9819838475874922
Photography : 2.909505073514185
News & Magazines : 2.629944087802858
Social : 2.4746324290743424
Books & Reference : 2.2986125491820255
Travel & Local : 2.2571961068544213
Shopping : 2.091530337544005
Simulation : 1.9983433423068957
Arcade : 1.9051563470697868
Dating : 1.7601987989231724
Casual : 1.7084282460136675
Video Players & Editors : 1.6773659142679642
Maps & Navigation : 1.356388486229033
Puzzle : 1.2321391592462207
Food & Drink : 1.1596603851729135
Role Playing : 1.0871816110996066
Strategy : 0.9836405052805964
Racing : 0.9422240629529923
Auto & Vehicles : 0.8800993994615862
Libraries & Demo : 0.8697452888796852
Weather : 0.8179747359701802
House & Home : 0.755850072478774
Adventure : 0.755850072478774
Events : 0.662663077241665
Art & Design : 0.5798301925864567
Comics : 0.5694760820045558
Beauty : 0.5487678608407538
Card : 0.4866431973493477
Parenting : 0.4762890867674467
Board : 0.4348726444398426
Casino : 0.4038103126941396
Trivia : 0.3934562021122386
Educational;Education : 0.3934562021122386
Educational : 0.38310209153033753
Education;Education : 0.37274798094843653
Casual;Pretend Play : 0.25885276454752537
Word : 0.23814454338372334
Music : 0.1967281010561193
Puzzle;Brain Games : 0.17601987989231727
Education;Pretend Play : 0.17601987989231727
Racing;Action & Adventure : 0.16566576931041624
Entertainment;Music & Video : 0.1553116587285152
Board;Brain Games : 0.14495754814661418
Arcade;Action & Adventure : 0.14495754814661418
Educational;Pretend Play : 0.13460343756471319
Casual;Action & Adventure : 0.13460343756471319
Casual;Brain Games : 0.12424932698281217
Action;Action & Adventure : 0.12424932698281217
Simulation;Action & Adventure : 0.07247877407330709
Parenting;Education : 0.07247877407330709
Entertainment;Brain Games : 0.07247877407330709
Parenting;Music & Video : 0.062124663491406086
Educational;Brain Games : 0.062124663491406086
Education;Creativity : 0.062124663491406086
Casual;Creativity : 0.062124663491406086
Art & Design;Creativity : 0.062124663491406086
Educational;Creativity : 0.051770552909505066
Adventure;Action & Adventure : 0.051770552909505066
Sports;Action & Adventure : 0.04141644232760406
Role Playing;Pretend Play : 0.04141644232760406
Role Playing;Action & Adventure : 0.04141644232760406
Education;Brain Games : 0.04141644232760406
Education;Action & Adventure : 0.04141644232760406
Simulation;Pretend Play : 0.031062331745703043
Simulation;Education : 0.031062331745703043
Puzzle;Action & Adventure : 0.031062331745703043
Music;Music & Video : 0.031062331745703043
Entertainment;Creativity : 0.031062331745703043
Entertainment;Action & Adventure : 0.031062331745703043
Educational;Action & Adventure : 0.031062331745703043
Education;Music & Video : 0.031062331745703043
Casual;Education : 0.031062331745703043
Board;Action & Adventure : 0.031062331745703043
Video Players & Editors;Music & Video : 0.02070822116380203
Strategy;Action & Adventure : 0.02070822116380203
Puzzle;Creativity : 0.02070822116380203
Entertainment;Pretend Play : 0.02070822116380203
Card;Action & Adventure : 0.02070822116380203
Books & Reference;Education : 0.02070822116380203
Video Players & Editors;Creativity : 0.010354110581901015
Trivia;Education : 0.010354110581901015
Travel & Local;Action & Adventure : 0.010354110581901015
Tools;Education : 0.010354110581901015
Strategy;Education : 0.010354110581901015
Strategy;Creativity : 0.010354110581901015
Role Playing;Education : 0.010354110581901015
Role Playing;Brain Games : 0.010354110581901015
Racing;Pretend Play : 0.010354110581901015
Puzzle;Education : 0.010354110581901015
Parenting;Brain Games : 0.010354110581901015
Music & Audio;Music & Video : 0.010354110581901015
Lifestyle;Pretend Play : 0.010354110581901015
Lifestyle;Education : 0.010354110581901015
Health & Fitness;Education : 0.010354110581901015
Health & Fitness;Action & Adventure : 0.010354110581901015
Entertainment;Education : 0.010354110581901015
Communication;Creativity : 0.010354110581901015
Comics;Creativity : 0.010354110581901015
Casual;Music & Video : 0.010354110581901015
Books & Reference;Creativity : 0.010354110581901015
Board;Pretend Play : 0.010354110581901015
Art & Design;Pretend Play : 0.010354110581901015
Art & Design;Action & Adventure : 0.010354110581901015
Arcade;Pretend Play : 0.010354110581901015
Adventure;Education : 0.010354110581901015
Adventure;Brain Games : 0.010354110581901015
In [15]:
freq_table(ios, -5)
Out[15]:
{'Book': 1.556420233463035,
 'Business': 0.792106725958866,
 'Catalogs': 0.13896609227348528,
 'Education': 6.2951639799888826,
 'Entertainment': 7.434685936631462,
 'Finance': 1.4452473596442468,
 'Food & Drink': 0.8754863813229572,
 'Games': 53.66870483602001,
 'Health & Fitness': 2.501389660922735,
 'Lifestyle': 2.001111728738188,
 'Medical': 0.3196220122290161,
 'Music': 1.9177320733740968,
 'Navigation': 0.6392440244580322,
 'News': 1.0422456920511394,
 'Photo & Video': 4.849916620344636,
 'Productivity': 2.473596442468038,
 'Reference': 0.8893829905503057,
 'Shopping': 1.6953863257365203,
 'Social Networking': 2.3068371317398557,
 'Sports': 1.584213451917732,
 'Travel': 1.1256253474152307,
 'Utilities': 3.446359088382435,
 'Weather': 1.000555864369094}
In [27]:
### calculate most popular apps by genre  ###

genres_ios = freq_table(ios, -5)

for genre in genres_ios:
    total = 0
    len_genre = 0
    for genre_app in ios:
        genre_app = app[-5]
        if genre_app == genre:
            user_ratings = float(app[5])
            total += user_ratings
            len_genre += 1
    avg_user_ratings = total / len_genre
print(genre)
print(avg_user_ratings)
ZeroDivisionErrorTraceback (most recent call last)
<ipython-input-27-9da2f3792fc3> in <module>()
     12             total += user_ratings
     13             len_genre += 1
---> 14     avg_user_ratings = total / len_genre
     15 print(genre)
     16 print(avg_user_ratings)

ZeroDivisionError: division by zero