Guided Project NO.1

Profitable App Profiles for the App Store and Google Play Markets

A project to collect, analyze, and present a profitable profile for an AppStore and Google play app.

In [27]:
from csv import reader

opened_file = open('AppleStore.csv') #iOS 
read_file = reader(opened_file)
ios = list(read_file)
ios_header = ios[0]
ios_body = ios[1:]

opened_file = open('googleplaystore.csv') #Android
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android_body = android[1:]

def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n') #adds a new emmpty line after each row
        
        if rows_and_columns:
            print('NO of rows: ', len(dataset))
            print('NO of columns: ', len(dataset[0]))
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


NO of rows:  10842
NO of columns:  13
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


NO of rows:  10842
NO of columns:  13
10842
10841

Data Cleaning

Removing duplicate data entries, and keeping the entry with highest reviews count for each app

In [30]:
duplicate_apps =[]
unique_apps = []
for app in android:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
         unique_apps.append(name)
Number of dublicate apps:  1181
Dublicate apps sample:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']
In [47]:
reviews_max ={}
for row in android[1:]:
    n_reviews = float(row[3])
    name = row[0]
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
    name = row[0]
    n_reviews = float(row[3])
9659
10054
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


NO of rows:  10054
NO of columns:  13
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


NO of rows:  10054
NO of columns:  13
['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


NO of rows:  10054
NO of columns:  13
In [55]:
android_clean = []
already_added =[]
for app in android[1:]:
    name = app[0]
    n_reviews = float(app[3])
    
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name)
        
explore_data(android_clean, 0, 2, True)
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


NO of rows:  9659
NO of columns:  13
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


NO of rows:  9659
NO of columns:  13

Removing non-English Apps

In [64]:
def english_char(string):
    non_ASCII = 0
    for char in string:
        if ord(char) > 127:
            non_ASCII +=1
            
    if non_ASCII > 3:
        return False
    else:
        return True
english_char('爱奇艺PPS -《欢乐颂2》电视剧热播')
Out[64]:
False
In [68]:
android_english =[]
ios_english = []

for app in android_clean:
    name = app[0]
    if english_char(name):
        android_english.append(app)

for app in ios:
    name = app[1]
    if english_char(name):
        ios_english.append(app)
        
print(len(android_english))
print('/n')

print(len(ios_english))
9614
/n
6184

Isolating free apps

In [71]:
free_android =[]
free_ios = []
for app in android_english:
    price = app[7]
    if price == '0':
        free_android.append(app)
        
for app in ios_english:
    price = app[4]
    if price == '0.0':
        free_ios.append(app)
        
print(len(free_android), len(free_ios))
8864 3222

Apps per genre

In [78]:
def freq_table(dataset, index):
    frequency_table = {}
    total = 0
    for app in dataset:
        total += 1
        freq = app[index]
        if freq in frequency_table:
            frequency_table[freq] +=1
        else:
            frequency_table[freq] = 1
    table_percentages = {}
    for key in frequency_table:
        percentage = (frequency_table[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages
   
    
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
        
    table_sorted = sorted(table_display, reverse =True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 0.6430505415162455
COMICS : 0.6204873646209386
BEAUTY : 0.5979241877256317
In [86]:
#explore_data(android, 0, 3)
display_table(free_android, 1)
print('/n')
display_table(free_android, 9)
'''MOST_COMMON_IOS GENRES: Games, Entertainment
WE need to look at the number of downloads to build an accurate profile'''

'''MOST COMMON ANDROD GENRES: Family, games'''
FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 0.6430505415162455
COMICS : 0.6204873646209386
BEAUTY : 0.5979241877256317
/n
Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075812
Strategy : 0.9138086642599278
House & Home : 0.8235559566787004
Weather : 0.8009927797833934
Events : 0.7107400722021661
Adventure : 0.6768953068592057
Comics : 0.6092057761732852
Beauty : 0.5979241877256317
Art & Design : 0.5979241877256317
Parenting : 0.4963898916967509
Card : 0.45126353790613716
Casino : 0.42870036101083037
Trivia : 0.41741877256317694
Educational;Education : 0.39485559566787
Board : 0.3835740072202166
Educational : 0.3722924187725632
Education;Education : 0.33844765342960287
Word : 0.2594765342960289
Casual;Pretend Play : 0.236913357400722
Music : 0.2030685920577617
Racing;Action & Adventure : 0.16922382671480143
Puzzle;Brain Games : 0.16922382671480143
Entertainment;Music & Video : 0.16922382671480143
Casual;Brain Games : 0.13537906137184114
Casual;Action & Adventure : 0.13537906137184114
Arcade;Action & Adventure : 0.12409747292418773
Action;Action & Adventure : 0.10153429602888085
Educational;Pretend Play : 0.09025270758122744
Simulation;Action & Adventure : 0.078971119133574
Parenting;Education : 0.078971119133574
Entertainment;Brain Games : 0.078971119133574
Board;Brain Games : 0.078971119133574
Parenting;Music & Video : 0.06768953068592057
Educational;Brain Games : 0.06768953068592057
Casual;Creativity : 0.06768953068592057
Art & Design;Creativity : 0.06768953068592057
Education;Pretend Play : 0.056407942238267145
Role Playing;Pretend Play : 0.04512635379061372
Education;Creativity : 0.04512635379061372
Role Playing;Action & Adventure : 0.033844765342960284
Puzzle;Action & Adventure : 0.033844765342960284
Entertainment;Creativity : 0.033844765342960284
Entertainment;Action & Adventure : 0.033844765342960284
Educational;Creativity : 0.033844765342960284
Educational;Action & Adventure : 0.033844765342960284
Education;Music & Video : 0.033844765342960284
Education;Brain Games : 0.033844765342960284
Education;Action & Adventure : 0.033844765342960284
Adventure;Action & Adventure : 0.033844765342960284
Video Players & Editors;Music & Video : 0.02256317689530686
Sports;Action & Adventure : 0.02256317689530686
Simulation;Pretend Play : 0.02256317689530686
Puzzle;Creativity : 0.02256317689530686
Music;Music & Video : 0.02256317689530686
Entertainment;Pretend Play : 0.02256317689530686
Casual;Education : 0.02256317689530686
Board;Action & Adventure : 0.02256317689530686
Video Players & Editors;Creativity : 0.01128158844765343
Trivia;Education : 0.01128158844765343
Travel & Local;Action & Adventure : 0.01128158844765343
Tools;Education : 0.01128158844765343
Strategy;Education : 0.01128158844765343
Strategy;Creativity : 0.01128158844765343
Strategy;Action & Adventure : 0.01128158844765343
Simulation;Education : 0.01128158844765343
Role Playing;Brain Games : 0.01128158844765343
Racing;Pretend Play : 0.01128158844765343
Puzzle;Education : 0.01128158844765343
Parenting;Brain Games : 0.01128158844765343
Music & Audio;Music & Video : 0.01128158844765343
Lifestyle;Pretend Play : 0.01128158844765343
Lifestyle;Education : 0.01128158844765343
Health & Fitness;Education : 0.01128158844765343
Health & Fitness;Action & Adventure : 0.01128158844765343
Entertainment;Education : 0.01128158844765343
Communication;Creativity : 0.01128158844765343
Comics;Creativity : 0.01128158844765343
Casual;Music & Video : 0.01128158844765343
Card;Action & Adventure : 0.01128158844765343
Books & Reference;Education : 0.01128158844765343
Art & Design;Pretend Play : 0.01128158844765343
Art & Design;Action & Adventure : 0.01128158844765343
Arcade;Pretend Play : 0.01128158844765343
Adventure;Education : 0.01128158844765343
Out[86]:
'MOST_COMMON_IOS GENRES: Games, Entertainment\nWE need to look at the number of downloads to build an accurate profile'
In [93]:
unique_ios_genres = freq_table(free_ios, -5)

for genre in unique_ios_genres:
    total = 0
    len_genre = 0
    for app in free_ios:
        genre_app = app[-5]
        if genre == genre_app:
            user_rating_cont = float(app[5])
            total += user_rating_cont
            len_genre +=1
    average_user_rating = total/len_genre

    print(average_user_rating, genre)
            
74942.11111111111 Reference
21028.410714285714 Productivity
31467.944444444445 Finance
71548.34905660378 Social Networking
23008.898550724636 Sports
21248.023255813954 News
28243.8 Travel
26919.690476190477 Shopping
16485.764705882353 Lifestyle
33333.92307692308 Food & Drink
7003.983050847458 Education
23298.015384615384 Health & Fitness
39758.5 Book
7491.117647058823 Business
52279.892857142855 Weather
14029.830708661417 Entertainment
57326.530303030304 Music
86090.33333333333 Navigation
22788.6696905016 Games
28441.54375 Photo & Video
4004.0 Catalogs
612.0 Medical
18684.456790123455 Utilities
In [94]:
display_table(free_android, 5)
1,000,000+ : 15.726534296028879
100,000+ : 11.552346570397113
10,000,000+ : 10.548285198555957
10,000+ : 10.198555956678701
1,000+ : 8.393501805054152
100+ : 6.915613718411552
5,000,000+ : 6.825361010830325
500,000+ : 5.561823104693141
50,000+ : 4.7721119133574
5,000+ : 4.512635379061372
10+ : 3.5424187725631766
500+ : 3.2490974729241873
50,000,000+ : 2.3014440433213
100,000,000+ : 2.1322202166064983
50+ : 1.917870036101083
5+ : 0.78971119133574
1+ : 0.5076714801444043
500,000,000+ : 0.2707581227436823
1,000,000,000+ : 0.22563176895306858
0+ : 0.04512635379061372
0 : 0.01128158844765343
In [100]:
unique_android_genres = freq_table(free_android, 1)
for category in unique_android_genres:
    total = 0
    len_category = 0
    for app in free_android:
        category_app = app[1]
        if category_app == category:
            n_installs = app[5]
            n_installs = n_installs.replace(',', '')
            n_installs = n_installs.replace('+', '')
            total += float(n_installs)
            len_category +=1
    average_installs = total / len_category
    print(category, ':', average_installs)
NEWS_AND_MAGAZINES : 9549178.467741935
PRODUCTIVITY : 16787331.344927534
BEAUTY : 513151.88679245283
AUTO_AND_VEHICLES : 647317.8170731707
VIDEO_PLAYERS : 24727872.452830188
GAME : 15588015.603248259
HOUSE_AND_HOME : 1331540.5616438356
EDUCATION : 1833495.145631068
FOOD_AND_DRINK : 1924897.7363636363
BUSINESS : 1712290.1474201474
PERSONALIZATION : 5201482.6122448975
FINANCE : 1387692.475609756
LIFESTYLE : 1437816.2687861272
FAMILY : 3695641.8198090694
DATING : 854028.8303030303
ART_AND_DESIGN : 1986335.0877192982
ENTERTAINMENT : 11640705.88235294
WEATHER : 5074486.197183099
LIBRARIES_AND_DEMO : 638503.734939759
COMMUNICATION : 38456119.167247385
BOOKS_AND_REFERENCE : 8767811.894736841
SHOPPING : 7036877.311557789
COMICS : 817657.2727272727
MEDICAL : 120550.61980830671
PHOTOGRAPHY : 17840110.40229885
EVENTS : 253542.22222222222
MAPS_AND_NAVIGATION : 4056941.7741935486
SOCIAL : 23253652.127118643
SPORTS : 3638640.1428571427
TOOLS : 10801391.298666667
TRAVEL_AND_LOCAL : 13984077.710144928
PARENTING : 542603.6206896552
HEALTH_AND_FITNESS : 4188821.9853479853