Title: Profitable App Profiles for the App Store and Google Play Markets

Introduction: To analyse the number of users who use our apps, and what apps attract the most users on Google Play and App Store.

In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))
In [2]:
from csv import reader

opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
#opened_file.close()
apps_data = list(read_file)
appsapple_header = apps_data[0]
appsapple_data = apps_data[1:]

opened_file2 = open('googleplaystore.csv')
read_file2 = reader(opened_file2)
#opened_file2.close()
appsg_data = list(read_file2)
appsgoogle_header = appsg_data[0]
appsgoogle_data = appsg_data[1:]
In [3]:
#print(appsgoogle_header)
#print(appsgoogle_data)
#print(appsgoogle_header)
#print('\n')
#explore_data(appsapple_data, 0, 2, True)
In [4]:
#print(appsgoogle_header)
#print('\n')
#explore_data(appsgoogle_data, 0, 2, True)
In [5]:
#print(appsgoogle_data[10472])
In [6]:
#print(len(appsgoogle_data))
#del appsgoogle_data[10472]  # don't run this more than once
#print(len(appsgoogle_data))

The Google play data set has duplicate entries

In [7]:
for app in appsgoogle_data:
    name = app[0]
    if name == 'Instagram':
        print(app)
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
In [8]:
duplicate_apps = []
unique_apps = []

for app in appsgoogle_data:
    name = app[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
Number of duplicate apps: 1181

Going to remove all duplicates, but keep the record that has the most number of reviews as that is probably the most recent record.

In [9]:
reviews_max = {}

for app in appsgoogle_data:
    name = app[0]
    n_reviews = float(app[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
        
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

#print(reviews_max)

ValueErrorTraceback (most recent call last)
<ipython-input-9-23c3ea3ce216> in <module>()
      3 for app in appsgoogle_data:
      4     name = app[0]
----> 5     n_reviews = float(app[3])
      6 
      7     if name in reviews_max and reviews_max[name] < n_reviews:

ValueError: could not convert string to float: '3.0M'