#!/usr/bin/env python
# coding: utf-8

# Title: Profitable App Profiles for the App Store and Google Play Markets
#
# Introduction: To analyse the number of users who use our apps, and what
# apps attract the most users on Google Play and App Store.

from csv import reader


# In[1]:


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the data set size.

    Args:
        dataset: Sequence of rows; each row must support ``len()``.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns (measured on the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure, so report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)


# In[2]:

# The context managers close each file as soon as its rows have been read.
# newline='' is the mode the csv module documents for reader input, and the
# encoding is pinned so parsing does not depend on the platform default.
# NOTE(review): assumes both CSV files are UTF-8 encoded - confirm.
with open('AppleStore.csv', encoding='utf-8', newline='') as opened_file:
    read_file = reader(opened_file)
    apps_data = list(read_file)
appsapple_header = apps_data[0]
appsapple_data = apps_data[1:]

with open('googleplaystore.csv', encoding='utf-8', newline='') as opened_file2:
    read_file2 = reader(opened_file2)
    appsg_data = list(read_file2)
appsgoogle_header = appsg_data[0]
appsgoogle_data = appsg_data[1:]


# In[3]:

#print(appsgoogle_header)
#print(appsgoogle_data)
#print(appsgoogle_header)
#print('\n')
#explore_data(appsapple_data, 0, 2, True)


# In[4]:

#print(appsgoogle_header)
#print('\n')
#explore_data(appsgoogle_data, 0, 2, True)


# In[5]:

#print(appsgoogle_data[10472])


# In[6]:

#print(len(appsgoogle_data))
#del appsgoogle_data[10472] # don't run this more than once
#print(len(appsgoogle_data))


# The Google Play data set has duplicate entries.

# In[7]:

# Show every row whose first column (the app name) is 'Instagram'.
for app in appsgoogle_data:
    name = app[0]
    if name == 'Instagram':
        print(app)


# In[8]:

duplicate_apps = []
unique_apps = []
# Companion set for O(1) membership tests; unique_apps stays a list so the
# first-seen order of the names is preserved.
_seen_names = set()

for app in appsgoogle_data:
    name = app[0]
    if name in _seen_names:
        duplicate_apps.append(name)
    else:
        _seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))


# The duplicates will be removed, keeping for each app the record with the
# highest number of reviews, as that is probably the most recent record.
# In[9]:

# Build a lookup from app name (column 0) to the largest review count
# (column 3, parsed as a float) found among all rows carrying that name.
# NOTE(review): float() raises ValueError if column 3 of any row is not
# numeric - confirm malformed rows have been removed before this runs.
reviews_max = {}

for app in appsgoogle_data:
    name = app[0]
    n_reviews = float(app[3])
    # Record the count for a name seen for the first time, or replace the
    # stored count when this row has strictly more reviews.
    if name not in reviews_max or n_reviews > reviews_max[name]:
        reviews_max[name] = n_reviews

#print(reviews_max)