# Exploratory analysis of the Google Play and App Store datasets: load both CSV
# files, clean the Google data (bad row, duplicate apps), keep apps with
# English names that are free, then print frequency tables and per-genre /
# per-category averages.
from collections import Counter
from csv import reader

# Column positions used throughout the script.
GOOGLE_NAME = 0
GOOGLE_CATEGORY = 1
GOOGLE_REVIEWS = 3
GOOGLE_INSTALLS = 5
GOOGLE_TYPE = 6   # compared against 'Free' below -- presumably the pricing type
GOOGLE_PRICE = 7  # compared against '0' below -- presumably the price
GOOGLE_GENRES = 9
APPLE_NAME = 1
APPLE_PRICE = 4   # compared against '0.0' below -- presumably the price
APPLE_RATING_COUNT = 5
APPLE_PRIME_GENRE = 11

# Values of the Google 'Installs' column that the script inspects by hand.
TOP_INSTALL_TIERS = ('1,000,000,000+', '500,000,000+', '100,000,000+')
MID_INSTALL_TIERS = ('1,000,000+', '5,000,000+', '10,000,000+', '50,000,000+')


def explore_data(dataset, start, end, rows_and_columns=False):
    '''Print the rows dataset[start:end], each followed by a blank line.

    dataset is expected to be a list of lists. If rows_and_columns is True,
    also print the number of rows in dataset as passed (a header row is only
    counted if the caller left it in) and the number of columns, taken from
    the first row (0 for an empty dataset).
    '''
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # Guard against an empty dataset instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)


def is_english(s, max_non_ascii=3):
    '''Return True if s has at most max_non_ascii characters outside ASCII.

    A character counts as non-ASCII when ord() of it is greater than 127.
    With the default of 3 a name may carry a few symbols or emoji and still
    pass; max_non_ascii=0 gives the strict "ASCII only" check.
    '''
    non_ascii = 0
    for character in s:
        if ord(character) > 127:
            non_ascii += 1
            if non_ascii > max_non_ascii:
                return False
    return True


def freq_table(dataset, index):
    '''Return the relative frequency table of one column as a dictionary.

    dataset is expected to be a list of lists and index an integer column
    position. Keys are the distinct values of that column (in order of first
    appearance); values are percentages of the total number of rows.
    '''
    total = len(dataset)
    counts = Counter(row[index] for row in dataset)
    return {key: (count / total) * 100 for key, count in counts.items()}


def print_ranked(ranked):
    '''Print "key : value" for each (value, key) tuple in ranked.'''
    for value, key in ranked:
        print(key, ':', value)


def display_table(dataset, index):
    '''Print the frequency table of a column, highest percentage first.

    dataset is expected to be a list of lists and index an integer. The table
    comes from freq_table(); it is turned into (value, key) tuples and sorted
    in descending order with sorted(). Does not return anything.
    '''
    table = freq_table(dataset, index)
    table_display = [(table[key], key) for key in table]
    print_ranked(sorted(table_display, reverse=True))


def parse_installs(text):
    '''Convert an install count such as '1,000,000+' to a float.'''
    return float(text.replace('+', '').replace(',', ''))


def group_values(dataset, group_index, value_index, convert=float):
    '''Map each distinct value of column group_index to the list of converted
    values of column value_index, keeping dataset order.'''
    groups = {}
    for row in dataset:
        groups.setdefault(row[group_index], []).append(convert(row[value_index]))
    return groups


def mean_or_zero(values):
    '''Return the arithmetic mean of values, or 0.0 when values is empty.'''
    return sum(values) / len(values) if values else 0.0


def print_apps(dataset, group_index, group, name_index, value_index,
               allowed_values=None):
    '''Print "name : value" for every row whose group column equals group.

    When allowed_values is given, only rows whose value column is one of
    those values are printed.
    '''
    for row in dataset:
        if row[group_index] != group:
            continue
        if allowed_values is not None and row[value_index] not in allowed_values:
            continue
        print(row[name_index], ':', row[value_index])


# Open the csv file for each dataset, read it with the reader function from
# the csv module, and store each as a list of lists. The with-blocks close the
# files; newline='' is what the csv module asks for, and the encoding is
# pinned so decoding does not depend on the platform default.
with open('/content/drive/My Drive/Datasets/googleplaystore.csv',
          encoding='utf8', newline='') as opened_file_google:
    google_data = list(reader(opened_file_google))
google_data_header = google_data[0]
google_data = google_data[1:]

with open('/content/drive/My Drive/Datasets/AppleStore.csv',
          encoding='utf8', newline='') as opened_file_apple:
    apple_data = list(reader(opened_file_apple))
apple_data_header = apple_data[0]
apple_data = apple_data[1:]

print(google_data_header)
print('\n')
explore_data(google_data, 0, 3, True)
print('\n' * 3)
print(apple_data_header)
print('\n')
explore_data(apple_data, 0, 3, True)

# Delete a row with an error, as identified in the discussion forum referenced
# by the dataset documentation.
print(len(google_data))
print(google_data[10472])
del google_data[10472]
print(len(google_data))

# Show every row (and its position) for one app that appears more than once.
name = 'Coloring book moana'
for position, app in enumerate(google_data):
    if app[GOOGLE_NAME] == name:
        print(app)
        print(position)

# Split app names into first occurrences and repeats. The set makes the
# membership test O(1); the lists keep the original order for reporting.
duplicate_entries = []
unique_entries = []
seen_names = set()
for app in google_data:
    name = app[GOOGLE_NAME]
    if name in seen_names:
        duplicate_entries.append(name)
    else:
        seen_names.add(name)
        unique_entries.append(name)

print('Number of duplicate apps: ', len(duplicate_entries))
print('Examples of duplicate apps: ', duplicate_entries[:10])

# Build reviews_max: for every app name, the highest number of reviews found
# among its rows in google_data.
reviews_max = {}
for app in google_data:
    name = app[GOOGLE_NAME]
    n_reviews = float(app[GOOGLE_REVIEWS])
    if name not in reviews_max or n_reviews > reviews_max[name]:
        reviews_max[name] = n_reviews

# Print lengths of the container variables to check the loop worked correctly.
print('Length of google_data minus length of duplicate entries: ',
      len(google_data) - len(duplicate_entries))
print('Length of unique_entries: ', len(unique_entries))
print('Length of reviews_max: ', len(reviews_max))

# Keep one row per app: the row whose number of reviews equals the maximum
# for that name. Some duplicate rows share the same number of reviews, so the
# name is also checked against those already added to avoid keeping both.
google_cleaned = []
already_added = []
added_names = set()  # same content as already_added, for O(1) lookups
for app in google_data:
    name = app[GOOGLE_NAME]
    n_reviews = float(app[GOOGLE_REVIEWS])
    if n_reviews == reviews_max[name] and name not in added_names:
        google_cleaned.append(app)
        already_added.append(name)
        added_names.add(name)

# Explore the cleaned dataset.
explore_data(google_cleaned, 0, 3, True)

# Test the strict check (no non-ASCII characters allowed) on some strings.
print(is_english('Instagram', max_non_ascii=0))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播', max_non_ascii=0))
print(is_english('Docs To Go™ Free Office Suite', max_non_ascii=0))
print(is_english('Instachat 😜', max_non_ascii=0))

# Test the relaxed check (up to 3 non-ASCII characters) on the same strings.
print(is_english('Instagram'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))

english_google_cleaned = [
    app for app in google_cleaned if is_english(app[GOOGLE_NAME])
]
# The name is in the second column (index 1) of the apple_data dataset.
english_apple_data = [
    app for app in apple_data if is_english(app[APPLE_NAME])
]

explore_data(english_google_cleaned, 0, 3, True)
print('\n')
explore_data(english_apple_data, 0, 3, True)

free_english_google_cleaned = [
    app for app in english_google_cleaned
    if app[GOOGLE_TYPE] == 'Free' or app[GOOGLE_PRICE] == '0'
]
free_english_apple_data = [
    app for app in english_apple_data if app[APPLE_PRICE] == '0.0'
]

explore_data(free_english_google_cleaned, 0, 3, True)
print('\n')
explore_data(free_english_apple_data, 0, 3, True)

print(google_data_header)
print('\n')
print(apple_data_header)

print(freq_table(free_english_google_cleaned, GOOGLE_CATEGORY))

# Use display_table to show the frequency table of the prime_genre column.
display_table(free_english_apple_data, APPLE_PRIME_GENRE)

print('Categories frequency table: ')
print('\n')
display_table(free_english_google_cleaned, GOOGLE_CATEGORY)
print('\n')
print('Genres frequency table: ')
print('\n')
display_table(free_english_google_cleaned, GOOGLE_GENRES)

# Mean number of ratings per App Store genre, printed highest first.
prime_genre_table = freq_table(free_english_apple_data, APPLE_PRIME_GENRE)
ratings_by_genre = group_values(
    free_english_apple_data, APPLE_PRIME_GENRE, APPLE_RATING_COUNT
)
genre_dict = {
    genre: mean_or_zero(ratings_by_genre[genre]) for genre in prime_genre_table
}
genre_list = [(genre_dict[key], key) for key in genre_dict]
genre_list_sorted = sorted(genre_list, reverse=True)
print_ranked(genre_list_sorted)

print_apps(free_english_apple_data, APPLE_PRIME_GENRE, 'Navigation',
           APPLE_NAME, APPLE_RATING_COUNT)

print_apps(free_english_apple_data, APPLE_PRIME_GENRE, 'Reference',
           APPLE_NAME, APPLE_RATING_COUNT)
print()

print_apps(free_english_apple_data, APPLE_PRIME_GENRE, 'Book',
           APPLE_NAME, APPLE_RATING_COUNT)

print_apps(free_english_apple_data, APPLE_PRIME_GENRE, 'Social Networking',
           APPLE_NAME, APPLE_RATING_COUNT)

# Same per-genre mean, but leaving out apps that hold 20% or more of the
# genre's total number of ratings. A genre in which every app is left out
# gets a mean of 0.0 rather than a division by zero.
genre_dict = {}
for genre in prime_genre_table:
    ratings = ratings_by_genre[genre]
    outlier_cutoff = 0.2 * sum(ratings)
    genre_dict[genre] = mean_or_zero(
        [rating for rating in ratings if rating < outlier_cutoff]
    )
genre_list = [(genre_dict[key], key) for key in genre_dict]
genre_list_sorted = sorted(genre_list, reverse=True)
print_ranked(genre_list_sorted)

display_table(free_english_google_cleaned, GOOGLE_INSTALLS)  # 'Installs' column is column indexed 5

# Mean number of installs per Google Play category, printed highest first.
categories_table = freq_table(free_english_google_cleaned, GOOGLE_CATEGORY)
installs_by_category = group_values(
    free_english_google_cleaned, GOOGLE_CATEGORY, GOOGLE_INSTALLS,
    convert=parse_installs,
)
category_dict = {
    category: mean_or_zero(installs_by_category[category])
    for category in categories_table
}
category_list = [(category_dict[key], key) for key in category_dict]
category_list_sorted = sorted(category_list, reverse=True)
print_ranked(category_list_sorted)

print_apps(free_english_google_cleaned, GOOGLE_CATEGORY, 'COMMUNICATION',
           GOOGLE_NAME, GOOGLE_INSTALLS, TOP_INSTALL_TIERS)

for category in ['VIDEO_PLAYERS', 'SOCIAL']:
    print(category)
    print_apps(free_english_google_cleaned, GOOGLE_CATEGORY, category,
               GOOGLE_NAME, GOOGLE_INSTALLS, TOP_INSTALL_TIERS)
    print()

# Same per-category mean, but leaving out apps with 100000000 or more
# installs. A category in which every app is left out gets a mean of 0.0
# rather than a division by zero.
category_dict = {}
for category in categories_table:
    category_dict[category] = mean_or_zero(
        [n for n in installs_by_category[category] if n < 100000000]
    )
category_list = [(category_dict[key], key) for key in category_dict]
category_list_sorted = sorted(category_list, reverse=True)
print_ranked(category_list_sorted)

print_apps(free_english_google_cleaned, GOOGLE_CATEGORY, 'PHOTOGRAPHY',
           GOOGLE_NAME, GOOGLE_INSTALLS, TOP_INSTALL_TIERS)
print('\n' * 2)
print_apps(free_english_google_cleaned, GOOGLE_CATEGORY, 'PHOTOGRAPHY',
           GOOGLE_NAME, GOOGLE_INSTALLS)

print_apps(free_english_google_cleaned, GOOGLE_CATEGORY, 'PHOTOGRAPHY',
           GOOGLE_NAME, GOOGLE_INSTALLS, MID_INSTALL_TIERS)

print_apps(free_english_google_cleaned, GOOGLE_CATEGORY, 'COMMUNICATION',
           GOOGLE_NAME, GOOGLE_INSTALLS, TOP_INSTALL_TIERS)
print('\n')
print_apps(free_english_google_cleaned, GOOGLE_CATEGORY, 'SOCIAL',
           GOOGLE_NAME, GOOGLE_INSTALLS, MID_INSTALL_TIERS)