#!/usr/bin/env python
# coding: utf-8

# # Mobile App Data
# This project is about identifying profitable app profiles for the
# App Store and Google Play markets.

from collections import Counter, defaultdict
from csv import reader

# Code points above this value fall outside the ASCII range (0-127).
MAX_ASCII = 127
# A name may contain up to this many non-ASCII characters (emoji, "™", ...)
# and still be treated as English.
MAX_NON_ASCII_CHARS = 3


def _load_csv(path):
    """Read the CSV file at *path* and return ``(header, rows)``.

    The first row is returned as the header and every following row as
    data.  The file is closed as soon as it has been read.

    NOTE(review): assumes the data files are UTF-8 encoded -- confirm.
    """
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, encoding='utf8', newline='') as csv_file:
        rows = list(reader(csv_file))
    return rows[0], rows[1:]


def _is_free(price):
    """Return True when the *price* string parses to zero.

    A leading '$' is ignored; a value that cannot be parsed as a number
    is treated as not free.
    """
    try:
        return float(price.strip().lstrip('$')) == 0
    except ValueError:
        return False


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the table size.

    Args:
        dataset: list of rows (each row a list of column values).
        start: index of the first row to print.
        end: index one past the last row to print.
        rows_and_columns: when true, also print the number of rows and the
            number of columns (taken from the first row; 0 if the dataset
            is empty).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # visually separates consecutive rows

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure.
        print('Number of columns:', len(dataset[0]) if dataset else 0)


def english_only(language):
    """Return True if the string *language* looks like English text.

    Characters outside the ASCII range (0-127) are counted; the string is
    considered non-English once more than three such characters are found.
    """
    non_ascii = 0
    for char in language:
        if ord(char) > MAX_ASCII:
            non_ascii += 1
            if non_ascii > MAX_NON_ASCII_CHARS:
                return False  # no need to scan the rest of the string
    return True


def freq_table(dataset, index):
    """Return ``{value: percentage}`` for the column *index* of *dataset*.

    Percentages are relative to the number of rows and add up to 100.
    An empty dataset yields an empty dict.
    """
    counts = Counter(row[index] for row in dataset)
    total = sum(counts.values())
    return {value: (count / total) * 100 for value, count in counts.items()}


def display_table(dataset, index, label):
    """Print the frequency table of column *index* in descending order.

    Args:
        dataset: list of rows.
        index: column whose values are tallied.
        label: column title shown (upper-cased) in the printed heading.
    """
    table = freq_table(dataset, index)
    # Sorting (percentage, value) tuples orders by percentage first and
    # breaks ties on the value itself.
    table_sorted = sorted(
        ((percentage, value) for value, percentage in table.items()),
        reverse=True,
    )

    print("Column: " + "***" + label.upper() + "***")
    for percentage, value in table_sorted:
        print(value, ':', percentage)


# The analysis below only runs when the file is executed as a script, so
# the helper functions above can be imported without touching the CSV files.
if __name__ == '__main__':

    # In[1]:

    ### Google Play data set ###
    android_header, android = _load_csv('googleplaystore.csv')

    ### App Store data set ###
    ios_header, ios = _load_csv('AppleStore.csv')

    print(android_header)
    print('\n')
    explore_data(android, 0, 3, True)

    # In[2]:

    # NOTE(review): the index of the malformed row is hard-coded for the
    # current version of googleplaystore.csv -- re-check if the file changes.
    print(android[10472])  # incorrect row
    print('\n')
    print(android_header)  # header
    print('\n')
    print(android[0])  # correct row
    del android[10472]
    print(android[10472])  # the row that moved into the freed position

    # In[3]:

    ### Google Play data set has duplicates, so let's get rid of them! ###
    for app in android:
        name = app[0]
        if name == 'Instagram':
            print(app)

    # In[4]:

    ### I'm not going to remove duplicates randomly.
    ### Rather, for every app I keep the entry with the highest number of
    ### reviews (the first such entry when several tie) ###
    seen_duplicates = []
    unique_apps = []
    unique_names = set()  # O(1) membership test alongside the ordered list
    for app in android:
        name = app[0]
        if name in unique_names:
            seen_duplicates.append(name)
        else:
            unique_names.add(name)
            unique_apps.append(name)

    print('Number of duplicates:', len(seen_duplicates))
    print('\n')
    print('Number of unique', len(unique_apps))
    print('Examples of duplicate apps:', seen_duplicates[:15])

    # In[5]:

    ### Removing duplicate entries: first record the highest review count
    ### per app name, then build the cleaned data set from the rows that
    ### carry that count ###
    reviews_max = {}
    for app in android:  # the header was already split off, so use every row
        name = app[0]
        n_reviews = float(app[3])
        if name not in reviews_max or reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews

    print('Expected length:', len(android) - len(seen_duplicates))
    print('Actual length:', len(reviews_max))

    android_clean = []
    already_added = set()
    for app in android:
        name = app[0]
        n_reviews = float(app[3])
        # `already_added` keeps only one row when several entries share the
        # same maximum review count.
        if n_reviews == reviews_max[name] and name not in already_added:
            android_clean.append(app)
            already_added.add(name)

    # In[6]:

    ### exploring android_clean data set to ensure it displays as expected ###
    explore_data(android_clean, 0, 3, True)

    # In[7]:

    ## english_only() takes a string and reports whether it has at most
    ## three characters outside the ASCII range (0-127)
    print(english_only('Instagram'))
    print(english_only('爱奇艺PPS -《欢乐颂2》电视剧热播'))
    print(english_only('Docs To Go™ Free Office Suite'))
    print(english_only('Instachat 😜'))

    # In[8]:

    ## keep only the English-named Android and iOS apps and find out how
    ## many of each we have ###
    android_english = [app for app in android_clean if english_only(app[0])]
    ios_english = [app for app in ios if english_only(app[1])]

    explore_data(android_english, 0, 3, True)
    print('\n')
    explore_data(ios_english, 0, 3, True)

    # In[9]:

    ## isolating the free android and iOS apps ###
    # NOTE(review): assumes the price columns are named 'Price' (Google
    # Play) and 'price' (App Store) -- confirm against the CSV headers.
    # A wrong name raises ValueError here instead of silently mis-filtering.
    android_price_index = android_header.index('Price')
    ios_price_index = ios_header.index('price')

    android_free = [
        app for app in android_english if _is_free(app[android_price_index])
    ]
    ios_free = [app for app in ios_english if _is_free(app[ios_price_index])]

    # In[10]:

    ### We want to find an app profile that fits both the App Store and
    ### Google Play because, by analyzing apps that are successful on both,
    ### we have a way to measure the threshold of entry in order to
    ### be/remain competitive on those platforms ###
    explore_data(android_english, 0, 3, True)

    # In[11]: freq_table() and display_table() are defined at the top of
    # the file.

    # In[12]:

    display_table(ios, -5, "ios")

    # In[13]:

    display_table(android_clean, 1, "Category")

    # In[14]:

    display_table(android_clean, -4, "Genres")

    # In[15]:

    # A bare expression is only displayed inside a notebook; print it so
    # the script shows the table as well.
    print(freq_table(ios, -5))

    # In[27]:

    ### average of column 5 per genre (column -5) for the App Store apps ###
    # NOTE(review): column 5 is presumably the number of user ratings --
    # confirm against the CSV header.
    genres_ios = freq_table(ios, -5)

    # One pass over the data collects the per-genre sum and row count.
    ratings_total = defaultdict(float)
    apps_per_genre = Counter()
    for app in ios:
        genre_app = app[-5]
        ratings_total[genre_app] += float(app[5])
        apps_per_genre[genre_app] += 1

    for genre in genres_ios:
        # Every genre in genres_ios comes from at least one row of `ios`,
        # so the divisor is never zero.
        avg_user_ratings = ratings_total[genre] / apps_per_genre[genre]
        print(genre)
        print(avg_user_ratings)

#