#!/usr/bin/env python
# coding: utf-8

# # Matching App Type & Number of Users
#
# * Our goal for this project is to analyze data to help our developers understand what type of apps are likely to attract more users.
# * Cost of app is free, revenue to be generated by advertisement views.
# * Two app sources are considered, Google Play for Android users and the Apple App Store for iOS users.

# In[1]:


import csv


def opener(file):
    """Read a CSV file and return all of its rows (header included) as a list of lists.

    file: path to the CSV file to read.
    Returns: list of rows, each row a list of column strings.
    """
    # A with-statement guarantees the handle is closed even if parsing fails;
    # the original opened the file and never closed it (resource leak).
    with open(file) as opened_file:
        return list(csv.reader(opened_file))


ios_data = opener('AppleStore.csv')
android_data = opener('googleplaystore.csv')


# In[2]:


# function to isolate given number of rows in data set
def slicer(data_file, start, stop):
    """Print rows data_file[start:stop]; the final row is followed by '***'."""
    rows = data_file[start:stop]  # renamed from 'slice', which shadowed the builtin
    for position, row in enumerate(rows):
        print(row)
        # Compare by index, not by value: the original 'row == slice[-1]' test
        # printed the '***' terminator early whenever an earlier row happened
        # to equal the last row of the slice.
        if position == len(rows) - 1:
            print('***')
        else:
            print('\n')


# function to count the number of total rows (including any header if present) and number of columns.
def row_column_counter(data_file):
    """Print the total number of rows and the column count of the first row."""
    print('There are ' + str(len(data_file)) + ' rows.')
    print('There are ' + str(len(data_file[0])) + ' columns.')


# __Here is the header row for the iOS data file, and the first two rows as examples__
#
# There are about 7000 apps and 16 data columns.
# The original data set is available here: https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps # In[3]: slicer(ios_data, 0, 3) row_column_counter(ios_data) # __Most of the iOS headings are self-explanatory, but for clarity here is a chart:__ # # |Heading |Definition | Heading | Defintion | # |:-- |:-- |:-- |:-- | # |id |App ID |user_rating_ver|Avg User Rating (current version) | # |track_name |App Name |ver |Latest Version Code | # |size_bites |Size(in Bytes) |cont_rating |Content Rating | # |currency |Currency Type |prime_genre |Primary Genre | # |price |Price |sup_devices.num|Number of Supporting Devices | # |rating_count_tot|User Rating Count (all versions) |ipadSc_urls.num|Number of Screenshots Shown for Display| # |rating_count_ver|User Rating Count (current version)|lang.num |Number of SUpported Languages | # |user_rating |Avg User Rating (all versions) |vpp_lic |Vpp Device Based LIcensing Available | # __Here is the header row for the Android data file, and the first two rows as exmaples__ # # There are about 10,000 rows and 13 data columns. # The original data set is available here: https://www.kaggle.com/lava18/google-play-store-apps # In[4]: slicer(android_data, 0, 3) row_column_counter(android_data) # __And here is a chart detailing the android headings:__ # # |Heading |Definition |Heading |Definition | # |:-- |:-- |:-- |:-- | # |App |Applicantion Name |Price |Price | # |Category|Category |Content Rating|Target Age Group | # |Rating |User Rating |Genres |Genres | # |Reviews |User Rating Count |Last Updated |Last Update(when scraped) | # |Size |Size(in Megabytes) |Current Ver |Current Version | # |Installs|Number of downloads|Android Ver |Minimum Required Version of Android| # |Type |Paid or Free | # # __The header can be removed from both data sets.__ # # This makes the code easier to read and reduces the chance of bugs. 
# In[5]:

print(len(android_data))
print(android_data[0])

# Drop the header row from each data set so every remaining row is app data.
android_data = android_data[1:]
ios_data = ios_data[1:]

print(len(android_data))
print(android_data[0])


# __Below is an example of a corrupted row.__
#
# In this case it can be deleted, but that could cause a problem with the data set.
# Here is a link to the discussion of the error: https://www.kaggle.com/lava18/google-play-store-apps/discussion/164101
#
# The entry for the category column is missing.

# In[6]:

# Row 10472 google data (header exclusive)
print(len(android_data[10472]))
print(android_data[10472])
print(len(android_data[10473]))
print(android_data[10473])

# Curious to see if there are any other rows missing entries.
short_rows = [row for row in android_data if len(row) != 13]
print(len(short_rows))


# __A quick look shows that the only row with this problem is the one already identified__
#
# Filling the variable or using ```del``` are good options, but if the block is run more than once ```del``` will erase whatever new data is in the index. Sometimes the best practice is to leave the original data set unchanged. In that case it is possible to create a copy and fill it with the 'good' data.
# # * print the length of a data set
# * print the length of a copy of the data set
# * use a function to add all rows that are the same length as the header in the original data set to the copy
# * print the lengths of the two data sets again

# In[7]:


data_copy = []
print('The android data set length is ' + str(len(android_data)))
print('The android data set copy length is ' + str(len(data_copy)))


def refiner(data_set, destination=None):
    """Append every row of data_set whose length matches the first row to destination.

    data_set: list of rows; the first row defines the expected column count.
    destination: list that receives the well-formed rows. Defaults to the
        module-level data_copy, preserving the original behavior; passing an
        explicit list removes the hidden dependency on that global.
    Returns a short status string.
    """
    if destination is None:
        destination = data_copy
    expected = len(data_set[0])  # hoisted: loop-invariant
    for row in data_set:
        if len(row) == expected:
            destination.append(row)
    return 'Data set copied'


refiner(android_data)
print('The android data set length is ' + str(len(android_data)))
print('The android data set copy length is ' + str(len(data_copy)))


# __For ease in the rest of the project, the original Android data can be modified.__

# In[8]:


android_data = data_copy
print(len(android_data))


# __Many data sets will contain duplicate data which needs to be consolidated or removed.__
#
# In this case there are three entries for 'Slack'. One of them has a unique value in the fourth position, number of reviews (51510 vs 51507).

# In[9]:


for row in android_data:
    app_name = row[0]
    if app_name == 'Slack':
        print(row)


# __The code below will compile the data in two lists__
#
# One will have unique entries, and the other duplicated entries. Then it will print the first ten entries of the duplicate list as an example, and the number of times the app 'Slack' appears in both lists.
# In[10]:


duplicate_apps = []
unique_apps = []
seen = set()  # O(1) membership test; scanning unique_apps made this loop O(n^2)

for row in android_data:
    name = row[0]
    if name in seen:
        duplicate_apps.append(name)
    else:
        seen.add(name)
        unique_apps.append(name)

print('Number of unique apps:', len(unique_apps))
print('Number of duplicate apps', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:10])
print('\n')

# list.count replaces the two hand-written counting loops.
slack_count = unique_apps.count('Slack') + duplicate_apps.count('Slack')
print('The number of times "Slack" appears in either data set:' , slack_count)


# __One criteria for which entry to retain is to keep the one with the highest number of reviews.__
#
# Below the code loops through all the data and builds a dictionary with the app name as the key and the number of reviews as the corresponding value. If the code finds an entry where the name already exists in the dictionary, it will keep whichever entry has the most reviews.
#
# For the example, we want to keep the third entry for 'Slack', where the User Rating Count = 51510. The code also shows that the ```reviews_max``` dictionary is the same length as the ```unique_apps``` list above.

# In[11]:


reviews_max = {}

for row in android_data:
    name = row[0]
    n_reviews = float(row[3])
    # One combined test replaces the original pair of ifs, the second of which
    # re-checked a membership that was guaranteed true. Same resulting dict.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

print('The number of reviews for "Slack":', reviews_max['Slack'])
print('The number of unique entries:', len(reviews_max))


# __In this loop the entire row is added to a new cleaned data set.__
#
# If the User Review Count in the original data set matches the value from the ```reviews_max``` data set built above, the row will be added.
#
# The output confirms that the length of the new data set matches the length of both the ```reviews_max``` data set and the ```unique_apps``` data sets. Then ```android_data``` is set equal to ```android_clean```.
# It also shows the correct User Rating Count value for the 'Slack' application.

# In[12]:


android_clean = []
already_added = set()  # set: O(1) lookup instead of scanning a growing list per row

print('The length of the android data set:', len(android_data))

for row in android_data:
    name = row[0]
    n_reviews = float(row[3])
    # Keep the first row whose review count equals the per-app maximum; the
    # already_added guard handles apps with several rows tied at the maximum.
    if reviews_max[name] == n_reviews and name not in already_added:
        android_clean.append(row)
        already_added.add(name)

android_data = android_clean
print('The length of the android_clean data set:', len(android_clean))
print('The length of the android data set:', len(android_data))
print('\n')

for row in android_clean:
    if row[0] == 'Slack':
        print(row)


# __The iOS data set uses an ID number for each application.__
#
# It is straightforward to use this and see if there are any duplicate entries.

# In[13]:


print('The total number of entries in the iOS data set:', len(ios_data))

# Only membership tests and the final unique count are needed, so a set
# replaces the original list (whose 'in' check was O(n) per row).
ios_id_nums = set()
for row in ios_data:
    if row[0] not in ios_id_nums:
        ios_id_nums.add(row[0])
    else:
        print('This ID number already exists:', row[0])

print('The total number of unique ID numbers in the iOS data set:', len(ios_id_nums))


# __Any title with three or more characters outside the common English character set will be removed from the data set.__
#
# This allows for some special characters in titles, but limits the likelihood the application will be intended for a non-English speaking audience.
#
# In the code blocks below the function is written and called on a sample dataset.
# In[14]:


string1 = 'Instagram'
string2 = '爱奇艺PPS -《欢乐颂2》电视剧热播'
string3 = 'Docs To Go™ Free Office Suite'
string4 = 'Instachat 😜'


def is_common_english(string):
    """Return False if string contains 3 or more characters outside ASCII, else True.

    Renamed from common_english_character so the 4-argument data-set version
    below no longer silently shadows it. The original also fell through and
    returned None (printed as 'None') for English titles; it now returns True
    explicitly.
    """
    c_count = 0
    for character in string:
        if ord(character) > 127:
            c_count += 1
            if c_count == 3:
                return False
    return True


print(is_common_english(string1))
print(is_common_english(string2))
print(is_common_english(string3))
print(is_common_english(string4))


# In[15]:


dataset = [
    ['Instagram'],
    ['爱奇艺PPS -《欢乐颂2》电视剧热播'],
    ['Docs To Go™ Free Office Suite'],
    ['Instachat 😜']]

cec_dataset = []
non_english_dataset = []


def common_english_character(dataset1, dataset2, dataset3, title_index):
    """Partition dataset1 rows by app title.

    dataset1: list of rows to classify.
    dataset2: receives rows whose title has fewer than 3 non-ASCII characters.
    dataset3: receives the remaining rows.
    title_index: column index of the app name within each row.
    Rows are appended in their original order; nothing is returned.
    """
    for row in dataset1:
        app_name = row[title_index]
        c_count = sum(1 for character in app_name if ord(character) > 127)
        if c_count < 3:
            dataset2.append(row)
        else:
            dataset3.append(row)


common_english_character(dataset, cec_dataset, non_english_dataset, 0)
print(dataset)
print(cec_dataset)
print(non_english_dataset)


# __The ```common_english_character``` function can be called on the Android and iOS data sets.__
#
# In the second block the function is called on the data set built in the first block, to prove that it does not contain any more rows that meet the criteria. Then the function is called on the iOS data and both original data sets are reassigned.
# In[16]:


cec_android = []
non_english_android = []

common_english_character(android_data, cec_android, non_english_android, 0)

print(len(android_data))
print(len(cec_android))
print(len(non_english_android))
print(non_english_android[:5])


# In[17]:


cec_android_the_second = []

# Running the filter a second time should move nothing new into the reject list.
common_english_character(cec_android, cec_android_the_second, non_english_android, 0)

print(len(android_data))
print(len(cec_android))
print(len(cec_android_the_second))
print(len(non_english_android))
print(non_english_android[:5])


# In[18]:


cec_ios = []
non_english_ios = []

common_english_character(ios_data, cec_ios, non_english_ios, 1)

print(len(ios_data))
print(len(cec_ios))
print(len(non_english_ios))


# In[19]:


android_data = cec_android
ios_data = cec_ios

print(len(android_data))
print(len(ios_data))


# __Both data sets can easily be cleaned of any applications with a price other than 0.__

# In[20]:


free_android = []
charge_android = []

print(len(android_data))
print(len(free_android))
print(len(charge_android))

for row in android_data:
    # Column 7 holds the price string; '0' marks a free app.
    bucket = free_android if row[7] == '0' else charge_android
    bucket.append(row)

print(len(android_data))
print(len(free_android))
print(len(charge_android))

android_data = free_android


# In[21]:


free_ios = []
charge_ios = []

print(len(ios_data))
print(len(free_ios))
print(len(charge_ios))

for row in ios_data:
    # Column 4 holds the price string; '0.0' marks a free app.
    bucket = free_ios if row[4] == '0.0' else charge_ios
    bucket.append(row)

print(len(ios_data))
print(len(free_ios))
print(len(charge_ios))

ios_data = free_ios

# Sanity check: the free list should contain no priced apps.
for row in free_ios:
    if row[4] != '0.0':
        print(row)

print(len(ios_data))


# __The client wants to build the app for the Android environment initially, and once it is shown to be successful to build it for iOS.__
#
# * A minimal version for Android will be created.
# * A refined version will be published based on user response.
# * After six months of profitability an iOS version will be built.
#
# A frequency table will show the most common genres for each environment.
# In[22]:

# Interesting to see how sometimes the category and genre is identical
# and sometimes it shows a lot of sub-division.

# Slicing replaces the original counter loop: 1-based positions 5, 10, 15, ...
# are exactly android_data[4::5].
every_fifth = android_data[4::5]

# for row in every_fifth:
#     print(row[1], row[9])


# __Kludgey part with a sorted tuple.__

# In[23]:


def freq_table(dataset, index):
    """Return a dict mapping each value found at row[index] to the percentage
    of rows in dataset that carry that value.

    dataset: list of rows.
    index: column to tabulate.
    """
    table = {}
    total = 0
    for row in dataset:
        total += 1
        key = row[index]
        if key in table:
            table[key] += 1
        else:
            table[key] = 1

    table_percentages = {}
    for value in table:
        table_percentages[value] = (table[value] / total) * 100
    return table_percentages


def display_table(dataset, index):
    """Print the frequency table for column index sorted by percentage
    (descending) and return the sorted (percentage, value) tuples.

    The original returned None, so callers that assigned and printed the
    result (e.g. ios_primary_genre below) printed 'None'.
    """
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        table_display.append((table[key], key))

    table_sorted = sorted(table_display, reverse=True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
    return table_sorted


# In[24]:


android_category = display_table(android_data, 1)
# print(android_category)


# In[25]:


android_genre = display_table(android_data, 9)
# print(android_genre)


# In[26]:


ios_primary_genre = display_table(ios_data, 11)
print(ios_primary_genre)


# In[27]:


android_family = [row for row in android_data if row[1] == "FAMILY"]
# Bare expression: displays the list in a notebook; it is a no-op as a script.
android_family


# __A quick look at the results shows the large number of games among free apps.__
#
# Expanding the android data to look at apps within the ```FAMILY``` category shows that most of them are games.
#
# While most of the apps in the data set are games, it is uncertain if they are the most popular apps.

# In[28]:


android_category_freq_table = freq_table(android_data, 1)
print(android_category_freq_table)


# In[29]:


n_installs = '100,000+'
n_installs = n_installs.replace('+', '').replace(',', '')
print(n_installs)