def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its overall dimensions.

    Args:
        dataset: Sequence of rows, each row itself a sequence of columns.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns found in the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # print() appends its own newline, so two blank lines follow each row

    if rows_and_columns:
        print("Number of rows: ", len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        print('Number of columns: ', len(dataset[0]) if len(dataset) > 0 else 0)
def print_column_names(dataset):
    """Print every item of ``dataset`` on its own line."""
    lines = [str(entry) for entry in dataset]
    if lines:
        print('\n'.join(lines))
The Google Play Store column name descriptions can be found here --> Google Play Column Names.
The Apple Store column name descriptions can be found here --> Apple Column Names.
from csv import reader
# Load both CSV exports into lists of rows. Each file is opened in a `with`
# block so the handle is closed as soon as its rows have been read.
# newline='' is what the csv module expects from the file object, and the
# explicit encoding avoids depending on the platform default.
# NOTE(review): assumes both exports are UTF-8 encoded -- confirm.
print('Apple Store File...\n')
with open('AppleStore.csv', encoding='utf8', newline='') as apple_store_file:
    # reader() MUST be used to properly build a list of rows
    read_apple_store_file = reader(apple_store_file)
    apple_store_list = list(read_apple_store_file)
explore_data(apple_store_list, 0, 3, True)

print('\nAndroid File....\n')
with open('googleplaystore.csv', encoding='utf8', newline='') as android_file:
    read_android_file = reader(android_file)
    android_list = list(read_android_file)
explore_data(android_list, 0, 4, True)

# Print the first Apple row as a numbered (1-based) list of its items.
print(len(apple_store_list[0]))
for i, item in enumerate(apple_store_list[0], start=1):
    print(i, ':', item)
Below is a check to determine whether any row has fewer than the expected number of columns, which is 13.
# Report every row of android_list that has fewer columns than expected,
# together with its index in android_list.
EXPECTED_ANDROID_COLUMNS = 13
for i, item in enumerate(android_list):
    if len(item) < EXPECTED_ANDROID_COLUMNS:
        print(item)
        print('Number of columns: ', len(item))
        print('Index of item: ', i)
        print('\n')
Below is a function to locate any rows with missing columns.
def missing_columns(dataset):
    """Print every row that has fewer columns than the first row.

    The first row of ``dataset`` sets the expected column count and is not
    itself checked. An empty ``dataset`` prints nothing.
    """
    if len(dataset) == 0:
        # No first row to take the expected column count from.
        return

    column_size = len(dataset[0])
    for row in dataset[1:]:
        if len(row) < column_size:
            print('Missing Column Info: ', row)
Below is a function to remove a bad row of data in a given data set.
def remove_row(dataset, index):
    """Delete the entry at ``index`` from ``dataset`` in place.

    Nothing is returned; the object passed in by the caller is modified.
    """
    del dataset[index]
# Show row 10473 and the list length, remove the row, then show both again.
# NOTE(review): presumably row 10473 is the short row reported by the
# column-count check earlier in the file -- confirm.
# The deletion is guarded by a column-count comparison against the first row
# so that running this cell a second time does not delete whichever complete
# row has moved into position 10473.
print(android_list[10473])
print(len(android_list))
print('\n')
if len(android_list[10473]) < len(android_list[0]):
    remove_row(android_list, 10473)
print('\n')
print(android_list[10473])
print(len(android_list))
The data set contains duplicate rows: some apps are listed more than once, even though each app should appear only once. For example:
# Collect the names that occur more than once in android_list (column 0).
duplicate_examples = []
unique_apps = []
seen_names = set()  # O(1) membership test; unique_apps keeps first-seen order
for row in android_list:
    app_name = row[0]
    if app_name in seen_names:
        duplicate_examples.append(app_name)
    else:
        seen_names.add(app_name)
        unique_apps.append(app_name)
print("Number of duplicates: ", len(duplicate_examples))
print("First Few Duplicates: ", duplicate_examples[0:4])
The duplicates will not be removed randomly as we want to keep the most up-to-date review count, which would be the highest. Below is a method for collecting the duplicate apps. The date or the number of reviews will be used to determine the most recent app to keep.
def _split_by_name(dataset, name_index):
    """Return ``(unique_names, duplicate_names)`` for the given name column.

    A name goes to ``unique_names`` the first time it is seen and to
    ``duplicate_names`` on every later occurrence. A set is used for the
    membership test so the scan stays linear in the number of rows.
    """
    unique, duplicates, seen = [], [], set()
    for app in dataset:
        name = app[name_index]
        if name in seen:
            duplicates.append(name)
        else:
            seen.add(name)
            unique.append(name)
    return unique, duplicates


# The name is read from column 0 of the Android rows and column 1 of the Apple rows.
android_unique, android_duplicates = _split_by_name(android_list, 0)
print(len(android_duplicates))

apple_unique, apple_duplicates = _split_by_name(apple_store_list, 1)
print('Apple Unique Length: ', len(apple_unique))
print('Apple Duplicate Length: ', len(apple_duplicates))
To determine whether an app already exists in the dictionary ('reviews_max'), there are two 'if' statements. The second one uses the 'not in' operator rather than 'else'. This is because duplicates may have varying numbers of reviews: say the first occurrence has 1, the second 100, and a third 50. The first occurrence's app name and total reviews are put into the dictionary. Then the first duplicate is found, which has 100 reviews; the first 'if' statement evaluates to 'True' and the stored count is raised to 100. When the second duplicate (50 reviews) is found, the first 'if' statement evaluates to 'False', which would then execute the 'else' branch if there were one. Doing so would write the second duplicate's count into the dictionary, overwriting the higher count and defeating the purpose of keeping the maximum.
When using the 'not in' operator, the second duplicate (the app with 50 reviews) will result in 'False' in the first 'if' statement, and since the app already exists as a key in the dictionary, the second 'if' statement will also evaluate to 'False'. This is fine, because the app has already been added with its highest review count.
# Pass 1: record the highest review count (column 3) seen for every app name
# (column 0).
reviews_max = {}
for row in android_list[1:]:
    name = row[0]
    n_reviews = float(row[3])
    # Store the count when the app is new OR when it beats the stored
    # maximum. A bare `else` after the "beats the maximum" test would also
    # fire for lower counts and overwrite the maximum, so it must not be used.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
print('Max Length: ', len(reviews_max))

# Pass 2: for each name keep the first row whose review count equals that
# maximum; later rows with the same name are skipped.
android_clean = []
added = []
added_lookup = set()  # mirrors `added` so the membership test is O(1)
for row in android_list[1:]:
    name = row[0]
    n_reviews = float(row[3])
    if n_reviews == reviews_max[name] and name not in added_lookup:
        android_clean.append(row)
        added.append(name)
        added_lookup.add(name)
print('Clean: ', len(android_clean))
def check_string(string):
    """Return True when no character of ``string`` has a code point above 127.

    An empty string yields True.
    """
    # NOTE: a second function with this same name is defined right after this
    # one, so this version is replaced as soon as that definition runs.
    return all(ord(symbol) <= 127 for symbol in string)
def check_string(string, max_non_ascii=3):
    """Return True when ``string`` has at most ``max_non_ascii`` non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.

    Args:
        string: Text to inspect.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        False when the count exceeds ``max_non_ascii``, True otherwise
        (including for an empty string).
    """
    # ord() never returns a negative number, so only the upper bound matters.
    non_ascii = sum(1 for char in string if ord(char) > 127)
    return non_ascii <= max_non_ascii
# Empty placeholders for the character-filter results: one (kept, rejected)
# pair of lists per store.
android_char_cleaned, android_char_foreign = [], []
apple_char_cleaned, apple_char_foreign = [], []
def app_char_check(dataset, index_position):
    """Split ``dataset`` by how many non-ASCII characters each app name has.

    ``index_position`` is the column holding the app name (0 for the Android
    data, 1 for the Apple data). A row whose name has more than three
    characters with a code point above 127 goes to the second list; every
    other row goes to the first.

    Returns:
        A ``(cleaned_list, dirty_list)`` tuple of row lists.
    """
    kept_rows, rejected_rows = [], []
    for record in dataset:
        non_ascii = sum(1 for symbol in record[index_position] if ord(symbol) > 127)
        bucket = rejected_rows if non_ascii > 3 else kept_rows
        bucket.append(record)
    return kept_rows, rejected_rows
# Run the character filter on both stores. The app name is column 1 in the
# Apple rows and column 0 in the Android rows.
apple_char_cleaned, apple_char_foreign = app_char_check(apple_store_list, 1)
android_char_cleaned, android_char_foreign = app_char_check(android_clean, 0)

# Report the size of each resulting list (print the *_foreign lists
# themselves to inspect the rejected rows).
for label, rows in (
    ('Apple Cleaned Length: ', apple_char_cleaned),
    ('Apple Original File Length: ', apple_store_list),
    ('Apple Dirty Length: ', apple_char_foreign),
    ('Length of New Anroid: ', android_char_cleaned),
    ('Length of Dirty Anroid: ', android_char_foreign),
):
    print(label, len(rows))
Below is a function to filter out the free apps into a new list.
def get_free_apps(dataset, index):
    """Return the rows of ``dataset`` whose price column equals zero.

    The first row of ``dataset`` is always skipped. ``index`` is the position
    of the price column: 4 for the Apple data, 7 for the Android data. Any
    '$' in the price text is removed before it is converted to ``float``.
    """
    return [
        record
        for record in dataset[1:]
        if float(record[index].replace('$', '')) == 0
    ]
apple_free_apps = get_free_apps(apple_char_cleaned, 4)
# get_free_apps always skips the first row of what it is given.
# android_char_cleaned descends from android_list[1:] and therefore has no
# header row, so the header is put back in front here; otherwise the first
# Android app would be silently dropped.
android_free_apps = get_free_apps(android_list[:1] + android_char_cleaned, 7)
print("Apple Free Apps", len(apple_free_apps))
print("Android Free Apps", len(android_free_apps))

# Spot check of check_string on a name made of four non-ASCII characters.
example = '爱奇艺艺'
results = check_string(example)
print(results)
We are looking for the most popular app genres in both the Apple Store and Android Store datasets, because we want to build an app for the audience with the best chance of success on BOTH platforms. We will analyze the 'genre' columns of both datasets to get a good idea of which genres offer the best chances of success, measured by adoption and usage.
# Peek at one column of the second free app in each store:
# column 11 for the Apple rows, column 9 for the Android rows.
for app in apple_free_apps[1:2]:
    print(app[11])
for app in android_free_apps[1:2]:
    print(app[9])
Below is a function that creates a frequency table of column topics such as app genre.
def freq_table(dataset, index_value):
    """Return the percentage of rows taken by each value of one column.

    Args:
        dataset: List of rows.
        index_value: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct column value to its share of
        ``len(dataset)``, expressed as a percentage. Keys appear in the order
        the values are first seen; an empty ``dataset`` gives an empty dict.
    """
    counts = {}
    for record in dataset:
        label = record[index_value]
        counts[label] = counts.get(label, 0) + 1

    row_total = len(dataset)
    return {label: (seen / row_total * 100) for label, seen in counts.items()}
def _by_share_desc(table):
    """Return the table's (name, percentage) pairs, largest percentage first."""
    return sorted(table.items(), key=lambda pair: pair[1], reverse=True)


# Percentage tables: column 11 of the Apple rows, columns 1 and 9 of the
# Android rows.
apple_freq_result = freq_table(apple_free_apps, 11)
android_freq_CATEGORY_result = freq_table(android_free_apps, 1)
android_freq_GENRE_result = freq_table(android_free_apps, 9)

sorted_apple_freq_result = _by_share_desc(apple_freq_result)
sorted_android_freq_CATEGORY_result = _by_share_desc(android_freq_CATEGORY_result)
sorted_android_freq_GENRE_result = _by_share_desc(android_freq_GENRE_result)

print('Apple genres: ', sorted_apple_freq_result, '\n')
print('Android Categories: ', sorted_android_freq_CATEGORY_result, '\n')
print('Android Genres: ', sorted_android_freq_GENRE_result)
# For a one-entry-per-line listing, loop over any of the sorted lists and
# print row[0], ':', row[1].
This function will create a frequency table for one column and then, for each value in that column, drill down to return the sum of another column. Example: a frequency table of genres, together with the total number of installs for each genre.
def apps_most_used(dataset, genre_index, install_index):
    """Print the ``install_index`` column of every row in ``dataset``.

    NOTE(review): this looks like an unfinished stub. ``genre_index`` is kept
    in the signature for callers but is not used, and no frequency table is
    built or returned yet.
    """
    for row in dataset:
        print(row[install_index])
def apple_store_genre_usage(dataset):
    """Print the average of column 5 for each distinct value of column 11.

    Column 11 is used as the genre and column 5 as the rating count (converted
    with ``float``). Every row of ``dataset`` is treated as data, including
    the first one. One ``genre : average`` line is printed per genre, in the
    order the genres first appear. Nothing is returned.
    """
    genre_dictionary = {}  # genre -> number of apps
    rating_totals = {}     # genre -> summed rating counts
    # A single pass gathers both the count and the sum per genre, so no genre
    # can end up with a zero count and no row is left out of its average.
    for row in dataset:
        name = row[11]
        genre_dictionary[name] = genre_dictionary.get(name, 0) + 1
        rating_totals[name] = rating_totals.get(name, 0.0) + float(row[5])

    for name, app_count in genre_dictionary.items():
        avg_num_user_ratings = rating_totals[name] / app_count
        print(name, ':', avg_num_user_ratings)
# Print the per-genre averages for the free Apple apps (apple_free_apps).
# apple_store_genre_usage reads the genre (i.e. 'prime_genre') from index 11
# and the rating count ('rating_count_total') from index 5.
apple_store_genre_usage(apple_free_apps)
# Average number of installs per Android category.
# android_free_apps is the dataset; the category is at index 1 and the
# installs at index 5. The installs text has '+' and ',' removed before it is
# converted to float.
freq_dictionary = {}  # category -> number of apps
install_totals = {}   # category -> summed installs
# One pass collects the count and the sum for every category, instead of
# rescanning the whole dataset once per category.
for row in android_free_apps:
    name = row[1]
    install_number = float(row[5].replace('+', '').replace(',', ''))
    freq_dictionary[name] = freq_dictionary.get(name, 0) + 1
    install_totals[name] = install_totals.get(name, 0.0) + install_number

# Categories are printed in the order they first appear in the dataset.
for name, count in freq_dictionary.items():
    avg_installs = install_totals[name] / count
    print(name, ':', avg_installs)
# Average number of installs per Android genre.
# The genre is at index 9 and the installs at index 5 of each row in
# android_free_apps. The installs text has '+' and ',' removed before it is
# converted to float.
freq_dictionary = {}  # genre -> number of apps
install_totals = {}   # genre -> summed installs
# One pass collects the count and the sum for every genre, instead of
# rescanning the whole dataset once per genre.
for row in android_free_apps:
    name = row[9]
    install_count = float(row[5].replace('+', '').replace(',', ''))
    freq_dictionary[name] = freq_dictionary.get(name, 0) + 1
    install_totals[name] = install_totals.get(name, 0.0) + install_count

# Genres are printed in the order they first appear in the dataset.
for name, count in freq_dictionary.items():
    print(name, ':', install_totals[name] / count)