def opener(file):
import csv
from csv import reader
opened_file = open(file)
read_file = reader(opened_file)
data_file = list(read_file)
return data_file
ios_data = opener('AppleStore.csv')
android_data = opener('googleplaystore.csv')
# function to isolate given number of rows in data set
def slicer(data_file, start, stop):
slice = data_file[start:stop]
for row in slice:
if row == slice[-1]:
print(row)
print('***')
else:
print(row)
print('\n')
# function to count the number of total rows (including any header if present) and number of columns.
def row_column_counter(data_file):
print('There are ' + str(len(data_file)) + ' rows.')
print('There are ' + str(len(data_file[0])) + ' columns.')
Here is the header row for the iOS data file, and the first two rows as examples. There are about 7000 apps and 16 data columns.
The original data set is available here: https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps
slicer(ios_data, 0, 3)
row_column_counter(ios_data)
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'] ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'] *** There are 7198 rows. There are 16 columns.
Most of the iOS headings are self-explanatory, but for clarity here is a chart:
Heading | Definition | Heading | Defintion |
---|---|---|---|
id | App ID | user_rating_ver | Avg User Rating (current version) |
track_name | App Name | ver | Latest Version Code |
size_bites | Size(in Bytes) | cont_rating | Content Rating |
currency | Currency Type | prime_genre | Primary Genre |
price | Price | sup_devices.num | Number of Supporting Devices |
rating_count_tot | User Rating Count (all versions) | ipadSc_urls.num | Number of Screenshots Shown for Display |
rating_count_ver | User Rating Count (current version) | lang.num | Number of SUpported Languages |
user_rating | Avg User Rating (all versions) | vpp_lic | Vpp Device Based LIcensing Available |
Here is the header row for the Android data file, and the first two rows as exmaples. There are about 10,000 rows and 13 data columns.
The original data set is available here: https://www.kaggle.com/lava18/google-play-store-apps
slicer(android_data, 0, 3)
row_column_counter(android_data)
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up'] *** There are 10842 rows. There are 13 columns.
And here is a chart detailing the android headings:
Heading | Definition | Heading | Definition |
---|---|---|---|
App | Applicantion Name | Price | Price |
Category | Category | Content Rating | Target Age Group |
Rating | User Rating | Genres | Genres |
Reviews | User Rating Count | Last Updated | Last Update(when scraped) |
Size | Size(in Megabytes) | Current Ver | Current Version |
Installs | Number of downloads | Android Ver | Minimum Required Version of Android |
Type | Paid or Free |
Below is an example of a corrupted row. In this case it can be deleted, but that could cause a problem with the data set. Here is a link to the discussion of the error: https://www.kaggle.com/lava18/google-play-store-apps/discussion/164101
The entry for the category column is missing. One solution would be to fill the missing variable. Or, the row can be removed using del
, but if the block is run multiple times it will remove more than the corrupted row.
# Row 10472 google data (header exclusive)
print(len(android_data[10473]))
print(android_data[10473])
print(len(android_data[10474]))
print(android_data[10474])
# Curious to see if there are any other rows missing entries.
short_rows = []
for row in android_data:
if len(row) != 13:
short_rows.append(row)
print(len(short_rows))
12 ['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up'] 13 ['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up'] 1
A quick look will show that the only row with this problem is the one already identified, so filling the variable or using del
are good options.
However, sometimes the best practice is to leave the original data set unchanged. In that case it is possible to create a copy and fill it with the 'good' data.
data_copy = []
print('The android data set length is ' + str(len(android_data)))
print('The android data set copy length is ' + str(len(data_copy)))
def refiner(data_set):
for row in data_set:
if len(row) == len(data_set[0]):
data_copy.append(row)
return 'Data set copied'
refiner(android_data)
print('The android data set length is ' + str(len(android_data)))
print('The android data set copy length is ' + str(len(data_copy)))
The android data set length is 10842 The android data set copy length is 0 The android data set length is 10842 The android data set copy length is 10841
For ease in the rest of the project, the original Android data can be modified.
android_data = data_copy
print(len(android_data))
10841
Many data sets will contain duplicate data which needs to be consolidated or removed. In this case there are three entries for 'Slack'. One of them has a unique value in the fourth poistion, number of reviews (51510 vs 51507).
for row in android_data:
app_name = row[0]
if app_name == 'Slack':
print(row)
['Slack', 'BUSINESS', '4.4', '51507', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device'] ['Slack', 'BUSINESS', '4.4', '51507', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device'] ['Slack', 'BUSINESS', '4.4', '51510', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device']
The code below will compile the data in two lists, one with unique entries and the other with duplicated entries. Then print the first ten entries of the duplicate list as an example and print the number of times the app 'Slack' appears in the both lists.
duplicate_apps = []
unique_apps = []
for row in android_data[1:]:
name = row[0]
if name in unique_apps:
duplicate_apps.append(name)
else:
unique_apps.append(name)
print('Number of unique apps:', len(unique_apps))
print('Number of duplicate apps', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:10])
print('\n')
slack_count = 0
for app_name in unique_apps:
if app_name == 'Slack':
slack_count += 1
for app_name in duplicate_apps:
if app_name == 'Slack':
slack_count += 1
print('The number of times "Slack" appears in either data set:' , slack_count)
Number of unique apps: 9659 Number of duplicate apps 1181 Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack'] The number of times "Slack" appears in either data set: 3
One criteria for which entry to retain is to keep the one with the highest number of reviews.
Below the code loops through all the data and builds a dictionary with the app name as the key and the number of reviews as the corresponding value. If the code finds an entry where the name already exists in the dictionary, it will keep whichever entry has the most reviews.
For the example, we want to keep the third entry for 'Slack', where the User Rating Count = 51510. The code also shows that the reviews_max
dictionary is the same length as the unique_apps
list above.
reviews_max = {}
for row in android_data[1:]:
name = row[0]
n_reviews = float(row[3])
if name not in reviews_max:
reviews_max[name] = n_reviews
if name in reviews_max and reviews_max[name] < n_reviews:
reviews_max[name] = n_reviews
print('The number of reviews for "Slack":', reviews_max['Slack'])
print('The number of unique entries:', len(reviews_max))
The number of reviews for "Slack": 51510.0 The number of unique entries: 9659
In this loop the entire row is added to a new cleaned data set if the User Review Count in the original data set matches the value from the reviews_max
data set built above.
The output confirms that the length of the new data set matches the length of both the reviews_max
data set and the unique_apps
data sets. It also shows the correct User Rating Count value for the 'Slack' application.
android_clean = []
already_added = []
for row in android_data[1:]:
name = row[0]
n_reviews = float(row[3])
if reviews_max[name] == n_reviews and name not in already_added:
android_clean.append(row)
already_added.append(name)
print('The length of the android_clean data set:', len(android_clean))
print('\n')
for row in android_clean:
if row[0] == 'Slack':
print(row)
The length of the android_clean data set: 9659 ['Slack', 'BUSINESS', '4.4', '51510', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'August 2, 2018', 'Varies with device', 'Varies with device']
The iOS data set uses an ID number for each application. It's straightforward to use this to see if there are any duplicate entries.
print('The total number of entries in the iOS data set:', len(ios_data))
ios_id_nums = []
for row in ios_data:
if row[0] not in ios_id_nums:
ios_id_nums.append(row[0])
else:
print('This ID number already exists:', row[0])
print('The total number of unique ID numbers in the iOS data set:', len(ios_id_nums))
The total number of entries in the iOS data set: 7198 The total number of unique ID numbers in the iOS data set: 7198
Any title with more than three characters outside the common English character set will be removed from the data set. This allows for some special characters in titles, but limits the likleyhood the application will be intended for a non-English speaking audience. In the code blocks below the function to do this is written, called on a sample dataset, and then called on the Android and iOS files.
string1 = 'Instagram'
string2 = '爱奇艺PPS -《欢乐颂2》电视剧热播'
string3 = 'Docs To Go™ Free Office Suite'
string4 = 'Instachat 😜'
def common_english_character(string):
c_count = 0
for character in string:
if ord(character) > 127:
c_count += 1
if c_count == 3:
return False
print(common_english_character(string1))
print(common_english_character(string2))
print(common_english_character(string3))
print(common_english_character(string4))
None False None None
dataset = [ ['Instagram'], ['爱奇艺PPS -《欢乐颂2》电视剧热播'], ['Docs To Go™ Free Office Suite'], ['Instachat 😜']]
non_english_dataset = []
def common_english_character(dataset1, dataset2):
for row in dataset:
c_count = 0
app_name = row[0]
for character in app_name:
if ord(character) > 127:
c_count += 1
if c_count >= 3:
dataset1.remove(row)
dataset2.append(row)
common_english_character(dataset, non_english_dataset)
print(dataset)
print(non_english_dataset)
[['Instagram'], ['Docs To Go™ Free Office Suite'], ['Instachat 😜']] [['爱奇艺PPS -《欢乐颂2》电视剧热播']]
***Figure this out later Bruce***
non_english_android = []
dataset = android_clean
common_english_character(dataset, non_english_android)
print(len(non_english_android))
print(non_english_android[:5])
print(len(android_clean))
print(len(dataset))
61 [['Truyện Vui Tý Quậy', 'COMICS', '4.5', '144', '4.7M', '10,000+', 'Free', '0', 'Everyone', 'Comics', 'July 19, 2018', '3.0', '4.0.3 and up'], ['Flame - درب عقلك يوميا', 'EDUCATION', '4.6', '56065', '37M', '1,000,000+', 'Free', '0', 'Everyone', 'Education', 'July 26, 2018', '3.3', '4.1 and up'], ['At home - rental · real estate · room finding application such as apartment · apartment', 'HOUSE_AND_HOME', '3.8', '2496', 'Varies with device', '500,000+', 'Free', '0', 'Everyone', 'House & Home', 'July 5, 2018', 'Varies with device', 'Varies with device'], ['乐屋网: Buying a house, selling a house, renting a house', 'HOUSE_AND_HOME', '3.7', '2248', '15M', '100,000+', 'Free', '0', 'Everyone', 'House & Home', 'August 3, 2018', 'v3.1.1', '4.0 and up'], ['သိင်္ Astrology - Min Thein Kha BayDin', 'LIFESTYLE', '4.7', '2225', '15M', '100,000+', 'Free', '0', 'Everyone', 'Lifestyle', 'July 26, 2018', '4.2.1', '4.0.3 and up']] 9598 9598
non_english_ios = []
def cec_ios(dataset1, dataset2):
for row in dataset:
c_count = 0
app_name = row[1]
for character in app_name:
if ord(character) > 127:
c_count += 1
if c_count >= 3:
dataset1.remove(row)
dataset2.append(row)
cec_ios(ios_data[1:], non_english_ios)
print(len(ios_data))
print(len(non_english_ios))
7198 0
non_english_ios = []
dataset = ios_data[1:]
cec_ios(dataset, non_english_ios)
print(len(ios_data))
print(len(dataset))
print(len(non_english_ios))
7198 6487 710
extra = []
cec_ios(ios_data, extra)
print(len(ios_data))
6866