# Load each CSV dataset with csv.reader and store it as a list of rows (each
# row is a list of strings), keeping the header row separate from the data.
from csv import reader

# The with-blocks close the file handles as soon as the rows have been read.
# encoding='utf8' makes decoding independent of the platform default, and
# newline='' is what the csv module asks for when it is handed a file object.
with open('/content/drive/My Drive/Datasets/googleplaystore.csv', encoding='utf8', newline='') as opened_file_google:
  google_data = list(reader(opened_file_google))
google_data_header = google_data[0]
google_data = google_data[1:]

with open('/content/drive/My Drive/Datasets/AppleStore.csv', encoding='utf8', newline='') as opened_file_apple:
  apple_data = list(reader(opened_file_apple))
apple_data_header = apple_data[0]
apple_data = apple_data[1:]


def explore_data(dataset, start, end, rows_and_columns=False):
  '''Print a slice of a dataset and, optionally, its dimensions.

  dataset is expected to be a list of lists (one inner list per row).
  The rows dataset[start:end] are printed, each followed by blank lines.
  If rows_and_columns is True, the number of rows in dataset (len(dataset))
  and the number of columns (length of the first row) are printed as well.
  An empty dataset is reported as having 0 columns instead of raising
  IndexError.
  '''
  for row in dataset[start:end]:
    print(row)
    print('\n')  # visually separates consecutive rows with blank lines

  if rows_and_columns:
    print('Number of rows:', len(dataset))
    # dataset[0] does not exist for an empty dataset, so guard the lookup
    print('Number of columns:', len(dataset[0]) if dataset else 0)

# Preview both datasets: header row, then the first three data rows and the
# dataset dimensions. The second preview is set apart by extra blank lines.
datasets_to_preview = (
  (google_data_header, google_data),
  (apple_data_header, apple_data),
)
for position, (header, rows) in enumerate(datasets_to_preview):
  if position:
    print('\n'*3)
  print(header)
  print('\n')
  explore_data(rows, 0, 3, True)

# Remove the row reported as erroneous in the discussion forum of the dataset
# documentation. The dataset length is printed before and after the removal,
# and the row itself is printed before it is deleted.
faulty_row_index = 10472
print(len(google_data))
faulty_row = google_data[faulty_row_index]
print(faulty_row)
del google_data[faulty_row_index]
print(len(google_data))


# Print every row of google_data whose first column matches one example app
# name, together with the position of that row in the dataset.
name = 'Coloring book moana'

# enumerate() gives the true position of each matching row. list.index()
# rescans the list and returns the first *equal* row, so rows that are exact
# duplicates of each other would all be reported at the same position.
for row_index, app in enumerate(google_data):
  if app[0] == name:
    print(app)
    print(row_index)

# Split the app names (first column) into those seen for the first time
# (unique_entries) and repeat occurrences (duplicate_entries).
duplicate_entries = []
unique_entries = []
seen_names = set()  # same names as unique_entries, for constant-time lookups

for app in google_data:
  name = app[0]
  if name in seen_names:
    duplicate_entries.append(name)
  else:
    seen_names.add(name)
    unique_entries.append(name)

print('Number of duplicate apps: ', len(duplicate_entries))
print('Examples of duplicate apps: ', duplicate_entries[:10])

# Build reviews_max, a dictionary mapping each app name to the highest value
# found in column 3 (converted to float) across all rows with that name.
# A name is stored the first time it is seen and overwritten only when a later
# row carries a larger value.
reviews_max = {}
for app in google_data:
  name = app[0]
  n_reviews = float(app[3])
  if name not in reviews_max or n_reviews > reviews_max[name]:
    reviews_max[name] = n_reviews

# Print the sizes of the containers built so far as a check on the loops
n_rows_without_duplicates = len(google_data) - len(duplicate_entries)
print('Length of google_data minus length of duplicate entries: ', n_rows_without_duplicates)
print('Length of unique_entries: ', len(unique_entries))
print('Length of reviews_max: ', len(reviews_max))

                    
# Build google_cleaned, the dataset with a single row per app name.
# For every row, the app is kept when its number of reviews equals the maximum
# recorded for that name in reviews_max AND the name has not been kept already.
# Note: some apps have several rows with the same (maximum) number of reviews,
# so tracking the names already added is what prevents those rows from being
# added more than once.

google_cleaned = []
already_added = []
added_names = set()  # same names as already_added, for constant-time lookups

for app in google_data:
  name = app[0]
  n_reviews = float(app[3])
  # logical `and` (not bitwise `&`) is the boolean operator and short-circuits
  if n_reviews == reviews_max[name] and name not in added_names:
    google_cleaned.append(app)
    already_added.append(name)
    added_names.add(name)

# explore the cleaned dataset
explore_data(google_cleaned, 0, 3, True)


def is_english(s):
  '''Return True when every character of s has an ord() value in the range
  0 to 127, and False when at least one character lies outside that range.
  An empty string yields True.'''
  return all(ord(character) <= 127 for character in s)

# Try the is_english function on a handful of sample app names
sample_names = [
  'Instagram',
  '爱奇艺PPS -《欢乐颂2》电视剧热播',
  'Docs To Go™ Free Office Suite',
  'Instachat 😜',
]
for sample_name in sample_names:
  print(is_english(sample_name))

def is_english(s):
  '''Return True when s contains at most 3 characters whose ord() value is
  outside the range 0 to 127, and False when it contains 4 or more such
  characters.'''
  non_ascii_count = sum(1 for character in s if ord(character) > 127)
  return non_ascii_count < 4

# Run the modified is_english function on the same sample app names
for sample_name in ('Instagram',
                    '爱奇艺PPS -《欢乐颂2》电视剧热播',
                    'Docs To Go™ Free Office Suite',
                    'Instachat 😜'):
  is_english_result = is_english(sample_name)
  print(is_english_result)

# Keep only the apps whose name passes is_english.
# The name is in the first column (index 0) of the Google dataset and in the
# second column (index 1) of the Apple dataset.
english_google_cleaned = [app for app in google_cleaned if is_english(app[0])]
english_apple_data = [app for app in apple_data if is_english(app[1])]

explore_data(english_google_cleaned, 0, 3, True)
print('\n')
explore_data(english_apple_data, 0, 3, True)

# Keep only the free apps from each English-only dataset.
# Google rows are kept when column 6 is 'Free' or column 7 is '0'; the logical
# `or` replaces the bitwise `|`, which is not the boolean operator.
free_english_google_cleaned = [
  app for app in english_google_cleaned
  if app[6] == 'Free' or app[7] == '0'
]

# Apple rows are kept when column 4 is exactly the string '0.0'
free_english_apple_data = [
  app for app in english_apple_data
  if app[4] == '0.0'
]

explore_data(free_english_google_cleaned, 0, 3, True)
print('\n')
explore_data(free_english_apple_data, 0, 3, True)

# Show both header rows again, separated by blank lines, as a column reference
print(google_data_header, '\n', apple_data_header, sep='\n')

def freq_table(dataset, index):
  '''Build the relative frequency table of one column of a dataset.

  dataset is expected to be a list of lists and index an integer column
  position. Returns a dictionary mapping each distinct value found in that
  column to the percentage of rows holding it; keys appear in the order the
  values are first encountered.
  '''
  counts = {}
  for row in dataset:
    value = row[index]
    counts[value] = counts.get(value, 0) + 1

  n_rows = len(dataset)
  return {value: (count / n_rows) * 100 for value, count in counts.items()}

# Relative frequency table (percentages) of column 1 of the free English Google apps
google_column_1_percentages = freq_table(free_english_google_cleaned, 1)
print(google_column_1_percentages)


def display_table(dataset, index):
  '''Print the frequency table of one column, highest percentage first.

  dataset is expected to be a list of lists and index an integer.
  The table comes from freq_table(); its entries are turned into
  (percentage, value) tuples and sorted in descending order with sorted(),
  then printed one per line as "value : percentage".
  Does not return anything.
  '''
  percentages = freq_table(dataset, index)
  ranked = sorted(((percentages[value], value) for value in percentages), reverse=True)
  for percentage, value in ranked:
    print(value, ':', percentage)


# Display the frequency table of column 11 of free_english_apple_data
display_table(free_english_apple_data, 11)

# Display two frequency tables for free_english_google_cleaned, each under its
# own title, with blank lines between the two sections
google_tables = (
  ('Categories frequency table: ', 1),
  ('Genres frequency table: ', 9),
)
for position, (title, column) in enumerate(google_tables):
  if position:
    print('\n')
  print(title)
  print('\n')
  display_table(free_english_google_cleaned, column)

prime_genre_table = freq_table(free_english_apple_data, 11)

# Mean of column 5 (the per-app number of ratings) for every genre (column 11).
# A single pass over the apps accumulates the per-genre sum and app count,
# instead of rescanning the whole dataset once for every genre.
genre_ratings_sum = {}
genre_apps_count = {}
for app in free_english_apple_data:
  genre_app = app[11]
  genre_ratings_sum[genre_app] = genre_ratings_sum.get(genre_app, 0) + float(app[5])
  genre_apps_count[genre_app] = genre_apps_count.get(genre_app, 0) + 1

# Iterating prime_genre_table keeps the genres in the same order as before
genre_dict = {
  genre: genre_ratings_sum[genre] / genre_apps_count[genre]
  for genre in prime_genre_table
}

# Print the genres from the highest to the lowest mean number of ratings
genre_list = [(mean_num_ratings, genre) for genre, mean_num_ratings in genre_dict.items()]

genre_list_sorted = sorted(genre_list, reverse=True)
for entry in genre_list_sorted:
  print(entry[1], ':', entry[0])


# List "name : column 5" for the apps of a few selected genres.
# A single empty line is printed between the first two and the last two genres.
for selected_genre in ('Navigation', 'Reference'):
  for app in free_english_apple_data:
    if app[11] == selected_genre:
      print(app[1], ':', app[5])
print()
for selected_genre in ('Book', 'Social Networking'):
  for app in free_english_apple_data:
    if app[11] == selected_genre:
      print(app[1], ':', app[5])

# Recompute the mean number of ratings (column 5) per genre, this time leaving
# out of each genre's average every app that alone accounts for at least 20%
# of the genre's total number of ratings.
genre_dict = {}

for genre in prime_genre_table:
  # number of ratings of every app of this genre, in dataset order
  genre_ratings = [float(app[5]) for app in free_english_apple_data if app[11] == genre]

  total_num_ratings = 0
  for app_num_ratings in genre_ratings:
    total_num_ratings += app_num_ratings

  # drop the dominant apps from the total and count the apps that remain
  new_total_num_ratings = total_num_ratings
  len_genre = 0
  for app_num_ratings in genre_ratings:
    if app_num_ratings >= 0.2*total_num_ratings:
      new_total_num_ratings -= app_num_ratings
    else:
      len_genre += 1

  if len_genre:
    mean_num_ratings = new_total_num_ratings / len_genre
  elif genre_ratings:
    # Every app met the threshold (e.g. a genre with a single app), so no app
    # is left to average; fall back to the plain mean rather than dividing by 0.
    mean_num_ratings = total_num_ratings / len(genre_ratings)
  else:
    mean_num_ratings = 0.0
  genre_dict[genre] = mean_num_ratings


# Print the genres from the highest to the lowest adjusted mean
genre_list = [(mean_num_ratings, genre) for genre, mean_num_ratings in genre_dict.items()]

genre_list_sorted = sorted(genre_list, reverse=True)
for entry in genre_list_sorted:
  print(entry[1], ':', entry[0])

display_table(free_english_google_cleaned, 5) #'Installs' column is column indexed 5

categories_table = freq_table(free_english_google_cleaned, 1)

# Mean number of installs for every category (column 1).
# Install counts are strings such as '1,000,000+', so '+' and ',' are removed
# before converting to float. A single pass over the apps accumulates the
# per-category sum and app count, so each install string is parsed only once.
category_installs_sum = {}
category_apps_count = {}
for app in free_english_google_cleaned:
  category_app = app[1]
  n_installs = float(app[5].replace('+', '').replace(',', ''))
  category_installs_sum[category_app] = category_installs_sum.get(category_app, 0) + n_installs
  category_apps_count[category_app] = category_apps_count.get(category_app, 0) + 1

# Iterating categories_table keeps the categories in the same order as before
category_dict = {
  category: category_installs_sum[category] / category_apps_count[category]
  for category in categories_table
}

# Print the categories from the highest to the lowest mean number of installs
category_list = [(mean_installs, category) for category, mean_installs in category_dict.items()]

category_list_sorted = sorted(category_list, reverse=True)
for entry in category_list_sorted:
  print(entry[1], ':', entry[0])

# Install-count values (column 5) that select the apps listed below
top_install_buckets = ('1,000,000,000+', '500,000,000+', '100,000,000+')

# COMMUNICATION apps in the top install buckets.
# Logical `and` / a membership test replace the bitwise `&` / `|` operators.
for app in free_english_google_cleaned:
  if app[1] == 'COMMUNICATION' and app[5] in top_install_buckets:
    print(app[0], ':', app[5])

# The same listing for two more categories, each under its own heading
for category in ['VIDEO_PLAYERS', 'SOCIAL']:
  print(category)
  for app in free_english_google_cleaned:
    if app[1] == category and app[5] in top_install_buckets:
      print(app[0], ':', app[5])
  print()

# Recompute the mean number of installs per category, this time leaving out of
# each category's average every app with at least 100000000 installs.
category_dict = {}

for category in categories_table:
  # installs of every app of this category, in dataset order
  # ('+' and ',' are removed from the install string before converting to float)
  category_installs = [
    float(app[5].replace('+', '').replace(',', ''))
    for app in free_english_google_cleaned
    if app[1] == category
  ]

  total_installs = 0
  for n_installs in category_installs:
    total_installs += n_installs

  # drop the very large apps from the total and count the apps that remain
  new_total_installs = total_installs
  len_category = 0
  for n_installs in category_installs:
    if n_installs >= 100000000:
      new_total_installs -= n_installs
    else:
      len_category += 1

  if len_category:
    mean_installs = new_total_installs / len_category
  elif category_installs:
    # Every app of the category reached the threshold, so no app is left to
    # average; fall back to the plain mean rather than dividing by 0.
    mean_installs = total_installs / len(category_installs)
  else:
    mean_installs = 0.0
  category_dict[category] = mean_installs

# Print the categories from the highest to the lowest adjusted mean
category_list = [(mean_installs, category) for category, mean_installs in category_dict.items()]

category_list_sorted = sorted(category_list, reverse=True)
for entry in category_list_sorted:
  print(entry[1], ':', entry[0])

# Install-count values (column 5) used to select apps in the listings below.
# Logical `and` / membership tests are used throughout instead of the bitwise
# `&` / `|` operators.
top_install_buckets = ('1,000,000,000+', '500,000,000+', '100,000,000+')
mid_install_buckets = ('1,000,000+', '5,000,000+', '10,000,000+', '50,000,000+')

# PHOTOGRAPHY apps in the top install buckets
for app in free_english_google_cleaned:
  if app[1] == 'PHOTOGRAPHY' and app[5] in top_install_buckets:
    print(app[0], ':', app[5])
print('\n'*2)
# every PHOTOGRAPHY app
for app in free_english_google_cleaned:
  if app[1] == 'PHOTOGRAPHY':
    print(app[0], ':', app[5])

# PHOTOGRAPHY apps in the mid-range install buckets
for app in free_english_google_cleaned:
  if app[1] == 'PHOTOGRAPHY' and app[5] in mid_install_buckets:
    print(app[0], ':', app[5])

# COMMUNICATION apps in the top install buckets
for app in free_english_google_cleaned:
  if app[1] == 'COMMUNICATION' and app[5] in top_install_buckets:
    print(app[0], ':', app[5])

print('\n')

# SOCIAL apps in the mid-range install buckets
for app in free_english_google_cleaned:
  if app[1] == 'SOCIAL' and app[5] in mid_install_buckets:
    print(app[0], ':', app[5])