#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from datetime import date as dt
import re


# In[2]:


## Import the Wikipedia dataset and view it
wikipedia_pivoted = pd.read_excel('wikipedia_dataset.xlsx')
wikipedia_pivoted.head()


# In[3]:


## Convert the dataset from wide format to long format (unpivot)
wikipedia_unpivot = wikipedia_pivoted.melt(id_vars='Page', var_name='Date', value_name='Visits')
wikipedia_unpivot


# In[4]:


wikipedia_unpivot.isnull().sum()


# In[5]:


## Fill missing values with 0 and validate the result
wiki = wikipedia_unpivot.fillna(0)
wiki.isnull().sum()


# In[6]:


## Add a weekday and a month column
## Ensure the Date column is a proper datetime before using the .dt accessor
wiki['Date'] = pd.to_datetime(wiki['Date'])
wiki['Weekday'] = wiki['Date'].dt.day_name()
wiki['Month'] = wiki['Date'].dt.month_name()
wiki


# In[7]:


## Extract the language codes from the Page strings
wiki['Language_Codes'] = wiki['Page'].str.extract(r"(_[a-z]{2}\.)")
wiki


# In[9]:


## Replace language codes with language names
wiki['Language_Names'] = wiki['Language_Codes'].replace(
    ['_de.', '_es.', '_en.', '_fr.', '_ru.', '_ja.', '_zh.'],
    ['German', 'Spanish', 'English', 'French', 'Russian', 'Japanese', 'Chinese'])
wiki


# In[10]:


## Split the Page column into a Title and a Device column
## (the Page column contains the title searched for and the device used)
wiki[['Title', 'Device']] = wiki['Page'].str.split('.org_', expand=True)
wiki
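

# In[ ]:


## Illustrative only: a quick check of the parsing steps above on a single
## made-up page string, assumed to follow the dataset's
## "Title_lang.project.org_access_agent" pattern.
sample_page = pd.Series(['Pandas_en.wikipedia.org_all-access_all-agents'])
print(sample_page.str.extract(r"(_[a-z]{2}\.)"))    # expected: '_en.'
print(sample_page.str.split('.org_', expand=True))  # expected: 'Pandas_en.wikipedia' | 'all-access_all-agents'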


# In[11]:


wiki.isna().sum()


# In[13]:


## Group by device
wiki.groupby('Device').size()


# In[14]:


## Replace the Device column values with well-defined strings
wiki['Device_Type'] = wiki['Device'].replace(
    ['all-access_all-agents', 'all-access_spider', 'mobile-web_all-agents', 'desktop_all-agents'],
    ['All Access', 'All Access', 'Mobile', 'Desktop'])
wiki


# In[15]:


## Since we're working with Wikipedia pages only, filter those into their own variable
only_wikipedia = wiki[wiki['Page'].str.contains('wikipedia.org', regex=False)]
only_wikipedia


# In[16]:


## Create a new index with an ordered sequence
wikipedia_all = only_wikipedia.reset_index()
wikipedia_all


# In[18]:


## Name the index
wikipedia_all.index.name = 'Row'
wikipedia_all


# In[19]:


## This confirms that the rows whose language codes didn't populate during extraction
## were not Wikipedia pages
print(549000 - 542778)


# In[24]:


## Total visits for each language represented
Languages = wikipedia_all.groupby(['Language_Names'])['Visits'].sum()
Languages


# In[25]:


## The most and least popular days of the week for visiting Wikipedia
Day = wikipedia_all.groupby(['Weekday'])['Visits'].sum().sort_values(ascending=False)
Day


# In[26]:


## The device types used most frequently to visit Wikipedia
Device = wikipedia_all.groupby(['Device_Type'])['Visits'].sum()
Device


# In[27]:


## Create a variable for Wikipedia pages written in English only
En_wikipedia = wikipedia_all.query("Language_Names == 'English'")
En_wikipedia


# In[29]:


## Trending search topics on January 1, 2016
## First filter on New Year's Day, then group by title to get the sum of visits per group
new_year_visits = En_wikipedia.query("Date == '2016-01-01'").groupby(['Title'])['Visits'].sum().sort_values(ascending=False)
new_year_visits


# In[30]:


## Trending search topics on November 8, 2016
## First filter on 8th November, then group by title to get the sum of visits per group
November8 = En_wikipedia.query("Date == '2016-11-08'").groupby(['Title'])['Visits'].sum().sort_values(ascending=False)
November8


# In[32]:


## Export the tables to be visualized
Day.to_csv('Day.csv')
Device.to_csv('Device.csv')
Languages.to_csv('Languages.csv')
new_year_visits.to_csv('new_year_visits.csv')
November8.to_csv('November8.csv')
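

# In[ ]:


## Optional sanity check (illustrative, not part of the original analysis):
## read one exported table back and reorder the weekdays into calendar order,
## which is usually more convenient for plotting than the value-sorted export.
day_check = pd.read_csv('Day.csv', index_col='Weekday')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
day_check.reindex(weekday_order)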