#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
from datetime import date as dt
import re


# In[2]:


## Import the Wikipedia dataset and view it
wikipedia_pivoted = pd.read_excel('wikipedia_dataset.xlsx')
wikipedia_pivoted.head()


# In[3]:


## Convert the dataset from wide format to long format (unpivot)
wikipedia_unpivot = wikipedia_pivoted.melt(id_vars='Page', var_name='Date', value_name='Visits')
wikipedia_unpivot


# In[4]:


wikipedia_unpivot.isnull().sum()


# In[5]:


## Fill missing values with 0 and validate the result
wiki = wikipedia_unpivot.fillna(0)
wiki.isnull().sum()


# In[6]:


## Add a weekday and a month column
## Ensure the Date column is a proper datetime before using the .dt accessor
wiki['Date'] = pd.to_datetime(wiki['Date'])
wiki['Weekday'] = wiki['Date'].dt.day_name()
wiki['Month'] = wiki['Date'].dt.month_name()
wiki


# In[7]:


## Extract the language codes from the Page strings
wiki['Language_Codes'] = wiki['Page'].str.extract(r"(_[a-z]{2}\.)")
wiki


# In[9]:


## Replace language codes with language names
wiki['Language_Names'] = wiki['Language_Codes'].replace(
    ['_de.', '_es.', '_en.', '_fr.', '_ru.', '_ja.', '_zh.'],
    ['German', 'Spanish', 'English', 'French', 'Russian', 'Japanese', 'Chinese'])
wiki


# In[10]:


## Split the Page column into a Title and a Device column
## (the Page column contains the title searched for and the device used)
wiki[['Title', 'Device']] = wiki['Page'].str.split('.org_', expand=True)
wiki
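

# In[ ]:


## Illustrative only: a quick check of the parsing steps above on a single
## made-up page string, assumed to follow the dataset's
## "Title_lang.project.org_access_agent" pattern.
sample_page = pd.Series(['Pandas_en.wikipedia.org_all-access_all-agents'])
print(sample_page.str.extract(r"(_[a-z]{2}\.)"))    # expected: '_en.'
print(sample_page.str.split('.org_', expand=True))  # expected: 'Pandas_en.wikipedia' | 'all-access_all-agents'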


# In[11]:


wiki.isna().sum()


# In[13]:


## Group by device
wiki.groupby('Device').size()


# In[14]:


## Replace the Device column values with well-defined strings
wiki['Device_Type'] = wiki['Device'].replace(
    ['all-access_all-agents', 'all-access_spider', 'mobile-web_all-agents', 'desktop_all-agents'],
    ['All Access', 'All Access', 'Mobile', 'Desktop'])
wiki


# In[15]:


## Since we're working with Wikipedia pages only, filter those into their own variable
only_wikipedia = wiki[wiki['Page'].str.contains('wikipedia.org', regex=False)]
only_wikipedia


# In[16]:


## Create a new index with an ordered sequence
wikipedia_all = only_wikipedia.reset_index()
wikipedia_all


# In[18]:


## Name the index
wikipedia_all.index.name = 'Row'
wikipedia_all


# In[19]:


## This confirms that the rows whose language codes didn't populate during extraction
## were not Wikipedia pages
print(549000 - 542778)


# In[24]:


## Total visits for each language represented
Languages = wikipedia_all.groupby(['Language_Names'])['Visits'].sum()
Languages


# In[25]:


## The most and least popular days of the week for visiting Wikipedia
Day = wikipedia_all.groupby(['Weekday'])['Visits'].sum().sort_values(ascending=False)
Day


# In[26]:


## The device types used most frequently to visit Wikipedia
Device = wikipedia_all.groupby(['Device_Type'])['Visits'].sum()
Device


# In[27]:


## Create a variable for Wikipedia pages written in English only
En_wikipedia = wikipedia_all.query("Language_Names == 'English'")
En_wikipedia


# In[29]:


## Trending search topics on January 1, 2016
## First filter on New Year's Day, then group by title to get the sum of visits per group
new_year_visits = En_wikipedia.query("Date == '2016-01-01'").groupby(['Title'])['Visits'].sum().sort_values(ascending=False)
new_year_visits


# In[30]:


## Trending search topics on November 8, 2016
## First filter on 8th November, then group by title to get the sum of visits per group
November8 = En_wikipedia.query("Date == '2016-11-08'").groupby(['Title'])['Visits'].sum().sort_values(ascending=False)
November8


# In[32]:


## Export the tables to be visualized
Day.to_csv('Day.csv')
Device.to_csv('Device.csv')
Languages.to_csv('Languages.csv')
new_year_visits.to_csv('new_year_visits.csv')
November8.to_csv('November8.csv')
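

# In[ ]:


## Optional sanity check (illustrative, not part of the original analysis):
## read one exported table back and reorder the weekdays into calendar order,
## which is usually more convenient for plotting than the value-sorted export.
day_check = pd.read_csv('Day.csv', index_col='Weekday')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
day_check.reindex(weekday_order)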