#!/usr/bin/env python
# coding: utf-8

# We'll be looking for patterns in Jeopardy questions in order to help us win!

# In[1]:


import pandas as pd
import csv

# Read the Jeopardy dataset into a DataFrame.
# The path is relative, so 'jeopardy.csv' is looked up in the current working directory.
jeopardy = pd.read_csv('jeopardy.csv')


# In[2]:


# Explore the file: show the (rows, columns) shape first, then the first
# five rows, each on its own line.
print(jeopardy.shape, jeopardy.head(5), sep='\n')


# In[3]:


# Column names. As a bare expression this only produces output when run as a
# notebook cell; it prints nothing when the file is executed as a script.
jeopardy.columns


# Some of the column names have a leading space; we will remove it.

# In[4]:


# Map each space-prefixed header to its clean form and rename in place.
jeopardy.rename(
    columns={
        ' ' + name: name
        for name in ('Air Date', 'Round', 'Category', 'Value', 'Question', 'Answer')
    },
    inplace=True,
)
# Bare expression: shows the renamed columns when run as a notebook cell.
jeopardy.columns


# Write a function that normalizes a string.

# In[5]:


def normalize_string(string):
    """Return *string* lowercased, without punctuation, with whitespace collapsed.

    Args:
        string: The text to normalize.

    Returns:
        The lowercased text with every character that is neither a word
        character nor whitespace removed, and each run of whitespace
        replaced by a single space.
    """
    import re

    string = string.lower()
    # Drop punctuation: anything that is not a word character or whitespace.
    string = re.sub(r'[^\w\s]', '', string)
    # Collapse each run of whitespace into one space. The pattern must be
    # `\s+`; the character class `[\s+]` matches a single whitespace
    # character (or a literal '+') and therefore never collapses runs.
    string = re.sub(r'\s+', ' ', string)
    return string


# Normalize the Question and Answer columns.

# In[6]:


# Build a normalized copy of each text column, leaving the originals untouched.
for raw_col, clean_col in (('Question', 'clean_question'), ('Answer', 'clean_answer')):
    jeopardy[clean_col] = jeopardy[raw_col].apply(normalize_string)


# In[7]:


def normalize_num(num):
    """Convert a value entry such as '$2,000' to an int.

    Args:
        num: The raw value. Strings have every character that is neither a
            word character nor whitespace removed before conversion; any
            other object is passed to int() as is.

    Returns:
        The integer value, or 0 when the input cannot be converted (for
        example 'None', an empty string, None or NaN).
    """
    import re

    # Only strings can be passed to re.sub; missing entries may arrive as
    # None or a float NaN and must not raise here.
    if isinstance(num, str):
        num = re.sub(r'[^\w\s]', '', num)
    try:
        return int(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects; ValueError:
        # non-numeric text or NaN; OverflowError: infinite floats.
        return 0


# Normalize the value column.

# In[8]:


# Run normalize_num over every 'Value' entry and store the results in a new 'clean_value' column.
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_num)


# In[9]:


# Preview the first three rows. As a bare expression this is displayed only
# when run as a notebook cell.
jeopardy.head(3)


# In order to work with Air Date, which has date info, we will convert it to a datetime column.

# In[10]:


# Overwrite the 'Air Date' column with the result of pd.to_datetime applied to it.
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])


# In[11]:


# Print the first five 'Air Date' values to check the column.
print(jeopardy['Air Date'].head(5))


# In[12]:


# Preview the first five rows. As a bare expression this is displayed only
# when run as a notebook cell.
jeopardy.head(5)


# In order to study past questions, we will write a function that measures what fraction of the words in the answer also occur in the question.

# In[13]:


def match_in_ans_and_quests(row):
    """Return the fraction of answer words that also appear in the question.

    Args:
        row: A mapping (for example a DataFrame row) holding string values
            under the 'clean_answer' and 'clean_question' keys.

    Returns:
        The number of answer words found in the question divided by the
        number of answer words, after dropping the first 'the' from the
        answer. Returns 0 when no answer words remain.
    """
    split_answer = row['clean_answer'].split()
    # A set gives constant-time membership tests for each answer word.
    question_words = set(row['clean_question'].split())
    # list.remove drops only the first occurrence of 'the'.
    if 'the' in split_answer:
        split_answer.remove('the')
    if not split_answer:
        return 0
    # Count over all answer words before dividing; returning from inside
    # the loop would score only the first word.
    match_count = sum(1 for item in split_answer if item in question_words)
    return match_count / len(split_answer)


# Below we compute, for each row, the share of terms in clean_answer that are also found in clean_question, and then print the mean across all rows.

# In[14]:


# Score every row with match_in_ans_and_quests, keep the scores as a new
# column, then report their average.
answer_overlap = jeopardy.apply(match_in_ans_and_quests, axis=1)
jeopardy['answer_in_question'] = answer_overlap
ans_in_quest_mean = answer_overlap.mean()
print(ans_in_quest_mean)


# In[15]:


# Column dtypes. As a bare expression this is displayed only when run as a
# notebook cell.
jeopardy.dtypes


# In[ ]: