We'll be looking for patterns in Jeopardy questions in order to help us win!
import pandas as pd
import csv
# Read the Jeopardy dataset from 'jeopardy.csv' (path relative to the working
# directory) into a DataFrame.
jeopardy = pd.read_csv('jeopardy.csv')
# Explore the file: print its (rows, columns) shape and its first five rows.
print(jeopardy.shape)
print(jeopardy.head(5))
# Column names. This is a bare expression, so it only produces output where
# the environment echoes expression values (presumably a notebook cell).
jeopardy.columns
Some of the column names have leading spaces; we will remove these.
# Strip surrounding whitespace from every column label in place. Passing
# str.strip as the mapper covers all columns, rather than only the ones that
# a hand-written {' Name': 'Name'} dictionary happens to list.
jeopardy.rename(columns=str.strip, inplace=True)
# Bare expression: shows the cleaned labels where expression values are echoed.
jeopardy.columns
Write a function that normalizes a string.
def normalize_string(string):
    """Return *string* lowercased, without punctuation, with whitespace collapsed.

    Args:
        string: The text to normalize.

    Returns:
        The lowercased text with every character that is neither a word
        character nor whitespace removed, and with each run of whitespace
        replaced by a single space.
    """
    import re

    string = string.lower()
    string = re.sub(r'[^\w\s]', '', string)
    # Use `\s+`, not the character class `[\s+]`: the class matches a single
    # whitespace character (or a literal '+'), so runs were never collapsed.
    string = re.sub(r'\s+', ' ', string)
    return string
Normalize the Question and Answer columns.
# Store normalized copies of the 'Question' and 'Answer' text in new columns,
# leaving the original columns untouched.
jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_string)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_string)
def normalize_num(num):
    """Convert a value such as ``"$2,000"`` to an integer.

    Args:
        num: Usually a string; punctuation is stripped before conversion.
            Non-string values (for example a float NaN standing in for a
            missing entry) are converted directly.

    Returns:
        The integer value, or 0 when the input cannot be read as an integer.
    """
    import re

    if isinstance(num, str):
        # Drop everything that is neither a word character nor whitespace,
        # e.g. the '$' and ',' in "$2,000".
        num = re.sub(r'[^\w\s]', '', num)
    try:
        return int(num)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, None, NaN, or infinity: fall back to 0.
        return 0
Normalize the value column.
# Store the integer form of each 'Value' entry in a new 'clean_value' column.
jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_num)
# Bare expression: shows the first three rows where expression values are echoed.
jeopardy.head(3)
In order to work with Air Date, which has date info, we will convert it to a datetime column.
# Parse 'Air Date' into datetimes, overwriting the original column.
jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])
# Print the first five converted dates to check the result.
print(jeopardy['Air Date'].head(5))
# Bare expression: shows the first five rows where expression values are echoed.
jeopardy.head(5)
In order to study past questions, we will write a function that computes, for each row, the fraction of words in the answer (ignoring "the") that also occur in the question.
def match_in_ans_and_quests(row):
    """Return the fraction of answer words that also appear in the question.

    Args:
        row: A mapping (such as a DataFrame row) holding whitespace-separable
            strings under the keys 'clean_answer' and 'clean_question'.

    Returns:
        The number of answer words found in the question divided by the
        number of answer words, with every "the" excluded from the answer.
        Returns 0 when no answer words remain.
    """
    # Filter out every "the"; list.remove() would drop only the first one and
    # leave later occurrences to match the question trivially.
    answer_words = [word for word in row['clean_answer'].split() if word != 'the']
    if not answer_words:
        # Avoid dividing by zero for empty or "the"-only answers.
        return 0
    question_words = set(row['clean_question'].split())
    match_count = sum(word in question_words for word in answer_words)
    return match_count / len(answer_words)
Below we compute, for every row, the fraction of clean_answer terms that are also found in clean_question, and then take the mean of that fraction across all rows.
# Apply match_in_ans_and_quests to each row (axis = 1) and store the result
# in a new 'answer_in_question' column.
jeopardy['answer_in_question'] = jeopardy.apply(match_in_ans_and_quests, axis = 1)
# Average the per-row values over the whole dataset and print the mean.
ans_in_quest_mean = jeopardy['answer_in_question'].mean()
print(ans_in_quest_mean)
# Bare expression: shows each column's dtype where expression values are echoed.
jeopardy.dtypes