#!/usr/bin/env python
# coding: utf-8

# We'll be looking for patterns in Jeopardy questions in order to help us win!

# In[1]:

import csv  # not used in this part of the script; kept from the original import list
import re

import pandas as pd

# Compiled once at import time: both normalizers below are applied once per row.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Read in the file
jeopardy = pd.read_csv('jeopardy.csv')


# In[2]:

# Explore the file
print(jeopardy.shape)
print(jeopardy.head(5))


# In[3]:

# Column names (notebook display cell: a bare expression prints nothing in a script)
jeopardy.columns


# Some of the column names have spaces before the names, we will remove these.

# In[4]:

# Strip surrounding whitespace from every column label instead of listing
# each affected column by hand.
jeopardy.rename(columns=str.strip, inplace=True)
jeopardy.columns


# Write a function that normalizes a string.

# In[5]:

def normalize_string(string):
    """Return *string* lowercased, without punctuation, with whitespace collapsed.

    Every character that is neither a word character nor whitespace is
    removed, then each run of whitespace is replaced by a single space.
    Leading and trailing whitespace is not stripped.

    Missing values (anything ``pd.isna`` reports as missing) yield ``''``;
    any other non-string value is converted with ``str`` first.
    """
    if not isinstance(string, str):
        # Empty cells come back from read_csv as NaN, which has no .lower().
        string = '' if pd.isna(string) else str(string)
    string = _PUNCTUATION_RE.sub('', string.lower())
    # r'\s+' collapses whole runs of whitespace. The previous r'[\s+]' was a
    # character class, so it only swapped single characters one at a time.
    return _WHITESPACE_RE.sub(' ', string)


# Normalize the Question and Answer columns.

# In[6]:

jeopardy['clean_question'] = jeopardy['Question'].apply(normalize_string)
jeopardy['clean_answer'] = jeopardy['Answer'].apply(normalize_string)


# In[7]:

def normalize_num(num):
    """Return *num* as an ``int`` with punctuation removed, or 0 if that fails.

    For a string, every character that is neither a word character nor
    whitespace is removed before conversion. Anything that still cannot be
    converted to an integer, including missing values, yields 0.
    """
    if isinstance(num, str):
        num = _PUNCTUATION_RE.sub('', num)
    try:
        return int(num)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, None/NaN, or infinity: treat as "no value".
        return 0


# Normalize the value column.

# In[8]:

jeopardy['clean_value'] = jeopardy['Value'].apply(normalize_num)


# In[9]:

jeopardy.head(3)


# In order to work with Air Date, which has date info, we will convert it to a datetime column.

# In[10]:

jeopardy['Air Date'] = pd.to_datetime(jeopardy['Air Date'])


# In[11]:

print(jeopardy['Air Date'].head(5))


# In[12]:

jeopardy.head(5)


# In order to study past questions, we will write a function that counts the number of times words occur in both the answer and the question.
# In[13]:

def match_in_ans_and_quests(row):
    """Return the fraction of answer words that also appear in the question.

    Reads ``row['clean_answer']`` and ``row['clean_question']``, splits both
    on whitespace, and ignores the word ``'the'`` in the answer. Each
    remaining answer word counts as a match when it occurs anywhere in the
    question; repeated answer words are counted once per occurrence.

    Returns 0 when no answer words remain, otherwise matches divided by the
    number of remaining answer words.
    """
    # Filter out every 'the', not just the first one: list.remove() deletes a
    # single occurrence, which let later copies inflate the overlap.
    answer_words = [word for word in row['clean_answer'].split() if word != 'the']
    if not answer_words:
        return 0

    # A set makes each membership test O(1) instead of rescanning the list.
    question_words = set(row['clean_question'].split())
    match_count = sum(1 for word in answer_words if word in question_words)
    return match_count / len(answer_words)


# Below we compute, for every row, the share of clean_answer terms that are also found in clean_question, then average it.

# In[14]:

jeopardy['answer_in_question'] = jeopardy.apply(match_in_ans_and_quests, axis=1)
ans_in_quest_mean = jeopardy['answer_in_question'].mean()
print(ans_in_quest_mean)


# In[15]:

jeopardy.dtypes


# In[ ]: