#!/usr/bin/env python
# coding: utf-8
"""Exploring Hacker News Posts.

Hacker News is a site where users can submit posts that are voted and
commented upon. Hacker News is very popular in technology and startup
circles, and posts that make it to the top of Hacker News' listings can get
hundreds of thousands of visitors as a result.

There are two types of posts that are important for this project: "Ask HN"
and "Show HN". People submit "Ask HN" posts to ask the Hacker News community
a specific question. Likewise, users submit "Show HN" posts to show a
project, product or something interesting.

The purpose of this project is to compare these two types of posts and
answer the following questions:

1. Do "Ask HN" or "Show HN" posts receive more comments on average?
2. Do posts created at a certain time receive more comments on average?
"""

import csv
import datetime as dt

DATA_FILE = "HN_posts_year_to_Sep_26_2016.csv"

# Positions, within each CSV row, of the fields this analysis reads.
TITLE_COLUMN = 1
NUM_COMMENTS_COLUMN = 4
CREATED_AT_COLUMN = 6

# Layout of the creation timestamp, e.g. "9/26/2016 3:26".
CREATED_AT_FORMAT = "%m/%d/%Y %H:%M"


def load_rows(path):
    """Return every row of the CSV file at *path*, header row included.

    The file is opened in a ``with`` block so the handle is always closed.
    ``newline=""`` is what the ``csv`` module asks for, and the explicit
    encoding keeps the result independent of the platform's locale.
    NOTE(review): the data set is assumed to be UTF-8 encoded -- confirm.
    """
    with open(path, newline="", encoding="utf-8") as csv_file:
        return list(csv.reader(csv_file))


def split_posts(rows):
    """Split *rows* into ``(ask_posts, show_posts, other_posts)``.

    A row is an ask post when its title starts with "ask hn", a show post
    when it starts with "show hn" (both compared case-insensitively), and
    falls into the third list otherwise.
    """
    ask_posts = []
    show_posts = []
    other_posts = []
    for row in rows:
        title = row[TITLE_COLUMN].lower()
        if title.startswith("ask hn"):
            ask_posts.append(row)
        elif title.startswith("show hn"):
            show_posts.append(row)
        else:
            other_posts.append(row)
    return ask_posts, show_posts, other_posts


def average_comments(posts):
    """Return the mean number of comments over *posts* as a float.

    Returns ``0.0`` for an empty list rather than dividing by zero.
    """
    if not posts:
        return 0.0
    total_comments = sum(int(row[NUM_COMMENTS_COLUMN]) for row in posts)
    return total_comments / len(posts)


def tally_by_hour(posts):
    """Count posts and sum their comments for each hour of creation.

    Returns ``(counts_by_hour, comments_by_hour)``: two plain dicts keyed by
    the zero-padded hour string ("00" to "23") parsed from each row's
    creation timestamp. Hours in which no post was created are absent.

    Raises ``ValueError`` when a timestamp does not match
    ``CREATED_AT_FORMAT`` or a comment count is not an integer.
    """
    counts_by_hour = {}
    comments_by_hour = {}
    for row in posts:
        created_at = dt.datetime.strptime(
            row[CREATED_AT_COLUMN], CREATED_AT_FORMAT
        )
        hour = created_at.strftime("%H")
        counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
        comments_by_hour[hour] = (
            comments_by_hour.get(hour, 0) + int(row[NUM_COMMENTS_COLUMN])
        )
    return counts_by_hour, comments_by_hour


def average_by_hour(counts_by_hour, comments_by_hour):
    """Return ``[hour, average comments per post]`` pairs.

    The pairs follow the iteration order of *comments_by_hour*; every hour
    in it must also be a key of *counts_by_hour*.
    """
    return [
        [hour, total_comments / counts_by_hour[hour]]
        for hour, total_comments in comments_by_hour.items()
    ]


def main():
    """Run the whole analysis and print each intermediate result."""
    hn = load_rows(DATA_FILE)
    print(hn[:5])

    # Remove the header row from hn.
    headers = hn[0]
    hn = hn[1:]
    print(headers)
    print(hn[:5])

    # Split ask posts and show posts into two different lists.
    ask_posts, show_posts, other_posts = split_posts(hn)
    print(len(ask_posts))
    print(len(show_posts))
    print(len(other_posts))

    # Calculate the average number of comments for ask posts.
    avg_ask_comments = average_comments(ask_posts)
    print(avg_ask_comments)

    # Calculate the average number of comments for show posts.
    avg_show_comments = average_comments(show_posts)
    print(avg_show_comments)

    # Finding recorded with the original analysis: on average, ask posts
    # receive more comments than show posts.

    # Next, we decide whether ask posts created at a certain time are more
    # likely to get more comments. There are two steps to this analysis:
    #
    # 1. Calculate the number of ask posts created in each hour of the day,
    #    along with the number of comments received.
    # 2. Calculate the average number of comments ask posts receive by hour
    #    created.
    counts_by_hour, comments_by_hour = tally_by_hour(ask_posts)
    # The notebook showed these two results via trailing bare expressions,
    # which display nothing in a script, so they are printed explicitly.
    print(comments_by_hour)

    avg_by_hour = average_by_hour(counts_by_hour, comments_by_hour)
    print(avg_by_hour)


if __name__ == "__main__":
    main()