This project looks at 20,000 randomly selected posts from Hacker News. Of those, the posts in either the "Ask HN" or the "Show HN" category are analyzed to see which of the two generates more comments on average. We also look for any correlation between the number of comments a post receives and the time at which it was published. The only selection criterion used was that every post must have received at least one comment.
# accessing the data
import csv
from csv import reader
# Load the whole CSV into memory. The `with` block closes the file handle as
# soon as the rows have been read (previously it was left open), and
# newline='' is what the csv module documentation asks for so that quoted
# fields containing embedded line breaks are parsed correctly.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    data_file = list(read_file)

# The first row holds the column names; every row after it is one post.
hn_header = data_file[0]
hn = data_file[1:]
print(hn_header)
print(hn[:5])
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'] [['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]
# Separate the posts into three mutually exclusive sub-groups, keyed on the
# case-insensitive prefix of the title (column 1 of each row).
ask_posts = []
show_posts = []
other_posts = []
for row in hn:
    title = row[1].lower()
    if title.startswith('ask hn'):
        ask_posts.append(row)
    # This must be `elif`: with a second independent `if`, the `else` below
    # pairs only with the "show hn" test, so every Ask HN post was appended
    # to other_posts as well and the three counts summed to more than
    # len(hn).
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

# The three lengths now add up to len(hn).
print(len(ask_posts))
print(len(show_posts))
print(len(other_posts))
1744 1162 18938
# Average number of comments per post in the "Ask HN" category.
# Column 4 of each row holds the comment count as a string, hence int().
total_ask_comments = sum(int(post[4]) for post in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)
print(avg_ask_comments)

# Average number of comments per post in the "Show HN" category.
total_show_comments = sum(int(post[4]) for post in show_posts)
average_show_comments = total_show_comments / len(show_posts)
print(average_show_comments)
14.038417431192661 10.31669535283993
import datetime as dt
# For every Ask HN post keep only the two fields needed for the hourly
# analysis: the creation timestamp string (column 6; only its hour is used
# later) and the comment count (column 4) converted to int.
result_list = [[post[6], int(post[4])] for post in ask_posts]
counts_by_hour = {}    # hour of day -> number of Ask HN posts created in that hour
comments_by_hour = {}  # hour of day -> total comments received by those posts
for timestamp, n_comments in result_list:
    # Timestamps look like "8/4/2016 11:52"; only the hour component matters.
    hour_of_day = dt.datetime.strptime(timestamp, "%m/%d/%Y %H:%M").hour
    # .get(..., 0) starts a fresh tally the first time an hour is seen.
    counts_by_hour[hour_of_day] = counts_by_hour.get(hour_of_day, 0) + 1
    comments_by_hour[hour_of_day] = comments_by_hour.get(hour_of_day, 0) + n_comments
# One [hour, average comments per post] pair for every hour that has posts.
avg_by_hour = [
    [hr, comments_by_hour[hr] / counts_by_hour[hr]]
    for hr in counts_by_hour
]
# Flip each [hour, average] pair to [average, hour] so the average comes first.
swap_avg_by_hour = [[pair[1], pair[0]] for pair in avg_by_hour]

# Order a copy from highest to lowest average; swap_avg_by_hour itself is
# left in its original order.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(key=lambda pair: pair[0], reverse=True)
# Report the five hours with the highest average comment count.
print("Top 5 Hours for Ask Post Comments")
print("Adjusted to Pacific Time")
line_template = "{}: {:.2f} average comments per post."
for avg_comments, hour_of_day in sorted_swap[:5]:
    # Parse the bare hour into a datetime so it can be shifted back three
    # hours before formatting.
    # NOTE(review): the 3-hour offset presumably assumes the source
    # timestamps are US Eastern time; confirm against the dataset.
    shifted = dt.datetime.strptime(str(hour_of_day), "%H") - dt.timedelta(hours=3)
    print(line_template.format(shifted.strftime("%H:%M"), avg_comments))
Top 5 Hours for Ask Post Comments Adjusted to Pacific Time 12:00: 38.59 average comments per post. 23:00: 23.81 average comments per post. 17:00: 21.52 average comments per post. 13:00: 16.80 average comments per post. 18:00: 16.01 average comments per post.