Hacker News Data Analysis

  1. Do Ask HN or Show HN posts receive more comments on average?
  2. Do posts created at a certain time receive more comments on average?
In [1]:
# Import libraries needed
from csv import reader
import datetime as dt

# Read the whole CSV inside a context manager so the file handle is closed
# even if parsing raises. newline='' is what the csv module documentation
# recommends for files handed to csv.reader, so quoted fields that contain
# line breaks are parsed correctly.
with open('hacker_news.csv', newline='') as open_file:
    read_file = reader(open_file)
    hn = list(read_file)

# The first row is kept separately as the header; the remaining rows are
# the posts analysed below.
header = hn[0]
hn = hn[1:]

# Preview the first five data rows.
print(hn[:5])
[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]
In [2]:
# Partition the posts into three groups based on how the title starts
# (compared case-insensitively): "Ask HN", "Show HN", or anything else.

ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    lowered_title = post[1].lower()
    # Pick the destination list first, then append once.
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

# Record how many posts landed in each group.
len_ask_posts = len(ask_posts)
len_show_posts = len(show_posts)
len_other_posts = len(other_posts)
In [3]:
def summarize_comments(posts):
    """Return ``(total, average)`` comment counts for a list of post rows.

    Each row is expected to hold its comment count as a numeric string at
    index 4. An empty list yields ``(0, 0.0)`` instead of raising
    ZeroDivisionError.
    """
    total = sum(int(post[4]) for post in posts)
    average = total / len(posts) if posts else 0.0
    return total, average


# Apply the same computation to both groups instead of repeating the loop.
total_ask_comments, avg_ask_comments = summarize_comments(ask_posts)
print(avg_ask_comments)

total_show_comments, avg_show_comments = summarize_comments(show_posts)
print(avg_show_comments)
14.038417431192661
10.31669535283993

Do show posts or ask posts receive more comments on average?

Answer:

  • Ask posts receive more comments on average
  • About 14 comments per ask post vs. about 10 per show post

Are ask posts created at a certain time more likely to attract comments?

Steps:

  1. Calculate the number of ask posts created in each hour of the day, along with the number of comments received.
  2. Calculate the average number of comments ask posts receive by hour created.
In [4]:
# Pair each ask post's creation timestamp with its comment count.
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Build two tallies keyed by the zero-padded hour of creation ("00".."23"):
# the number of ask posts created in that hour, and the total comments
# those posts received.
counts_by_hour = {}
comments_by_hour = {}
for created_at, n_comments in result_list:
    hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    # .get(..., 0) covers the first time an hour is seen.
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments
print(counts_by_hour)
print('')
print(comments_by_hour)
{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58}

{'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}
In [5]:
# For each hour, divide the total comments by the number of posts created
# in that hour, producing [hour, average comments per post] pairs.
avg_by_hour = [
    [hour, total_comments / counts_by_hour[hour]]
    for hour, total_comments in comments_by_hour.items()
]
print(avg_by_hour)
[['09', 5.5777777777777775], ['13', 14.741176470588234], ['10', 13.440677966101696], ['14', 13.233644859813085], ['16', 16.796296296296298], ['23', 7.985294117647059], ['12', 9.41095890410959], ['17', 11.46], ['15', 38.5948275862069], ['21', 16.009174311926607], ['20', 21.525], ['02', 23.810344827586206], ['18', 13.20183486238532], ['03', 7.796296296296297], ['05', 10.08695652173913], ['19', 10.8], ['01', 11.383333333333333], ['22', 6.746478873239437], ['08', 10.25], ['04', 7.170212765957447], ['00', 8.127272727272727], ['06', 9.022727272727273], ['07', 7.852941176470588], ['11', 11.051724137931034]]
In [19]:
# Flip every [hour, average] pair to [average, hour] so that sorting the
# pairs orders them by average first.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

# Order a copy from the highest average number of comments to the lowest.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)

# Report the five best hours, showing each hour as HH:MM.
print('Top 5 Hours for Ask Posts Comments')
line_template = "{}: {:.2f} average comments per post"
for average, hour in sorted_swap[:5]:
    hour_label = dt.datetime.strptime(hour, "%H").strftime("%H:%M")
    print(line_template.format(hour_label, average))
Top 5 Hours for Ask Posts Comments
15:00: 38.59 average comments per post
02:00: 23.81 average comments per post
20:00: 21.52 average comments per post
16:00: 16.80 average comments per post
21:00: 16.01 average comments per post