Hacker News Project

In this project, we'll work with a data set of submissions to the popular technology site Hacker News.

In [1]:
from csv import reader

# Load the whole CSV into memory as a list of rows (each row is a list of strings).
# The `with` block closes the file handle as soon as reading finishes, and
# newline="" lets the csv module handle line endings itself, as its docs advise.
with open("hacker_news.csv", newline="") as opened:
    hn = list(reader(opened))
In [2]:
# Peel the header row off the front; everything after it is submission data.
headers, hn = hn[0], hn[1:]
print(hn[:6])
[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12'], ['10482257', 'Title II kills investment? Comcast and other ISPs are now spending more', 'http://arstechnica.com/business/2015/10/comcast-and-other-isps-boost-network-investment-despite-net-neutrality/', '53', '22', 'Deinos', '10/31/2015 9:48']]
In [3]:
ask_posts, show_posts, other_posts = [], [], []

# Bucket every submission by the (case-insensitive) prefix of its title,
# which is stored in column 1.
for row in hn:
    lowered_title = row[1].lower()
    if lowered_title.startswith("ask hn"):
        bucket = ask_posts
    elif lowered_title.startswith("show hn"):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(row)

for label, posts in (
    ("ask posts = ", ask_posts),
    ("show posts = ", show_posts),
    ("other posts = ", other_posts),
):
    print(label, len(posts))
ask posts =  1744
show posts =  1162
other posts =  17194
In [4]:
# num_comments is stored as a string in column 4 of every row.

# Average number of comments per ask post.
total_ask_comments = sum(int(row[4]) for row in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)
print("average ask comments is:", avg_ask_comments)

# Average number of comments per show post.
total_show_comments = sum(int(row[4]) for row in show_posts)
avg_show_comments = total_show_comments / len(show_posts)
print("average show comments is:", avg_show_comments)
average ask comments is: 14.038417431192661
average show comments is: 10.31669535283993

The investigation above shows that, on average, ask posts receive about 4 more comments than show posts. The average ask post gets around 14 comments, whereas the average show post gets only about 10.

In [5]:
import datetime as dt

# Pair each ask post's creation time (column 6) with its comment count
# (column 4, still a string here). Explicit indexing is used instead of the
# slice `row[7:3:-2]`, which only produced these two columns because the
# out-of-range start index was clamped to the last element of a 7-column row.
result_list = [[row[6], row[4]] for row in ask_posts]

# counts_by_hour: number of ask posts created during each hour of the day.
# comments_by_hour: total comments received by ask posts created in that hour.
# Keys are zero-padded hour strings such as "09".
counts_by_hour = {}
comments_by_hour = {}

date_format = "%m/%d/%Y %H:%M"

for created_at, num_comments in result_list:
    hour = dt.datetime.strptime(created_at, date_format).strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + int(num_comments)

print("comment by hour ",comments_by_hour)
print('\n')
print("counts by hour",counts_by_hour)
comment by hour  {'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}


counts by hour {'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58}
In [6]:
# Average comments per ask post for every hour, as [hour, average] pairs.
# The order follows the insertion order of comments_by_hour.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in comments_by_hour
]

print("average comments per post by the hour is:",avg_by_hour[0:5])
average comments per post by the hour is: [['09', 5.5777777777777775], ['13', 14.741176470588234], ['10', 13.440677966101696], ['14', 13.233644859813085], ['16', 16.796296296296298]]
In [7]:
# Flip each [hour, average] pair to [average, hour] so that sorting the list
# orders the entries by average first.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print("Top 5 Hours for Ask Posts Comments")

line_template = "{}: {:.2f} average comments per post"
for average, hour in sorted_swap[:5]:
    # Round-trip the "%H" string through datetime to render it as "HH:00".
    hour_frmtd = dt.datetime.strptime(hour, "%H").strftime("%H:00")
    print(line_template.format(hour_frmtd, average))
 
    
Top 5 Hours for Ask Posts Comments
15:00: 38.59 average comments per post
02:00: 23.81 average comments per post
20:00: 21.52 average comments per post
16:00: 16.80 average comments per post
21:00: 16.01 average comments per post

Conclusions

The data set uses US Eastern time. I am in Dubai, which is 8 hours ahead of US Eastern time, so the best hour for ask-post comments (15:00 Eastern) is 23:00 (11 pm) local Dubai time. The next best hour (02:00 Eastern) is 10 am Dubai time.

Next steps: 1) Determine if show or ask posts receive more points on average. 2) Determine if posts created at a certain time are more likely to receive more points. 3) Compare your results to the average number of comments and points other posts receive. 4) Use Dataquest's data science project style guide to format your project. Link: https://www.dataquest.io/blog/data-science-project-style-guide/

Next Goal: Determine if show or ask posts receive more points on average.

So we calculate the average points for ask posts and show posts.

In [17]:
# num_points lives in column 3 of each row; add it up per category and
# divide by the number of posts in that category.
total_ask_points = sum(int(post[3]) for post in ask_posts)
total_show_points = sum(int(post[3]) for post in show_posts)

avg_ask_point = round(total_ask_points / len(ask_posts), 2)
avg_show_point = round(total_show_points / len(show_posts), 2)

for label, value in (
    ("Average points for ask posts is: ", avg_ask_point),
    ("Average points for show posts is: ", avg_show_point),
):
    print(label, value)
Average points for ask posts is:  15.06
Average points for show posts is:  27.56

The numbers above show that, on average, show posts receive more points.

Determine if posts created at a certain time are more likely to receive more points.

This is similar to the time/date manipulation we did for the comments part. We reuse the same approach and rebuild the `result_list` variable, this time pairing each post's creation date with its points instead of its comments.

In [32]:
## For ask posts:
# Pair each ask post's creation time (column 6) with its points (column 3,
# still a string here). Explicit indexing is used instead of the slice
# `row[7:2:-3]`, which only produced these two columns because the
# out-of-range start index was clamped to the last element of a 7-column row.
result_list = [[row[6], row[3]] for row in ask_posts]

# Per-hour tallies keyed by zero-padded hour strings such as "09".
counts_by_hour = {}
points_by_hour = {}

date_format = "%m/%d/%Y %H:%M"

for created_at, num_points in result_list:
    hour = dt.datetime.strptime(created_at, date_format).strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    points_by_hour[hour] = points_by_hour.get(hour, 0) + int(num_points)

# Average points per ask post for every hour, as [hour, average] pairs.
avg_by_hour = [
    [hour, points_by_hour[hour] / counts_by_hour[hour]]
    for hour in points_by_hour
]

# Flip to [average, hour] so that sorting orders the entries by average first.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print("Top 5 Hours for ASK Posts points")

for average, hour in sorted_swap[0:5]:
    # Round-trip the "%H" string through datetime to render it as "HH:00".
    hour_frmtd = dt.datetime.strptime(hour, "%H").strftime("%H:00")
    print("{}: {:.2f} average points per post".format(hour_frmtd, average))
 
    
Top 5 Hours for ASK Posts points
15:00: 29.99 average points per post
13:00: 24.26 average points per post
16:00: 23.35 average points per post
17:00: 19.41 average points per post
10:00: 18.68 average points per post
In [33]:
## For show posts:
# Pair each show post's creation time (column 6) with its points (column 3,
# still a string here). Explicit indexing is used instead of the slice
# `row[7:2:-3]`, which only produced these two columns because the
# out-of-range start index was clamped to the last element of a 7-column row.
result_list = [[row[6], row[3]] for row in show_posts]

# Per-hour tallies keyed by zero-padded hour strings such as "09".
counts_by_hour = {}
points_by_hour = {}

date_format = "%m/%d/%Y %H:%M"

for created_at, num_points in result_list:
    hour = dt.datetime.strptime(created_at, date_format).strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    points_by_hour[hour] = points_by_hour.get(hour, 0) + int(num_points)

# Average points per show post for every hour, as [hour, average] pairs.
avg_by_hour = [
    [hour, points_by_hour[hour] / counts_by_hour[hour]]
    for hour in points_by_hour
]

# Flip to [average, hour] so that sorting orders the entries by average first.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print("Top 5 Hours for SHOW Posts points")

for average, hour in sorted_swap[0:5]:
    # Round-trip the "%H" string through datetime to render it as "HH:00".
    hour_frmtd = dt.datetime.strptime(hour, "%H").strftime("%H:00")
    print("{}: {:.2f} average points per post".format(hour_frmtd, average))
 
    
Top 5 Hours for SHOW Posts points
23:00: 42.39 average points per post
12:00: 41.69 average points per post
22:00: 40.35 average points per post
00:00: 37.84 average points per post
18:00: 36.31 average points per post

We see that the top 5 hours for show posts have higher average points than the top 5 hours for ask posts. The top 3 hours for show posts have close averages. The best time to make a show post is 23:00 US Eastern time, whereas the best time for an ask post in terms of points is 15:00 US Eastern time.

We just need to use the other posts list and calculate the average points and comments per hour for it. I would build each entry as a list with the date, num_points and num_comments, then do the tally and the averages per hour that way.

In [69]:
# For every other post keep [date, num_points, num_comments]: the date is the
# last column, points are column 3 and comments are column 4 (all strings).
other_result_list = [[row[-1], row[3], row[4]] for row in other_posts]

# Per-hour tallies keyed by zero-padded hour strings such as "09".
counts_by_hour = {}
points_by_hour = {}
comments_by_hour = {}

# Format of the creation timestamp stored in the data set.
date_format = "%m/%d/%Y %H:%M"

for created_at, num_points, num_comments in other_result_list:
    hour = dt.datetime.strptime(created_at, date_format).strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    points_by_hour[hour] = points_by_hour.get(hour, 0) + int(num_points)
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + int(num_comments)

# Per-hour averages for other posts, as [hour, avg_points, avg_comments].
avg_by_hour = [
    [
        hour,
        points_by_hour[hour] / counts_by_hour[hour],
        comments_by_hour[hour] / counts_by_hour[hour],
    ]
    for hour in points_by_hour
]

# Build [average, hour] pairs so that sorting orders by the average first.
# Explicit unpacking replaces the `row[1::-1]` and `row[-1::-2]` slices.
sort_avg_points = [[avg_points, hour] for hour, avg_points, _ in avg_by_hour]
sort_avg_comment = [[avg_comments, hour] for hour, _, avg_comments in avg_by_hour]

sorted_swap_points = sorted(sort_avg_points, reverse=True)
sorted_swap_comments = sorted(sort_avg_comment, reverse=True)

print("Top 5 Hours for OTHER Posts POINTS")

for average, hour in sorted_swap_points[0:5]:
    # Round-trip the "%H" string through datetime to render it as "HH:00".
    hour_frmtd = dt.datetime.strptime(hour, "%H").strftime("%H:00")
    print("{}: {:.2f} average points per post".format(hour_frmtd, average))
print ("\n")

print("Top 5 Hours for OTHER Posts COMMENTS")

for average, hour in sorted_swap_comments[0:5]:
    hour_frmtd = dt.datetime.strptime(hour, "%H").strftime("%H:00")
    print("{}: {:.2f} average comments per post".format(hour_frmtd, average))
 
Top 5 Hours for OTHER Posts POINTS
13:00: 62.53 average points per post
14:00: 61.79 average points per post
15:00: 60.54 average points per post
10:00: 60.48 average points per post
19:00: 60.01 average points per post


Top 5 Hours for OTHER Posts COMMENTS
14:00: 32.33 average comments per post
13:00: 30.90 average comments per post
12:00: 30.35 average comments per post
11:00: 29.59 average comments per post
15:00: 29.52 average comments per post

We can see that the average points on other posts are higher than on ask and show posts. However, the top hourly average for comments (32.33) is lower than that of ask posts (38.59). Posts created between 1 and 2 pm US Eastern time have the highest average points and comments, but these averages are close together across the top five hours.