In this project we will work on a different data set (Hacker News posts) and extract some value from it using the Python topics we have learned so far.
from csv import reader

# Load the whole Hacker News CSV into memory as a list of rows, where each row
# is a list of strings. The context manager closes the file handle as soon as
# the rows have been read (the original left it open), and newline='' is what
# the csv module recommends so that newlines inside quoted fields are parsed
# correctly.
with open('hacker_news.csv', encoding='utf8', newline='') as HackerNews_file:
    HackerNews_read = reader(HackerNews_file)
    hn = list(HackerNews_read)

# Peek at the first five rows (the header row plus four posts).
print(hn[0:5])
[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]
# Separate the column names from the data: pop(0) hands back the first row and
# removes it from hn in one step, so hn holds only post rows afterwards.
header = hn.pop(0)
print(header)
print(hn[:5])
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'] [['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]
# Sort every post into one of three buckets based on how its title (column 1)
# begins, ignoring case: "Ask HN", "Show HN", or anything else.
ask_posts, show_posts, other_posts = [], [], []
for post in hn:
    lowered_title = post[1].lower()
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

# Report how many posts landed in each bucket.
bucket_report = (
    ("ask_posts ", ask_posts),
    ("show_posts ", show_posts),
    ("other_posts ", other_posts),
)
for label, posts in bucket_report:
    print(label, len(posts))
ask_posts 1744 show_posts 1162 other_posts 17194
# Average number of comments per "Ask HN" post. Column 4 (num_comments) is
# stored as a string in the CSV rows, so it is converted to int before summing.
total_ask_comments = sum(int(post[4]) for post in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)
print('Average number of comments per ask post ', round(avg_ask_comments, 2))

# Same calculation for "Show HN" posts.
total_show_comments = sum(int(post[4]) for post in show_posts)
avg_show_comments = total_show_comments / len(show_posts)
print('Average number of comments per show post ', round(avg_show_comments, 2))
Average number of comments per ask post 14.04 Average number of comments per show post 10.32
It looks like Ask posts attract more comments on average than Show posts.
import datetime as dt
# Pair each Ask post's creation timestamp (column 6) with its comment count
# (column 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Tally, per hour of day, how many Ask posts were created and how many
# comments those posts received. Keys are zero-padded hour strings ("%H").
counts_by_hour = {}
comments_by_hour = {}
for timestamp, n_comments in result_list:
    # Timestamps look like "8/4/2016 11:52".
    parsed = dt.datetime.strptime(timestamp, "%m/%d/%Y %H:%M")
    hour_key = parsed.strftime("%H")
    counts_by_hour[hour_key] = counts_by_hour.get(hour_key, 0) + 1
    comments_by_hour[hour_key] = comments_by_hour.get(hour_key, 0) + n_comments

print(counts_by_hour)
print(comments_by_hour)
{'16': 108, '11': 58, '19': 110, '04': 47, '17': 100, '21': 109, '18': 109, '01': 60, '00': 55, '10': 59, '09': 45, '12': 73, '22': 71, '07': 34, '23': 68, '02': 58, '03': 54, '05': 46, '08': 48, '20': 80, '15': 116, '14': 107, '13': 85, '06': 44} {'16': 1814, '11': 641, '19': 1188, '04': 337, '17': 1146, '21': 1745, '18': 1439, '01': 683, '00': 447, '10': 793, '09': 251, '12': 687, '22': 479, '07': 267, '23': 543, '02': 1381, '03': 421, '05': 464, '08': 492, '20': 1722, '15': 4477, '14': 1416, '13': 1253, '06': 397}
# Average comments per Ask post for each hour, as (hour, average) tuples in
# the order the hours appear in counts_by_hour. counts_by_hour and
# comments_by_hour are filled together and therefore share the same keys, so a
# direct lookup replaces the original nested scan over comments_by_hour.
avg_by_hour = [
    (hr, comments_by_hour[hr] / count)
    for hr, count in counts_by_hour.items()
]
print(avg_by_hour)
[('16', 16.796296296296298), ('11', 11.051724137931034), ('19', 10.8), ('04', 7.170212765957447), ('17', 11.46), ('21', 16.009174311926607), ('18', 13.20183486238532), ('01', 11.383333333333333), ('00', 8.127272727272727), ('10', 13.440677966101696), ('09', 5.5777777777777775), ('12', 9.41095890410959), ('22', 6.746478873239437), ('07', 7.852941176470588), ('23', 7.985294117647059), ('02', 23.810344827586206), ('03', 7.796296296296297), ('05', 10.08695652173913), ('08', 10.25), ('20', 21.525), ('15', 38.5948275862069), ('14', 13.233644859813085), ('13', 14.741176470588234), ('06', 9.022727272727273)]
# Put the average first in each pair so that a plain tuple sort orders the
# hours by their average comment count (highest first with reverse=True).
swap_avg_by_hour = [(average, hour_key) for hour_key, average in avg_by_hour]
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print("Top 5 Hours for Ask Posts Comments")
template = "{0} {1:.2f} average comments per post"
for avg, hour_key in sorted_swap[:5]:
    # Re-parse the two-digit hour so it can be rendered as "HH:MM:".
    hour = dt.datetime.strptime(hour_key, "%H").strftime("%H:%M:")
    print(template.format(hour, avg))
Top 5 Hours for Ask Posts Comments 15:00: 38.59 average comments per post 02:00: 23.81 average comments per post 20:00: 21.52 average comments per post 16:00: 16.80 average comments per post 21:00: 16.01 average comments per post
Looking at the summary above, 3 PM (15:00) seems to be the ideal time to create a post for a greater chance of receiving comments.