This project analyzes the total number of posts whose titles begin with Ask HN or Show HN, and determines at which time of day Hacker News posts receive the most comments.
import csv
from csv import reader
# Load the whole CSV into memory as a list of rows (each row is a list of
# strings). The with-block guarantees the file handle is closed once the
# rows have been read, and newline="" follows the csv module's guidance so
# that line breaks embedded inside quoted fields are parsed correctly.
with open("hacker_news.csv", newline="") as csv_file:
    news_file = reader(csv_file)
    hn = list(news_file)
print(hn[:5])
headers = hn[0]  # keep the header row for reference
hn = hn[1:]  # drop the header row so only data rows remain
# Split the data rows into three buckets according to how the title
# (column 1) starts, ignoring case: "ask hn", "show hn", or anything else.
ask_posts, show_posts, other_posts = [], [], []
for post in hn:
    title = post[1]
    lowered = title.lower()
    if lowered.startswith('ask hn'):
        bucket = ask_posts
    elif lowered.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)
# Echo the title of the last row that was processed.
print(title)
# Report the size of each bucket.
for bucket in (ask_posts, show_posts, other_posts):
    print(len(bucket))
ask_posts[:5]
def avg_comments(data):
    """Return the mean comment count across the rows in ``data``.

    Args:
        data: A sized collection of rows. Index 4 of each row is converted
            with ``int`` and treated as that row's comment count.

    Returns:
        The summed comment counts divided by ``len(data)`` as a float, or
        ``0.0`` when ``data`` is empty.

    Raises:
        ValueError: If index 4 of a row cannot be converted by ``int``.
        IndexError: If a row has fewer than five elements.
    """
    # An empty selection has no meaningful average; return 0.0 instead of
    # letting the division below raise ZeroDivisionError.
    if not data:
        return 0.0
    total_comments = sum(int(row[4]) for row in data)
    return total_comments / len(data)
# Compute the average comment count for each post category, then print
# the Ask HN figure followed by the Show HN figure.
avg_ask_comments = avg_comments(ask_posts)
avg_show_comments = avg_comments(show_posts)
for category_average in (avg_ask_comments, avg_show_comments):
    print(category_average)
Based on these findings, we can conclude that, on average, "ask_posts" receive more comments than "show_posts".
import datetime as dt
# For every Ask HN post, pair its date/time string (column 6) with its
# comment count (column 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]
# Tally, per hour of the day, how many posts were made (counts_by_hour)
# and how many comments those posts received in total (comments_by_hour).
counts_by_hour = {}
comments_by_hour = {}
for timestamp, n_comments in result_list:
    # Timestamps look like "month/day/year hour:minute"; keep only the
    # zero-padded two-digit hour as the dictionary key.
    parsed = dt.datetime.strptime(timestamp, "%m/%d/%Y %H:%M")
    hour = parsed.strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments
print(counts_by_hour)
print(comments_by_hour)
# Average number of comments per post for each hour, as [hour, average]
# pairs. The original looped over `comments_by_hour and counts_by_hour`,
# which does not intersect the two dicts: `and` simply evaluates to one of
# its operands. Iterating counts_by_hour explicitly states what is actually
# traversed; every hour key in it is expected to exist in comments_by_hour
# as well.
avg_by_hour = [
    [hour, comments_by_hour[hour] / post_count]
    for hour, post_count in counts_by_hour.items()
]
print(avg_by_hour)
# Flip each [hour, average] pair to [average, hour] so that a plain sort
# orders the entries by average; then sort a copy from highest to lowest.
swap_avg_by_hour = [[average, hour_key] for hour_key, average in avg_by_hour]
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)
sorted_swap[:5]
These are the top 5 hours for ask_posts, ranked by the average number of comments per post:
# Print the five highest-ranked entries, one line per hour, with the hour
# rendered as "HH:MM:" and the average rounded to two decimal places.
for average, hour_key in sorted_swap[:5]:
    hour_label = dt.datetime.strptime(hour_key, "%H").strftime("%H:%M:")
    average_text = "{:.2f}".format(average)
    print(
        "{hour} {avg1} average comments per post".format(
            hour=hour_label, avg1=average_text
        )
    )
After analyzing the data, we can see that there are more posts whose titles start with "Ask HN" than posts whose titles start with "Show HN".