#!/usr/bin/env python
# coding: utf-8

# # My Second Guided Project
# ## INTRODUCTION
# 
# In this project, we will work with a data set of submissions to the popular technology site [Hacker News](https://news.ycombinator.com/).
# 
# In this site, user-submitted stories (known as "posts") are voted and commented upon. The posts that make it to the top of Hacker News' listings can get hundreds of thousands of visitors as a result. Below are descriptions of the columns:
# * id: The unique identifier from Hacker News for the post
# * title: The title of the post
# * url: The URL that the post links to, if the post has a URL
# * num_points: The number of points the post acquired, calculated as the total number of upvotes minus the total number of downvotes
# * num_comments: The number of comments that were made on the post
# * author: The username of the person who submitted the post
# * created_at: The date and time at which the post was submitted
# 
# 

# In[1]:


from csv import reader

# Read the whole CSV inside a context manager so the file handle is closed
# as soon as parsing finishes (or fails). newline='' is what the csv module
# asks for so that newlines embedded in quoted fields are parsed correctly;
# the explicit encoding avoids depending on the platform's default.
with open('hacker_news.csv', encoding='utf-8', newline='') as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)

# The first row holds the column names; everything after it is data.
hn_header = hn[0]
hn = hn[1:]
print(hn_header)
print('\n')


# In[2]:


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: List of rows, where each row is itself a sequence.
        start: Index of the first row to print (inclusive).
        end: Index at which printing stops (exclusive).
        rows_and_columns: When true, also print the total number of rows
            and the number of columns in the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # leaves blank space after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report zero
        # columns instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)


# In[3]:


# Preview the first five data rows and report the dataset's dimensions.
explore_data(hn, start=0, end=5, rows_and_columns=True)


# In the dataset, we're specifically interested in posts whose titles begin with either 'Ask HN' (posts that ask the Hacker News community a specific question) or 'Show HN' (posts that show the Hacker News community a project, product, or just generally something interesting).
# 
# We want to compare the two types of posts to determine the following:
# * Do `Ask HN` or `Show HN` receive more comments on average?
# * Do posts created at a certain time receive more comments on average?

# To start, we will create new lists of lists containing just the data for those titles.

# In[4]:


# Split the posts into three groups according to how the title starts.
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Compare in lower case so that titles such as 'ask hn: ...' or
    # 'SHOW HN: ...' are not misfiled under other posts.
    title = row[1].lower()
    if title.startswith('ask hn'):
        ask_posts.append(row)
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

print('Number of rows ask posts:', len(ask_posts))
print('\n')
print('Number of rows show post:', len(show_posts))
print('\n')
print('Number of rows other posts:', len(other_posts))
       

# Below are the first five rows in the `ask_posts` list of lists.

# In[5]:


# Preview the first five ask posts.
explore_data(ask_posts, start=0, end=5)


# Below are the first five rows of the `show_posts` list of lists.

# In[6]:


# Preview the first five show posts.
explore_data(show_posts, start=0, end=5)


# Now, let's determine if ask posts or show posts receive more comments on the average.

# In[7]:


def aveg_comment(dataset):
    """Print and return the average number of comments per post.

    Args:
        dataset: List of post rows; the comment count is read from
            index 4 of each row and converted with ``int``.

    Returns:
        The mean comment count as a float, or 0.0 when ``dataset`` is
        empty.
    """
    if not dataset:
        # Guard the empty case so the division below cannot raise
        # ZeroDivisionError.
        avg_comments = 0.0
    else:
        total_comments = sum(int(row[4]) for row in dataset)
        avg_comments = total_comments / len(dataset)
    print(avg_comments)
    return avg_comments


# In[8]:


# Average number of comments on ask posts.
aveg_comment(dataset=ask_posts)


# In[9]:


# Average number of comments on show posts.
aveg_comment(dataset=show_posts)


# Our analysis shows that posts whose titles begin with `Ask HN` receive more comments on average than posts whose titles begin with `Show HN`.
# 
# This means that when you ask the `Hacker News` community a question, you'll get more responses (maybe answers to your question) than when you are just showing them a product or project.

# Since ask posts are more likely to receive comments, we'll focus our remaining analysis on these posts.

# Our next task is to determine if ask posts created at a certain time are more likely to attract comments.
# 
# We'll use the following steps:
# 1. Calculate the number of ask posts created in each hour of the day, along with the number of comments received.
# 2. Calculate the average number of comments ask posts receive by hour created.
# 
# 

# In[10]:


import datetime as dt

# Pair each ask post's creation timestamp with its integer comment count.
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Tally, per hour of the day, how many ask posts were created and how many
# comments they collected. Keys are the two-digit hour strings produced by
# strftime('%H').
counts_by_hour = {}
comments_by_hour = {}
for timestamp, comments in result_list:
    hour = dt.datetime.strptime(timestamp, '%m/%d/%Y %H:%M').strftime('%H')
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + comments

print(counts_by_hour)


# In[11]:


# Total comments received by ask posts, keyed by the hour they were created.
print(comments_by_hour)


# In[12]:


# Average comments per ask post for each hour, as [hour, average] pairs.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in comments_by_hour
]

print(avg_by_hour)
    

# In[13]:


# Put the average first so that a plain sort orders the pairs by average.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]
print(swap_avg_by_hour)


# In[14]:


# Order the [average, hour] pairs from highest to lowest average, then keep
# the five best hours.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)
sorted_swap_first_five = sorted_swap[:5]

print(sorted_swap_first_five)


# In[15]:


# Report each of the top five hours as 'HH:MM' alongside its average.
for avg, hr in sorted_swap_first_five:
    hr_dt_string = dt.datetime.strptime(hr, '%H').strftime('%H:%M')
    print('{}: {:.2f} average comments per post'.format(hr_dt_string, avg))
    print('\n')


# My analysis shows that there's a higher chance of receiving comments if you create a post between 15:00 and 21:00 (i.e. 3pm-9pm).
# 
# From 15:00, most people have started wrapping up business for the day, so it makes sense to believe they have time for the community until about 21:00 (9pm), when it will be time to go to bed.
# 
# Although the 02:00 (2am) mark looks favorable, I wouldn't advise it because it may just be sheer luck, especially since there's no other time frame close to it in the top 5 average comments per hour.

# In[ ]: