#!/usr/bin/env python
# coding: utf-8

# # My Second Guided Project

# ## INTRODUCTION
#
# In this project, we will work with a data set of submissions to the popular technology site [Hacker News](https://news.ycombinator.com/).
#
# On this site, user-submitted stories (known as "posts") are voted and commented upon. The posts that make it to the top of Hacker News' listings can get hundreds of thousands of visitors as a result. Below are descriptions of the columns:
# * id: The unique identifier from Hacker News for the post
# * title: The title of the post
# * url: The URL that the post links to, if the post has a URL
# * num_points: The number of points the post acquired, calculated as the total number of upvotes minus the total number of downvotes
# * num_comments: The number of comments that were made on the post
# * author: The username of the person who submitted the post
# * created_at: The date and time at which the post was submitted
#
#

# In[1]:


from csv import reader

# The context manager closes the file handle as soon as every row has been
# read. newline='' is what the csv module documents for files passed to
# reader().
# NOTE(review): encoding='utf-8' assumes the CSV is UTF-8 encoded -- confirm.
with open('hacker_news.csv', newline='', encoding='utf-8') as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)

# The first row holds the column names; everything after it is data.
hn_header = hn[0]
hn = hn[1:]
print(hn_header)
print('\n')


# In[2]:


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset size.

    Args:
        dataset: Sequence of rows (a list of lists in this script).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the number of rows and the
            number of columns (the length of the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)


# In[3]:


explore_data(hn, 0, 5, rows_and_columns=True)


# In the dataset, we're specifically interested in posts whose titles begin with either 'Ask HN' (posts that ask the Hacker News community a specific question) or 'Show HN' (posts that show the Hacker News community a project, product, or just generally something interesting).
#
# We want to compare the two types of posts to determine the following:
# * Do `Ask HN` or `Show HN` posts receive more comments on average?
# * Do posts created at a certain time receive more comments on average?
#
# To start, we will create new lists of lists containing just the data for those titles.

# In[4]:


ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    title = row[1]
    # Match on the lower-cased title so that capitalization variants of the
    # prefix (e.g. 'Ask Hn', 'show hn') are not misfiled as "other" posts.
    lowered_title = title.lower()
    if lowered_title.startswith('ask hn'):
        ask_posts.append(row)
    elif lowered_title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

print('Number of rows ask posts:', len(ask_posts))
print('\n')
print('Number of rows show post:', len(show_posts))
print('\n')
print('Number of rows other posts:', len(other_posts))


# Below are the first five rows in the `ask_posts` list of lists.

# In[5]:


explore_data(ask_posts, 0, 5)


# Below are the first five rows of the `show_posts` list of lists.

# In[6]:


explore_data(show_posts, 0, 5)


# Now, let's determine whether ask posts or show posts receive more comments on average.

# In[7]:


def aveg_comment(dataset):
    """Print and return the average of column 4 (num_comments) over ``dataset``.

    Args:
        dataset: Sequence of rows whose element at index 4 can be converted
            with ``int()``.

    Returns:
        The average as a float, or ``None`` when ``dataset`` is empty.

    Raises:
        ValueError: If a row's element at index 4 is not an integer string.
    """
    # Guard the division: an empty dataset has no average.
    if not dataset:
        print('No posts to average')
        return None

    total_comments = sum(int(row[4]) for row in dataset)
    avg_comments = total_comments / len(dataset)
    print(avg_comments)
    return avg_comments


# In[8]:


aveg_comment(ask_posts)


# In[9]:


aveg_comment(show_posts)


# Our analysis shows that posts whose titles begin with `Ask HN` have more comments on average than posts whose titles begin with `Show HN`.
#
# This means that when you ask the `Hacker News` community a question, you'll get more responses (maybe answers to your question) than when you are just showing them a product or project.

# Since ask posts are more likely to receive comments, we'll focus our remaining analysis on these posts.

# Our next task is to determine if ask posts created at a certain time are more likely to attract comments.
#
# We'll use the following steps:
# 1. Calculate the number of ask posts created in each hour of the day, along with the number of comments received.
# 2. Calculate the average number of comments ask posts receive by hour created.
#
#

# In[10]:


import datetime as dt

# Pair each ask post's creation timestamp (column 6) with its comment count
# (column 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

counts_by_hour = {}
comments_by_hour = {}

for date_n_time, num_comments in result_list:
    # Reduce the timestamp to its zero-padded hour, e.g. '09' or '15'.
    dt_hour = dt.datetime.strptime(date_n_time, '%m/%d/%Y %H:%M').strftime('%H')
    # Missing hours start from zero, so the first post of an hour yields a
    # count of 1 and a comment total equal to that post's comments.
    counts_by_hour[dt_hour] = counts_by_hour.get(dt_hour, 0) + 1
    comments_by_hour[dt_hour] = comments_by_hour.get(dt_hour, 0) + num_comments

print(counts_by_hour)


# In[11]:


print(comments_by_hour)


# In[12]:


# [hour, average comments per post] for every hour that has ask posts.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in comments_by_hour
]

print(avg_by_hour)


# In[13]:


# Put the average first so that sorting the pairs orders them by average.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

print(swap_avg_by_hour)


# In[14]:


sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)
sorted_swap_first_five = sorted_swap[:5]
print(sorted_swap_first_five)


# In[15]:


template = '{}: {:.2f} average comments per post'

for avg, hr in sorted_swap_first_five:
    # Re-render the two-digit hour as HH:MM for display.
    hr_dt_string = dt.datetime.strptime(hr, '%H').strftime('%H:%M')
    avg_per_post = template.format(hr_dt_string, avg)
    print(avg_per_post)
    # NOTE(review): the flattened source does not show whether this
    # blank-line print sat inside or after the loop -- confirm.
    print('\n')


# My analysis shows that there's a higher chance of receiving comments if you create a post between 15:00 and 21:00 (i.e. 3pm-9pm).
#
# From 15:00, most people have started rounding up business for the day, so it makes sense to believe they've got time for the community until about 21:00 (9pm), when it will be time to go to bed.
#
# Although the 02:00 (2am) mark looks favorable, I wouldn't advise it because it may just be sheer luck, especially since there's no other time frame close to it in the top 5 comments per hour.

# In[ ]: