Notebook

So we have json file filled with 2014 hacker news post¶

It contains stories as single keys¶

we will be dealing with following keys¶

created_at: A timestamp of the story's creation time.

created_at_i: A unix epoch timestamp.

url: The URL of the story link.

objectID: The ID of the story.

author: The story's author (username on HN).

points: The number of upvotes the story had.

title: The headline of the post.

num_comments: The number of a comments a post has.

Using this data-set¶

we will be runining sequences basic of NLP task using our Pipeline Class.¶

In [8]:

from datetime import datetime
import json
import io
import string
import csv
from pipeline import build_csv
from pipeline import Pipeline
from stop_words import stop_words

In [9]:

pipeline=Pipeline()

loading of json file into python¶

create pipeline.task() with no args¶

call the the function file_to_json() loads the file and return stories¶

In [10]:

@pipeline.task()
def file_to_json():
    with open('hn_stories_2014.json', 'r') as f:
        data = json.load(f)
        stories = data['stories']
    return stories

#### we need to filter out most popular stories every year 

@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    def is_popular(story): #is_popular will return boolean value
        
        return (story["points"]>50 and story["num_comments"]>1 
                and not story["title"].startswith("ASK HN"))
        
    return (story for story in stories if is_popular(story))

#### now dict to csv conversion is necessary
#### The purpose of translating the dictionaries to a CSV is that 
#### we want to have a consistent data format when running the later 
#### summarizations. By keeping consistent data formats, 
#### each of your pipeline tasks will be adaptable with future task 
#### requirements.

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    lines=[]
    for story in stories:
        lines.append(
            (story['objectID'], datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"), story['url'], story['points'], story['title'])
        )
    return build_csv(lines,header=['objectID','created_at','url',
                                   'points','title'], file=io.StringIO())

##### Once we have extracted the titles of each popular post, 
#### we can then run the next word frequency task.
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    
    return (line[idx] for line in reader)

@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    for title in titles:
        title = title.lower()
        title = ''.join(c for c in title if c not in string.punctuation)
        yield title
        
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    word_freq={}
    for title in titles:
        for word in title.split(" "):
            if word not in stop_words:
                if word not in word_freq:
                    word_freq[word]=1
                word_freq[word]+=1
    return word_freq

@pipeline.task(depends_on=build_keyword_dictionary)
def top_words(keyword_dictionary):
    top_values=sorted(keyword_dictionary.items() ,key=lambda x:x[1],reverse=True)
    return top_values[:100]
                

In [13]:

ran = pipeline.run()
print(ran[top_words])

[('new', 186), ('google', 168), ('', 165), ('ask', 127), ('bitcoin', 103), ('open', 96), ('programming', 93), ('web', 90), ('data', 87), ('video', 80), ('python', 76), ('code', 75), ('released', 72), ('facebook', 72), ('using', 71), ('source', 69), ('2014', 66), ('2013', 66), ('free', 66), ('javascript', 66), ('game', 65), ('internet', 63), ('c', 61), ('microsoft', 60), ('work', 60), ('linux', 59), ('app', 58), ('pdf', 56), ('software', 55), ('language', 55), ('use', 54), ('startup', 53), ('make', 52), ('apple', 51), ('time', 50), ('security', 49), ('yc', 49), ('nsa', 46), ('github', 46), ('windows', 45), ('like', 45), ('way', 43), ('project', 43), ('world', 42), ('developer', 41), ('computer', 41), ('heartbleed', 41), ('users', 41), ('1', 41), ('dont', 39), ('design', 38), ('git', 38), ('ios', 38), ('ceo', 37), ('os', 37), ('big', 37), ('vs', 37), ('twitter', 37), ('online', 37), ('life', 37), ('day', 36), ('apps', 35), ('android', 35), ('years', 35), ('best', 35), ('simple', 34), ('court', 34), ('mt', 34), ('firefox', 33), ('says', 33), ('guide', 33), ('site', 33), ('browser', 33), ('learning', 33), ('api', 33), ('gox', 33), ('problem', 32), ('server', 32), ('mozilla', 32), ('fast', 32), ('engine', 32), ('does', 31), ('better', 31), ('introducing', 31), ('text', 31), ('amazon', 31), ('year', 31), ('support', 30), ('tech', 30), ('stop', 30), ('million', 30), ('money', 30), ('people', 30), ('built', 30), ('learn', 29), ('developers', 29), ('did', 29), ('development', 29), ('3', 29), ('help', 29)]

In [ ]: