import h2o
h2o.init()
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Craigslist job-titles dataset: two columns — category (enum label) and
# the free-text job title (imported as string so it can be tokenized).
job_titles_path = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"
job_titles = h2o.import_file(
    job_titles_path,
    destination_frame="jobtitles",
    col_names=["category", "jobtitle"],
    col_types=["enum", "string"],
    header=1,
)
# English stop words removed before word2vec training.
# Cleanup: the original list contained duplicate entries ("in", "a", "by",
# "not", "from"); they are removed here, preserving first-occurrence order.
# Only membership matters (the list is consumed via H2OFrame.isin), so the
# filtering semantics are unchanged.
STOP_WORDS = ["ax", "i", "you", "edu", "s", "t", "m", "subject", "can",
              "lines", "re", "what", "there", "all", "we", "one", "the",
              "a", "an", "of", "or", "in", "for", "by", "on", "but", "is",
              "not", "with", "as", "was", "if", "they", "are", "this",
              "and", "it", "have", "from", "at", "my", "be", "that", "to",
              "com", "org", "like", "likes", "so"]
def tokenize(sentences, stop_word=STOP_WORDS):
    """Tokenize an H2O string frame into a single-column frame of cleaned words.

    Splits on non-word characters, lowercases, drops single-character tokens,
    tokens containing digits, and stop words. NA rows are kept throughout:
    h2o's tokenizer uses them to mark sentence boundaries, and word2vec
    training relies on them.

    :param sentences: H2OFrame with one string column of sentences.
    :param stop_word: list of stop words to filter out (defaults to the
        module-level STOP_WORDS; the default list is never mutated).
    :returns: single-column H2OFrame of remaining tokens (with NA separators).
    """
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Keep tokens of >= 2 characters, but preserve NA sentence separators.
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()), :]
    # Drop any token containing a digit.
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]", invert=True, output_logical=True), :]
    # Bug fix: honour the stop_word parameter — the original referenced the
    # global STOP_WORDS here, silently ignoring any caller-supplied list.
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~tokenized_words.isin(stop_word)), :]
    return tokenized_words
def predict(job_title, w2v, gbm):
    """Predict the category of a raw job-title string.

    Tokenizes the title, averages its word vectors with the word2vec model,
    and scores the resulting vector with the GBM.

    :param job_title: list with one job-title string (fed to h2o.H2OFrame).
    :param w2v: trained H2OWord2vecEstimator.
    :param gbm: trained H2OGradientBoostingEstimator.
    :returns: H2OFrame of predictions.

    Bug fix: the original printed the prediction and returned None, so call
    sites doing ``print(predict(...))`` printed the frame followed by a
    spurious ``None``. Returning the frame lets callers decide what to do.
    """
    words = tokenize(h2o.H2OFrame(job_title).ascharacter())
    job_title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    return gbm.predict(test_data=job_title_vec)
print("Break job titles into sequence of words")
words = tokenize(job_titles["jobtitle"])

print("Build word2vec model")
w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
w2v_model.train(training_frame=words)

print("Sanity check - find synonyms for the word 'teacher'")
# Bug fix: the bare expression's result was discarded when run as a script
# (only a REPL would display it), so the announced sanity check showed
# nothing. Print it explicitly.
print(w2v_model.find_synonyms("teacher", count=5))

print("Calculate a vector for each job title")
job_title_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")

print("Prepare training&validation data (keep only job titles made of known words)")
# Titles consisting entirely of out-of-vocabulary words transform to NA
# vectors; drop them before modeling.
valid_job_titles = ~job_title_vecs["C1"].isna()
data = job_titles[valid_job_titles, :].cbind(job_title_vecs[valid_job_titles, :])
data_split = data.split_frame(ratios=[0.8])

print("Build a basic GBM model")
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(x=job_title_vecs.names,
                y="category",
                training_frame=data_split[0],
                validation_frame=data_split[1])

print("Predict!")
print(predict(["school teacher having holidays every month"], w2v_model, gbm_model))
print(predict(["developer with 3+ Java experience, jumping"], w2v_model, gbm_model))
print(predict(["Financial accountant CPA preferred"], w2v_model, gbm_model))