import h2o
h2o.init()
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
# Craigslist job-titles dataset: two columns — category (enum label) and
# the free-text job title (imported as string so it can be tokenized).
job_titles_path = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"
job_titles = h2o.import_file(
    job_titles_path,
    destination_frame="jobtitles",
    col_names=["category", "jobtitle"],
    col_types=["enum", "string"],
    header=1,
)
# English stop words removed before word2vec training.
# Cleanup: the original list contained duplicate entries ("in", "a", "by",
# "not", "from"); they are removed here, preserving first-occurrence order.
# Only membership matters (the list is consumed via H2OFrame.isin), so the
# filtering semantics are unchanged.
STOP_WORDS = ["ax", "i", "you", "edu", "s", "t", "m", "subject", "can",
              "lines", "re", "what", "there", "all", "we", "one", "the",
              "a", "an", "of", "or", "in", "for", "by", "on", "but", "is",
              "not", "with", "as", "was", "if", "they", "are", "this",
              "and", "it", "have", "from", "at", "my", "be", "that", "to",
              "com", "org", "like", "likes", "so"]
def tokenize(sentences, stop_word=STOP_WORDS):
    """Tokenize an H2O string frame into a single-column frame of cleaned words.

    Splits on non-word characters, lowercases, drops single-character tokens,
    tokens containing digits, and stop words. NA rows are kept throughout:
    h2o's tokenizer uses them to mark sentence boundaries, and word2vec
    training relies on them.

    :param sentences: H2OFrame with one string column of sentences.
    :param stop_word: list of stop words to filter out (defaults to the
        module-level STOP_WORDS; the default list is never mutated).
    :returns: single-column H2OFrame of remaining tokens (with NA separators).
    """
    tokenized = sentences.tokenize("\\W+")
    tokenized_lower = tokenized.tolower()
    # Keep tokens of >= 2 characters, but preserve NA sentence separators.
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()), :]
    # Drop any token containing a digit.
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]", invert=True, output_logical=True), :]
    # Bug fix: honour the stop_word parameter — the original referenced the
    # global STOP_WORDS here, silently ignoring any caller-supplied list.
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~tokenized_words.isin(stop_word)), :]
    return tokenized_words
def predict(job_title, w2v, gbm):
    """Predict the category of a raw job-title string.

    Tokenizes the title, averages its word vectors with the word2vec model,
    and scores the resulting vector with the GBM.

    :param job_title: list with one job-title string (fed to h2o.H2OFrame).
    :param w2v: trained H2OWord2vecEstimator.
    :param gbm: trained H2OGradientBoostingEstimator.
    :returns: H2OFrame of predictions.

    Bug fix: the original printed the prediction and returned None, so call
    sites doing ``print(predict(...))`` printed the frame followed by a
    spurious ``None``. Returning the frame lets callers decide what to do.
    """
    words = tokenize(h2o.H2OFrame(job_title).ascharacter())
    job_title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    return gbm.predict(test_data=job_title_vec)
print("Break job titles into sequence of words")
words = tokenize(job_titles["jobtitle"])

print("Build word2vec model")
w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
w2v_model.train(training_frame=words)

print("Sanity check - find synonyms for the word 'teacher'")
# Bug fix: the bare expression's result was discarded when run as a script
# (only a REPL would display it), so the announced sanity check showed
# nothing. Print it explicitly.
print(w2v_model.find_synonyms("teacher", count=5))

print("Calculate a vector for each job title")
job_title_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")

print("Prepare training&validation data (keep only job titles made of known words)")
# Titles consisting entirely of out-of-vocabulary words transform to NA
# vectors; drop them before modeling.
valid_job_titles = ~job_title_vecs["C1"].isna()
data = job_titles[valid_job_titles, :].cbind(job_title_vecs[valid_job_titles, :])
data_split = data.split_frame(ratios=[0.8])

print("Build a basic GBM model")
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(x=job_title_vecs.names,
                y="category",
                training_frame=data_split[0],
                validation_frame=data_split[1])

print("Predict!")
print(predict(["school teacher having holidays every month"], w2v_model, gbm_model))
print(predict(["developer with 3+ Java experience, jumping"], w2v_model, gbm_model))
print(predict(["Financial accountant CPA preferred"], w2v_model, gbm_model))