#!/usr/bin/env python
# coding: utf-8
"""Categorize Craigslist job titles with H2O.

Pipeline: tokenize job titles -> train a word2vec model on the tokens ->
average the word vectors of each title -> train a GBM classifier on the
averaged vectors -> predict the category of new, unseen job titles.
"""

import h2o
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Common English stop words dropped before training word2vec.
# NOTE: the original list contained duplicates ("in", "a", "by", "not",
# "from"); deduplicated here — membership filtering is unaffected.
STOP_WORDS = ["ax", "i", "you", "edu", "s", "t", "m", "subject", "can",
              "lines", "re", "what", "there", "all", "we", "one", "the",
              "a", "an", "of", "or", "in", "for", "by", "on", "but", "is",
              "not", "with", "as", "was", "if", "they", "are", "this",
              "and", "it", "have", "from", "at", "my", "be", "that", "to",
              "com", "org", "like", "likes", "so"]


def tokenize(sentences, stop_word=STOP_WORDS):
    """Split an H2O string column into filtered, lower-case word tokens.

    Keeps tokens of at least two characters, drops tokens containing
    digits, and removes stop words.  NA rows are deliberately preserved:
    word2vec uses them as sentence separators.

    :param sentences: H2OFrame string column of raw sentences.
    :param stop_word: list of stop words to filter out (defaults to
        ``STOP_WORDS``).
    :returns: single-column H2OFrame of remaining tokens.
    """
    tokenized = sentences.tokenize(r"\W+")
    tokenized_lower = tokenized.tolower()
    # Keep tokens of >= 2 characters; keep NAs (sentence boundaries).
    tokenized_filtered = tokenized_lower[
        (tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()), :]
    # Drop any token that contains a digit.
    tokenized_words = tokenized_filtered[
        tokenized_filtered.grep("[0-9]", invert=True, output_logical=True), :]
    # BUG FIX: the original ignored the `stop_word` parameter and always
    # filtered against the global STOP_WORDS.
    tokenized_words = tokenized_words[
        (tokenized_words.isna()) | (~tokenized_words.isin(stop_word)), :]
    return tokenized_words


def predict(job_title, w2v, gbm):
    """Predict the category of one or more raw job-title strings.

    :param job_title: list of job-title strings.
    :param w2v: trained H2OWord2vecEstimator.
    :param gbm: trained H2OGradientBoostingEstimator.
    :returns: H2OFrame with the predicted category per title.
    """
    words = tokenize(h2o.H2OFrame(job_title).ascharacter())
    job_title_vec = w2v.transform(words, aggregate_method="AVERAGE")
    # BUG FIX: return the prediction instead of printing it and returning
    # None — every caller wraps this in print(), which previously also
    # printed a stray "None".
    return gbm.predict(test_data=job_title_vec)


def main():
    """Run the full train-and-predict pipeline."""
    h2o.init()

    job_titles_path = ("https://s3.amazonaws.com/h2o-public-test-data/"
                       "smalldata/craigslistJobTitles.csv")
    job_titles = h2o.import_file(job_titles_path,
                                 destination_frame="jobtitles",
                                 col_names=["category", "jobtitle"],
                                 col_types=["enum", "string"],
                                 header=1)

    print("Break job titles into sequence of words")
    words = tokenize(job_titles["jobtitle"])

    print("Build word2vec model")
    w2v_model = H2OWord2vecEstimator(sent_sample_rate=0.0, epochs=10)
    w2v_model.train(training_frame=words)

    print("Sanity check - find synonyms for the word 'teacher'")
    # BUG FIX: print the synonym table; as a bare expression in a script
    # (unlike a notebook cell) the result was silently discarded.
    print(w2v_model.find_synonyms("teacher", count=5))

    print("Calculate a vector for each job title")
    job_title_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")

    # Titles made only of unknown/filtered words produce an all-NA vector;
    # drop them so the GBM sees no missing feature rows.
    print("Prepare training & validation data "
          "(keep only job titles made of known words)")
    valid_job_titles = ~job_title_vecs["C1"].isna()
    data = job_titles[valid_job_titles, :].cbind(
        job_title_vecs[valid_job_titles, :])
    data_split = data.split_frame(ratios=[0.8])

    print("Build a basic GBM model")
    gbm_model = H2OGradientBoostingEstimator()
    gbm_model.train(x=job_title_vecs.names,
                    y="category",
                    training_frame=data_split[0],
                    validation_frame=data_split[1])

    print("Predict!")
    print(predict(["school teacher having holidays every month"],
                  w2v_model, gbm_model))
    print(predict(["developer with 3+ Java experience, jumping"],
                  w2v_model, gbm_model))
    print(predict(["Financial accountant CPA preferred"],
                  w2v_model, gbm_model))


if __name__ == "__main__":
    main()