Simple web scraping with BeautifulSoup4 and NLP with DistilBERT
!pip install -q requests beautifulsoup4
!pip install -U sentence-transformers
import time
import csv
import re
import numpy as np
import pandas as pd
import requests
import bs4
import lxml.etree as xml
import pprint
from scipy.spatial.distance import cosine, cdist
import nltk
nltk.download('punkt')
from spacy.lang.en import English
nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
%reload_ext google.colab.data_table
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date!
URLs = ["https://www.flexjobs.com/blog/post/job-search-strategies-for-success-v2/",
"https://www.best-job-interview.com/job-search-strategy.html",
"https://content.wisestep.com/job-search-strategies/",
"https://www.thebalancecareers.com/top-strategies-for-a-successful-job-search-2060714",
"https://www.monster.com/career-advice/article/a-winning-job-search-strategy",
"https://interviewdoctor.com/testimonials/",
"https://www.telenor.com/10-tips-for-job-hunting-in-the-digital-age/",
"https://www.monster.com/career-advice/article/five-ps-of-job-search-progress",
]
requests.get(URLs[7])
<Response [200]>
df = pd.DataFrame(columns=['title','text'])
i = 0
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(name="article", attrs={"class": "single-post-page"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"h2","p"})])
df.loc[i,'text'] = article
i = 1
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(attrs={"id": "ContentColumn"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"span","h2","p"})])
df.loc[i,'text'] = article
i = 2
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(attrs={"class": "td-ss-main-content"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"span","h2","p"})])
df.loc[i,'text'] = article
i = 3
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(attrs={"id": "list-sc_1-0"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"h2","p"})])
df.loc[i,'text'] = article
i = 4
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(attrs={"id": "mainContent"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"h2","p"})])
df.loc[i,'text'] = article
i = 5
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(attrs={"class": "site-inner"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"blockquote"})])
df.loc[i,'text'] = article
i = 6
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(attrs={"id": "primary"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"p","ol"})])
df.loc[i,'text'] = article
i = 7
web_page = bs4.BeautifulSoup(requests.get(URLs[i], {}).text, "lxml")
df.loc[i,'title'] = web_page.head.title.text
sub_web_page = web_page.find_all(attrs={"class": "article-content"})[0]
article = '. '.join([wp.text for wp in sub_web_page.find_all({"p","h2"})])
df.loc[i,'text'] = article
df = df.dropna().reset_index(drop=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8 entries, 0 to 7 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 8 non-null object 1 text 8 non-null object dtypes: object(2) memory usage: 256.0+ bytes
df
title | text | |
---|---|---|
0 | 7 Job Search Strategies For Landing Your Next ... | Job-hunting can be a tedious task, and it can ... |
1 | Successful Job Search Strategy 2020 | Home. Job Search Strategy. A good job search s... |
2 | 20 Effective or Successful Job Search Strategi... | Just like most of the things, even the process... |
3 | Top 10 Strategies for a Successful Job Search | \nJob searching isn't just about applying for ... |
4 | Job Hunting | Monster.com | Trying to land the right job? Learn how to cre... |
5 | Testimonials of Career and Job Search Coach | I found a new role, but before applying I rea... |
6 | 10 Tips for Job Hunting in the Digital Age - T... | On the hunt for a job in this digital age? Tel... |
7 | Job Hunting | Monster.com | If you’re feeling directionless, mastering the... |
def tokenize(x):
return nltk.sent_tokenize(x)
def spacy_tokenize(x):
doc = nlp(x)
return list(doc.sents)
def sentenize(temp, col = 'text'):
s = temp.apply(lambda x: pd.Series(x[col]),axis=1).stack().reset_index(level=1, drop=True)
s.name = col
temp = temp.drop(col, axis=1).join(s)
return temp
temp = df[['text']].copy()
temp.loc[:,'text'] = temp.text.apply(lambda x: re.sub(r'\.+', ".", x))
temp.loc[:,'text'] = temp['text'].apply(tokenize)
temp = sentenize(temp,'text')
temp.reset_index(inplace=True)
temp.columns = ['para_id','text']
temp.loc[:,'text'] = temp['text'].apply(spacy_tokenize)
temp = sentenize(temp,'text')
temp.reset_index(drop=True, inplace=True)
temp = temp.dropna()
temp.loc[:,'text'] = temp.text.apply(lambda x: x.text.lower())
temp.loc[:,'text'] = temp['text'].str.replace("[^a-zA-Z0-9]", " ")
temp.loc[:,'text'] = temp['text'].dropna()
temp = temp[temp['text'].str.split().str.len().gt(3)]
temp = temp.drop_duplicates(subset=['text'], keep='first')
temp = temp.reset_index(drop=True)
temp
para_id | text | |
---|---|---|
0 | 0 | job hunting can be a tedious task and it can ... |
1 | 0 | check out these job search strategies that can... |
2 | 0 | job hunting can be a tedious task and without... |
3 | 0 | how do you continue to be a focused job seeker... |
4 | 0 | create a job hunting strategy |
... | ... | ... |
515 | 7 | could you use some help |
516 | 7 | join monster for free today |
517 | 7 | as a member you can upload up to five version... |
518 | 7 | additionally you can sign up for job alerts s... |
519 | 7 | let monster help you get focused and get hired |
520 rows × 2 columns
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')
corpus = temp.text.tolist()
corpus_embeddings = embedder.encode(corpus)
100%|██████████| 245M/245M [00:18<00:00, 13.2MB/s]
queries = ['customize resume']
query_embeddings = embedder.encode(queries)
for query, query_embedding in zip(queries, query_embeddings):
distances = cdist([query_embedding], corpus_embeddings, "cosine")[0]
topn_index = distances.argsort()[:5][::-1]
print('Query:', query)
print('Top 5 most similar sentences in corpus:')
for i in topn_index:
pprint.pprint("{} (Score: {})".format(corpus[i], distances[i]))
Query: customize resume Top 5 most similar sentences in corpus: 'ace the job interview (Score: 0.31260448499322224)' ('prepare visual or video resume generate linkedin profiles etc (Score: ' '0.30055823636076495)') 'fill out your employment history and add skills (Score: 0.2733115555152831)' 'customize your resume and cover letter (Score: 0.14002152061573192)' 'customize your resume and cover letter (Score: 0.14002152061573192)'
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
df = pd.DataFrame(data={"text":corpus, "cluster":cluster_assignment})
df
text | cluster | |
---|---|---|
0 | job hunting can be a tedious task and it can ... | 2 |
1 | check out these job search strategies that can... | 13 |
2 | job hunting can be a tedious task and without... | 2 |
3 | how do you continue to be a focused job seeker... | 2 |
4 | create a job hunting strategy | 1 |
... | ... | ... |
515 | could you use some help | 10 |
516 | join monster for free today | 4 |
517 | as a member you can upload up to five version... | 18 |
518 | additionally you can sign up for job alerts s... | 8 |
519 | let monster help you get focused and get hired | 1 |
520 rows × 2 columns
c = 0
df.loc[df.cluster==c,:]
text | cluster | |
---|---|---|
31 | it s a unique way to expand your network and p... | 0 |
46 | using large job boards can provide a great way... | 0 |
56 | while networking is always a useful job search... | 0 |
80 | google alerts will save you time and keep you ... | 0 |
88 | there are some definite benefits to sending yo... | 0 |
129 | jotting such things down will help you recogni... | 0 |
158 | you can browse these job boards in order to fi... | 0 |
166 | attending different career fairs is one of the... | 0 |
172 | your mobile can be your best way to land your ... | 0 |
173 | most of the reputed job boards offer mobile ap... | 0 |
178 | networking is still an effective job search st... | 0 |
182 | rank well on google | 0 |
183 | ranking on google is essential not just for a ... | 0 |
191 | such jobs are great to develop new experiences... | 0 |
203 | if you want to land to a reputed job then hav... | 0 |
205 | sharing positive information about your indust... | 0 |
241 | narrowing your search criteria will save time ... | 0 |
250 | rank well on google | 0 |
277 | implementing a diverse job hunting strategy ca... | 0 |
278 | here are seven smart tactics you can use to tr... | 0 |
309 | taking a temp job can help get your foot in th... | 0 |
408 | i ve put together a list of the top ten things... | 0 |
471 | job hunting can feel like an epic journey with... | 0 |
495 | for example attending at least one networking... | 0 |
c = 1
df.loc[df.cluster==c,:]
text | cluster | |
---|---|---|
4 | create a job hunting strategy | 1 |
13 | a portfolio is essentially samples of your work | 1 |
27 | include reasons for why you want to work at th... | 1 |
28 | tell them what they will stand to gain if they... | 1 |
40 | fill out your employment history and add skills | 1 |
... | ... | ... |
504 | while you re busy job hunting remember also t... | 1 |
507 | try to relate some of your personality traits ... | 1 |
508 | so for a customer service job you might say ... | 1 |
512 | power up your job search | 1 |
519 | let monster help you get focused and get hired | 1 |
68 rows × 2 columns
c = 6
df.loc[df.cluster==c,:]
text | cluster | |
---|---|---|
19 | customize your resume and cover letter | 6 |
35 | optimize your linkedin profile | 6 |
43 | use hashtags to make your posts searchable | 6 |
50 | flexjobs for example focuses on flexible and... | 6 |
52 | use these sites to make your search more fruit... | 6 |
54 | take advantage of networking opportunities | 6 |
63 | photo credit bigstockphoto com | 6 |
170 | use your mobile to search for the job | 6 |
171 | don t limit the usage of your mobile to playin... | 6 |
176 | allow your network to work for you | 6 |
184 | prepare visual or video resume generate linke... | 6 |
201 | make sure you have a clean online reputation | 6 |
202 | if there is any content online that reflects p... | 6 |
242 | customize your resume and cover letter | 6 |
252 | use your name for the url if possible | 6 |
310 | let monster pick up some of the slack | 6 |
312 | but did you know monster can also help bring j... | 6 |
396 | telenor group s chief people officer jon erik... | 6 |
418 | because they will google you | 6 |
419 | get ahead and find out what they will uncover ... | 6 |
420 | if some strange images come up in the search ... | 6 |
421 | and remember that it is possible to make cert... | 6 |
422 | tap into your network | 6 |
428 | of course you should be following them on soci... | 6 |
439 | maybe video is your thing maybe an online pho... | 6 |
447 | come up with stories from your experience that... | 6 |
502 | or even better record a video of yourself so... | 6 |