Import packages

In [57]:
from os import path
from random import randint
import string
import time
import warnings
from warnings import warn

from folium.plugins import HeatMap
from IPython.display import clear_output
from PIL import Image
from stop_words import get_stop_words
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import folium
import geocoder
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

warnings.filterwarnings('ignore')

Data Load

In [36]:
Job_list = pd.read_csv('Job_list_base.CSV', engine='python', encoding='utf-8', index_col=0)

#Some cleaning and manipulation
Job_list['Search_Terms'] = Job_list['Search_Terms'].str.replace('data%20scientist', 'Data Scientist')
Job_list['Search_Terms'] = Job_list['Search_Terms'].str.replace('business%20intelligence', 'Business Intelligence')
Job_list['Search_Terms'] = Job_list['Search_Terms'].str.replace('analyst', 'Analyst')

#Let's separate the data
Job_list['Job_title_raw'] = Job_list['Texts'].str.split("Sign in;", n = 1, expand = True)[1].str.split(";", n = 1, expand = True)[0]
Job_list['Job_title_man'] = Job_list['Job_title_raw'].str.replace(r"[-/&]", " ", regex=True).str.replace(r"\s{2,}", " ", regex=True)
Job_list['Job_title_man'] = Job_list['Job_title_man'].str.strip()
Job_list['Job_title_man'] = Job_list['Job_title_man'].str.replace(r"\(.*?\)", "", regex=True)

#Company and City
Job_list['Company'] = Job_list['Texts'].str.split("Sign in;", n = 1, expand = True)[1].str.split(";", n = 1, expand = True)[1].str.split(";", n = 1, expand = True)[0]
Job_list['City'] = Job_list['Texts'].str.split("Sign in;", n = 1, expand = True)[1].str.split(";", n = 1, expand = True)[1].str.split(";", n = 1, expand = True)[1].str.split(";", n = 1, expand = True)[0].str.split(",", n = 1, expand = True)[0]

#Normalize city spellings (order matters: strip the generic suffixes after the specific fixes)
city_fixes = {
    'Frankfurt am Main': 'Frankfurt',
    'Frankfurt Am Main': 'Frankfurt',
    '10405 Berlin': 'Berlin',
    'Berlino': 'Berlin',
    ' und Umgebung': '',
    ' Area': '',
}
for old, new in city_fixes.items():
    Job_list['City'] = Job_list['City'].str.replace(old, new, regex=False)

#Time posted: keep the trailing "N unit" token and peel off stray suffix characters
Job_list['Time_posted'] = Job_list['Texts'].str.split(" ago;", n = 1, expand = True)[0].str[-8:].str.strip('E;').str.strip('d;').str.strip('y;').str.strip('nd;').str.strip('a;').str.strip('DE;')
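# A hedged alternative: extract the token with one regex instead of chained
# strips (assumes the postings read "... <N> <unit>(s) ago;"):
# Job_list['Time_posted'] = Job_list['Texts'].str.extract(
#     r'(\d+\s*(?:day|week|month|year|d)s?)\s+ago;', expand=False)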

#Applicants
Job_list['Applicants_number'] = Job_list['Texts'].str.split(" ago;", n = 1, expand = True)[1].str.split(";", n = 1, expand = True)[0]
Job_list = Job_list.dropna(subset=['Applicants_number'])

#Description
Job_list['Description'] = Job_list['Texts'].str.split("Report this job;", n = 1, expand = True)[1].str.split(";Seniority level", n = 1, expand = True)[0]
Job_list['Description'] = Job_list['Description'].str.replace("Posted by:;", "")

#Description length
Job_list['Description_words_amount'] = Job_list['Description'].apply(lambda x: len(str(x).split()))
Job_list['Description_len'] = Job_list['Description'].str.len()

#Seniority Level
Job_list['Seniority_level'] = Job_list['Texts'].str.split("Seniority level;", n = 1, expand = True)[1].str.split(";Employment type", n = 1, expand = True)[0]

#Employment Type
Job_list['Employment_type'] = Job_list['Texts'].str.split("Employment type;", n = 1, expand = True)[1].str.split(";Job function", n = 1, expand = True)[0]

#Job Function
Job_list['First_job_function'] = Job_list['Texts'].str.split("Job function;", n = 1, expand = True)[1].str.split(";Industries", n = 1, expand = True)[0].str.split(";", n = 1, expand = True)[0]

#Industry
Job_list['First_industry'] = Job_list['Texts'].str.split("Industries;", n = 1, expand = True)[1].str.split(";Search for more jobs like this;", n = 1, expand = True)[0].str.split(";", n = 1, expand = True)[0]

#Title contains
Job_list['Title_small'] = Job_list['Job_title_raw'].str.lower().str.split().str.join(" ")
Job_list['Title_Data_Scientist'] = Job_list['Title_small'].apply(lambda t: 'data scientist' if 'data scientist' in t else None)
Job_list['Title_Business_Intelligence'] = Job_list['Title_small'].apply(lambda t: 'business intelligence' if 'business intelligence' in t else None)
Job_list['Title_Analyst'] = Job_list['Title_small'].apply(lambda t: 'analyst' if 'analyst' in t else None)
cols = ['Title_Data_Scientist', 'Title_Business_Intelligence', 'Title_Analyst']
Job_list['Titles_combined'] = Job_list[cols].apply(lambda row: ', '.join(row.dropna()), axis=1)
Job_list['Titles_combined'] = Job_list['Titles_combined'].apply(lambda s: " ".join(w.capitalize() for w in s.split()))

#Normalize the cleaned titles to Title Case (capitalize() already lowercases the rest of each word)
Job_list['Job_title_man'] = Job_list['Job_title_man'].apply(lambda s: " ".join(w.capitalize() for w in s.split()))

Job_list.head()
Out[36]:
Job_Links Page_Links Search_Terms Texts Job_title_raw Job_title_man Company City Time_posted Applicants_number ... Description_len Seniority_level Employment_type First_job_function First_industry Title_small Title_Data_Scientist Title_Business_Intelligence Title_Analyst Titles_combined
0 https://www.linkedin.com/jobs/view/1573130365/... https://www.linkedin.com/jobs/search/?geoId=10... Data Scientist function getDfd() {let yFn,nFn;const p=new Pro... Data Scientist Data Scientist Codepan GmbH Berlin 5 days 31 applicants ... 2300 Entry level Full-time Information Technology Information Technology and Services data scientist data scientist None None Data Scientist
1 https://www.linkedin.com/jobs/view/1599153279/... https://www.linkedin.com/jobs/search/?geoId=10... Data Scientist function getDfd() {let yFn,nFn;const p=new Pro... Data Scientist (m/f/d) Data Scientist Free Berlin 2 weeks Be among the first 25 applicants ... 3568 Entry level Full-time Engineering Information Technology and Services data scientist (m/f/d) data scientist None None Data Scientist
2 https://www.linkedin.com/jobs/view/1612674675/... https://www.linkedin.com/jobs/search/?geoId=10... Data Scientist function getDfd() {let yFn,nFn;const p=new Pro... DATA SCIENTIST (M / F) Data Scientist ixto GmbH Berlin 1 week 45 applicants ... 1709 Entry level Part-time Engineering Information Technology and Services data scientist (m / f) data scientist None None Data Scientist
3 https://www.linkedin.com/jobs/view/1569894808/... https://www.linkedin.com/jobs/search/?geoId=10... Data Scientist function getDfd() {let yFn,nFn;const p=new Pro... Data Scientist (m/f/d) Data Scientist Roche Penzberg 3 weeks Be among the first 25 applicants ... 5584 Entry level Full-time Engineering Hospital & Health Care data scientist (m/f/d) data scientist None None Data Scientist
4 https://www.linkedin.com/jobs/view/1524827163/... https://www.linkedin.com/jobs/search/?geoId=10... Data Scientist function getDfd() {let yFn,nFn;const p=new Pro... Data Scientist Data Scientist Babbel Berlin 4 weeks Over 200 applicants ... 3591 Entry level Full-time Information Technology E-Learning data scientist data scientist None None Data Scientist

5 rows × 22 columns
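
The chained str.split calls above rescan Texts once per field. A single str.extract with named groups can pull title, company, and city in one pass; this is only a sketch and assumes every Texts value follows the "Sign in;title;company;city,…" layout seen above:

pattern = r"Sign in;(?P<Job_title_raw>[^;]*);(?P<Company>[^;]*);(?P<City>[^;,]*)"
parsed = Job_list['Texts'].str.extract(pattern)
#Compare parsed against the split-based columns before trusting it on the full data set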

In [4]:
Job_list.to_csv('Job_list_man.CSV',sep=',')

Exploratory Data Analysis

Amount

In [4]:
#Simple amount
Search_term = ["Data Scientist", "Business Intelligence", "Analyst"]
Amount = [3684, 10397, 7720]
df = pd.DataFrame(dict(Search_term=Search_term, Amount=Amount))
df = df.sort_values('Amount', ascending=True).reset_index(drop=True)

#Each sns.set call resets the previous one, so one context + style setup is enough
sns.set_context("talk")
sns.set_style("whitegrid", {'axes.grid': False})
plt.figure(figsize=(8, 8))
plt.xticks(rotation=45)
ax = sns.barplot(x="Search_term", y="Amount", data=df, color="#3182bd", ci=None)
ax.set(ylim=(0, None))
plt.show()
In [5]:
#Simple amount
Search_term = ["Data Scientist, Business Intelligence and Analyst", "Information Technology"]
Amount = [21801, 444170]
df = pd.DataFrame(dict(Search_term=Search_term, Amount=Amount))
df = df.sort_values('Amount', ascending=True).reset_index(drop=True)

sns.set_context("talk")
sns.set_style("whitegrid", {'axes.grid': False})
plt.figure(figsize=(8, 8))
plt.xticks(rotation=45)
ax = sns.barplot(x="Search_term", y="Amount", data=df, color="#3182bd", ci=None)
ax.set(ylim=(0, None))
plt.show()
In [6]:
sns.set_context("talk")
sns.set_style("whitegrid", {'axes.grid': False})
plt.figure(figsize=(8, 8))
plt.xticks(rotation=45)
ax = sns.countplot(x="Search_Terms", data=Job_list, color="#3182bd")
ax.set(ylim=(0, None))
plt.show()

Job Title

In [7]:
Job_list.Job_title_man.value_counts().sort_values(ascending=False)[:10].plot(kind="barh")
plt.tick_params(axis="x", labelsize=10)
plt.xticks(rotation=0)
plt.show()
In [8]:
grouped = pd.crosstab(Job_list.Job_title_man, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped[:10]
Out[8]:
Search_Terms Analyst Business Intelligence Data Scientist Total
Job_title_man
Data Scientist 4 39 259 302
Data Analyst 89 18 54 161
Business Intelligence Analyst 38 60 0 98
Business Analyst 36 15 0 51
Senior Data Scientist 0 2 43 45
Consultant Business Intelligence 0 26 0 26
Data Engineer 1 20 1 22
Business Intelligence Consultant 0 22 0 22
Business Intelligence Developer 0 21 0 21
Digital Data Analyst 11 0 10 21
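
The crosstab → Total → sort recipe above is repeated for every dimension below. A small helper would keep those cells shorter; this is a sketch, and the name crosstab_by_term is mine:

def crosstab_by_term(df, column):
    #Cross-tabulate a column against Search_Terms and sort by overall frequency
    grouped = pd.crosstab(df[column], df.Search_Terms)
    grouped["Total"] = grouped.sum(axis=1)
    return grouped.sort_values("Total", ascending=False)

#e.g. crosstab_by_term(Job_list, "Company")[:10]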

Overlap count

In [9]:
grouped = pd.crosstab(Job_list.Titles_combined, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped
Out[9]:
Search_Terms Analyst Business Intelligence Data Scientist Total
Titles_combined
Analyst 868 155 138 1161
Data Scientist 7 96 715 818
Business Intelligence 4 417 3 424
25 162 98 285
Business Intelligence, Analyst 70 133 6 209
Data Scientist, Analyst 1 5 12 18
Data Scientist, Business Intelligence 0 2 2 4
Data Scientist, Business Intelligence, Analyst 0 3 1 4
In [10]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped, cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[10]:
<seaborn.matrix.ClusterGrid at 0x1b84ca07448>
In [11]:
Job_list[Job_list['Titles_combined']==''].Job_title_man.value_counts().sort_values(ascending=False)[:10].plot(kind="barh")
plt.show()

Company

In [12]:
Job_list.Company.value_counts().sort_values(ascending=False)[:10].plot(kind="barh")
plt.show()
In [13]:
grouped = pd.crosstab(Job_list.Company, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped[:10]
Out[13]:
Search_Terms Analyst Business Intelligence Data Scientist Total
Company
Campusjäger 4 28 29 61
Porsche AG 6 29 10 45
Zalando SE 9 10 12 31
EzCareer 24 3 2 29
JobPilot 22 4 2 28
KeenRecruit 20 5 2 27
Harnham 9 6 8 23
Amazon 8 8 6 22
RetailFocus 15 3 2 20
iPAXX AG 5 5 9 19
In [14]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped[:10], cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[14]:
<seaborn.matrix.ClusterGrid at 0x1b84cfe1d48>

City

In [15]:
Job_list.City.value_counts().sort_values(ascending=False)[:10].plot(kind="barh")
plt.show()
In [16]:
grouped = pd.crosstab(Job_list.City, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped[:10]
Out[16]:
Search_Terms Analyst Business Intelligence Data Scientist Total
City
Berlin 253 234 224 711
Hamburg 146 109 117 372
Munich 110 115 110 335
Frankfurt 108 50 62 220
Düsseldorf 41 39 29 109
Köln 45 27 25 97
Hannover 16 25 31 72
Stuttgart 17 23 27 67
Bonn 4 9 24 37
Leipzig 8 17 9 34
In [17]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped[:10], cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[17]:
<seaborn.matrix.ClusterGrid at 0x1b84cfac848>

Map

In [37]:
list_lat = []   # create empty lists
list_long = []

#BING_MAPS_KEY should be set outside the notebook, e.g. read from an
#environment variable; never hard-code an API key in a shared notebook.
for index, row in Job_list.iterrows(): # iterate over rows in dataframe

    query = str(row['City'])

    results = geocoder.bing(query, key=BING_MAPS_KEY)
    list_lat.append(results.lat)
    list_long.append(results.lng)

# create new columns from lists
Job_list['City_lat'] = list_lat
Job_list['City_lon'] = list_long
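
Geocoding row by row repeats the same city many times. Caching one lookup per unique city cuts the API calls sharply; a sketch under the same assumption that BING_MAPS_KEY is defined outside the notebook:

unique_cities = Job_list['City'].dropna().unique()
coords = {city: geocoder.bing(str(city), key=BING_MAPS_KEY) for city in unique_cities}
Job_list['City_lat'] = Job_list['City'].map(lambda c: coords[c].lat if c in coords else None)
Job_list['City_lon'] = Job_list['City'].map(lambda c: coords[c].lng if c in coords else None)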
In [39]:
Job_list.to_csv('Job_list_man.CSV',sep=',')
In [67]:
#All
Job_list_clean = Job_list.dropna(subset=['City_lat', 'City_lon'])
base_map = folium.Map(location=[51.1633987, 10.4468737], zoom_start=10)
Job_list_clean = Job_list_clean.groupby(['City_lat', 'City_lon']).count().reset_index()
HeatMap(data=Job_list_clean[['City_lat', 'City_lon', 'Job_Links']], radius=15, max_zoom=13).add_to(base_map)
base_map
Out[67]:
In [65]:
#Only Analyst
Job_list_clean = Job_list.dropna(subset=['City_lat', 'City_lon'])
Job_list_clean = Job_list_clean[Job_list_clean.Search_Terms == "Analyst"]
base_map = folium.Map(location=[51.1633987, 10.4468737], zoom_start=10)
Job_list_clean = Job_list_clean.groupby(['City_lat', 'City_lon']).count().reset_index()
HeatMap(data=Job_list_clean[['City_lat', 'City_lon', 'Job_Links']], radius=20, max_zoom=13).add_to(base_map)
base_map
Out[65]:
In [66]:
#Only Business Intelligence
Job_list_clean = Job_list.dropna(subset=['City_lat', 'City_lon'])
Job_list_clean = Job_list_clean[Job_list_clean.Search_Terms == "Business Intelligence"]
base_map = folium.Map(location=[51.1633987, 10.4468737], zoom_start=10)
Job_list_clean = Job_list_clean.groupby(['City_lat', 'City_lon']).count().reset_index()
HeatMap(data=Job_list_clean[['City_lat', 'City_lon', 'Job_Links']], radius=20, max_zoom=13).add_to(base_map)
base_map
Out[66]:
In [61]:
#Only Data Scientist
Job_list_clean = Job_list.dropna(subset=['City_lat', 'City_lon'])
Job_list_clean = Job_list_clean[Job_list_clean.Search_Terms == "Data Scientist"]
base_map = folium.Map(location=[51.1633987, 10.4468737], zoom_start=10)
Job_list_clean = Job_list_clean.groupby(['City_lat', 'City_lon']).count().reset_index()
HeatMap(data=Job_list_clean[['City_lat', 'City_lon', 'Job_Links']], radius=20, max_zoom=13).add_to(base_map)
base_map
Out[61]:
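
The four map cells above share the same steps. A small helper would reduce each to one call; a sketch, where the name term_heatmap is mine:

def term_heatmap(df, term=None, radius=20):
    #Folium heat map of job counts per city, optionally filtered by search term
    clean = df.dropna(subset=['City_lat', 'City_lon'])
    if term is not None:
        clean = clean[clean.Search_Terms == term]
    counts = clean.groupby(['City_lat', 'City_lon']).count().reset_index()
    fmap = folium.Map(location=[51.1633987, 10.4468737], zoom_start=10)
    HeatMap(data=counts[['City_lat', 'City_lon', 'Job_Links']], radius=radius, max_zoom=13).add_to(fmap)
    return fmap

#e.g. term_heatmap(Job_list, "Data Scientist")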

Time posted

In [19]:
#[5:] drops the five most frequent values and plots the rest
Job_list.Time_posted.value_counts().sort_values(ascending=False)[5:].plot(kind="barh")
plt.show()
In [20]:
grouped = pd.crosstab(Job_list.Time_posted, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped[:20]
Out[20]:
Search_Terms Analyst Business Intelligence Data Scientist Total
Time_posted
2 weeks 222 219 180 621
2 months 113 129 144 386
3 weeks 100 106 98 304
1 week 96 81 95 272
1 month 73 95 86 254
4 weeks 62 65 66 193
2 days 64 50 68 182
3 months 40 63 56 159
5 days 76 34 33 143
4 months 22 33 39 94
4 days 22 23 27 72
3 days 26 23 23 72
6 days 16 8 12 36
7 months 6 14 11 31
5 months 4 5 7 16
6 months 5 1 6 12
7 days 4 3 5 12
1 d 2 7 3 12
8 months 3 2 3 8
1 year 0 2 4 6
In [21]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped[:20], cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[21]:
<seaborn.matrix.ClusterGrid at 0x160eebc5518>

Applicants

In [22]:
Job_list.Applicants_number.value_counts().sort_values(ascending=False)[:20].plot(kind="bar")
plt.show()
In [23]:
grouped = pd.crosstab(Job_list.Applicants_number, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped[:20]
Out[23]:
Search_Terms Analyst Business Intelligence Data Scientist Total
Applicants_number
Be among the first 25 applicants 804 867 828 2499
Over 200 applicants 15 11 24 50
25 applicants 6 6 5 17
34 applicants 5 1 7 13
26 applicants 5 5 3 13
45 applicants 3 4 3 10
37 applicants 3 3 3 9
27 applicants 5 2 2 9
29 applicants 4 4 1 9
31 applicants 3 2 4 9
35 applicants 3 2 4 9
32 applicants 2 3 3 8
49 applicants 3 2 2 7
48 applicants 2 2 3 7
30 applicants 3 2 2 7
56 applicants 3 0 4 7
57 applicants 3 3 0 6
42 applicants 4 2 0 6
36 applicants 2 1 2 5
41 applicants 2 1 2 5
In [24]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped[:20], cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[24]:
<seaborn.matrix.ClusterGrid at 0x160ef35ce48>

Seniority level

In [26]:
Job_list.Seniority_level.value_counts().sort_values(ascending=False)[:20].plot(kind="barh")
plt.show()
In [27]:
grouped = pd.crosstab(Job_list.Seniority_level, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped
Out[27]:
Search_Terms Analyst Business Intelligence Data Scientist Total
Seniority_level
Entry level 371 298 624 1293
Associate 469 504 182 1155
Mid-Senior level 65 79 103 247
Not Applicable 54 43 36 133
Internship 13 31 26 70
Director 3 12 2 17
Executive 0 6 2 8
In [28]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped, cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[28]:
<seaborn.matrix.ClusterGrid at 0x160efafcdd8>

Employment type

In [344]:
Job_list.Employment_type.value_counts().sort_values(ascending=False)[:20].plot(kind="bar")
plt.show()
In [345]:
grouped = pd.crosstab(Job_list.Employment_type, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped
Out[345]:
Search_Terms Analyst Business Intelligence Data Scientist Total
Employment_type
Full-time 927 943 937 2807
Contract 21 12 12 45
Internship 11 15 14 40
Part-time 13 3 12 28
Temporary 2 0 0 2
Other 1 0 0 1
In [346]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped, cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[346]:
<seaborn.matrix.ClusterGrid at 0x2cbd4900b38>

Job Function - First Mention

In [65]:
Job_list.First_job_function.value_counts().sort_values(ascending=False)[:20].plot(kind="barh")
plt.show()
In [67]:
grouped = pd.crosstab(Job_list.First_job_function, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped[:20]
Out[67]:
Search_Terms Analyst Business Intelligence Data Scientist Total
First_job_function
Business Development 309 439 16 764
Information Technology 256 190 246 692
Engineering 14 110 540 664
Other 34 35 93 162
Finance 121 7 0 128
Consulting 10 69 19 98
Marketing 61 18 5 84
Research 38 5 20 63
Management 39 13 0 52
Analyst 23 15 5 43
Project Management 7 22 2 31
Design 0 10 10 20
Education 2 16 1 19
Sales 13 3 2 18
Quality Assurance 12 0 2 14
Strategy/Planning 4 7 2 13
General Business 6 3 2 11
Customer Service 5 0 2 7
Supply Chain 3 3 0 6
Administrative 4 2 0 6
In [349]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped[:20], cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[349]:
<seaborn.matrix.ClusterGrid at 0x2cb504b1eb8>

Industry - First Mention

In [69]:
Job_list.First_industry.value_counts().sort_values(ascending=False)[:10].plot(kind="barh")
plt.show()
In [351]:
grouped = pd.crosstab(Job_list.First_industry, Job_list.Search_Terms)
grouped["Total"] = grouped["Analyst"] + grouped["Business Intelligence"] + grouped["Data Scientist"]
grouped = grouped.sort_values('Total', ascending=False)
grouped[:20]
Out[351]:
Search_Terms Analyst Business Intelligence Data Scientist Total
First_industry
Information Technology and Services 348 529 555 1432
Marketing and Advertising 170 132 122 424
Internet 41 39 36 116
Chemicals 65 14 26 105
Computer Software 31 32 31 94
Electrical/Electronic Manufacturing 32 29 32 93
Staffing and Recruiting 21 14 25 60
Banking 42 9 2 53
Management Consulting 17 18 15 50
Financial Services 26 9 10 45
Automotive 8 14 12 34
Construction 13 10 3 26
Pharmaceuticals 4 3 12 19
Retail 3 7 9 19
Logistics and Supply Chain 2 11 6 19
Research 6 8 4 18
Consumer Goods 7 6 4 17
Transportation/Trucking/Railroad 1 7 8 16
Renewables & Environment 9 4 3 16
Food & Beverages 7 7 1 15
In [352]:
grouped = grouped.iloc[:, 0:3]
sns.clustermap(grouped[:20], cmap="Blues", linewidth=.5, standard_scale=0, col_cluster=False, row_cluster=False)
Out[352]:
<seaborn.matrix.ClusterGrid at 0x2cbdf4a7828>

Word Cloud

In [71]:
# Start with one description:
text = Job_list['Description'][5]

# Create and generate a word cloud image:
wordcloud = WordCloud().generate(text)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [72]:
text = " ".join(text for text in Job_list['Description'])
print ("There are {} words in the combination of all descriptions.".format(len(text)))
There are 7972772 words in the combination of all descriptions.
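
len(text) counts characters, not words; for an actual word count, split on whitespace first:

print("There are {} words in the combination of all descriptions.".format(len(text.split())))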
In [73]:
from stop_words import get_stop_words
stop_words_de = get_stop_words('de')
In [76]:
Job_list_Analyst = Job_list[Job_list.Search_Terms == 'Analyst']
text = " ".join(d for d in Job_list_Analyst['Description'])
print("There are {} characters in the combination of all descriptions.".format(len(text)))
# Create stopword list:
stopwords = set(STOPWORDS)
stopwords.update(["Scientist", "Business", "Intelligence", "Analyst", "Science"])
stopwords.update(stop_words_de)

# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

# Display the generated image:
plt.figure(figsize=[10,7])
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
There are 2703549 characters in the combination of all descriptions.
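
ImageColorGenerator and PIL.Image are imported at the top but never used in this section. For completeness, a masked word-cloud sketch; the mask file name is hypothetical:

import numpy as np

mask = np.array(Image.open("germany_mask.png"))  #hypothetical mask image
wordcloud = WordCloud(stopwords=stopwords, background_color="white", mask=mask).generate(text)
image_colors = ImageColorGenerator(mask)

plt.figure(figsize=[10, 7])
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation='bilinear')
plt.axis("off")
plt.show()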