import os

import pandas as pd
import numpy as np
import seaborn as sns
from palmerpenguins import load_penguins
import holoviews as hv
import hvplot.pandas
import panel as pn
import folium
from folium.plugins import MarkerCluster
import pycountry
from geopy.geocoders import Nominatim
from pylab import *
# Select Bokeh as the HoloViews/hvPlot rendering backend.
hv.extension('bokeh')
# Initialise Panel with its default configuration.
pn.extension()
# Load the Palmer Penguins dataset via palmerpenguins.load_penguins().
dfPenguins = load_penguins()
# Preview the first rows (displayed as the notebook cell output).
dfPenguins.head()
species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
---|---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | male | 2007 |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | female | 2007 |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | female | 2007 |
3 | Adelie | Torgersen | NaN | NaN | NaN | NaN | NaN | 2007 |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | female | 2007 |
# Every column except the first and the last is offered as an axis choice.
columns = dfPenguins.columns[1:-1].tolist()

# Drop-down selectors for the two scatter-plot axes.
x = pn.widgets.Select(name='x', options=columns, value='bill_length_mm')
y = pn.widgets.Select(name='y', options=columns, value='flipper_length_mm')

# Controls on the left; on the right a scatter plot bound to both selectors,
# coloured by species.
_penguin_controls = pn.Column('## Penguins', x, y)
_penguin_scatter = pn.bind(dfPenguins.hvplot.scatter, x, y, by='species')
pn.Row(_penguin_controls, _penguin_scatter)
# Load the salaries dataset, using the first CSV column as the row index.
jobs = pd.read_csv('ds_salaries.csv', index_col=0)

# Replace the abbreviated category codes with readable labels, column by column.
_LABELS_BY_COLUMN = {
    'experience_level': {
        'SE': 'Senior',
        'MI': 'Intermediate',
        'EN': 'Junior',
        'EX': 'Executive',
    },
    'employment_type': {
        'PT': 'Part-time',
        'FT': 'Full-time',
        'CT': 'Contract',
        'FL': 'Freelance',
    },
    'company_size': {
        'M': 'Medium',
        'L': 'Large',
        'S': 'Small',
    },
}
for _column, _labels in _LABELS_BY_COLUMN.items():
    jobs[_column] = jobs[_column].map(_labels)

# Remove the local-currency salary columns; `salary_in_usd` stays.
jobs.drop(columns=['salary', 'salary_currency'], inplace=True)
jobs.head()
work_year | experience_level | employment_type | job_title | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | |
---|---|---|---|---|---|---|---|---|---|
0 | 2020 | Intermediate | Full-time | Data Scientist | 79833 | DE | 0 | DE | Large |
1 | 2020 | Senior | Full-time | Machine Learning Scientist | 260000 | JP | 0 | JP | Small |
2 | 2020 | Senior | Full-time | Big Data Engineer | 109024 | GB | 50 | GB | Medium |
3 | 2020 | Intermediate | Full-time | Product Data Analyst | 20000 | HN | 0 | HN | Small |
4 | 2020 | Senior | Full-time | Machine Learning Engineer | 150000 | US | 50 | US | Large |
# Nominatim geocoder; the user agent is taken from the `email` environment
# variable.
# NOTE(review): if `email` is unset the user agent is None -- confirm that the
# installed geopy version accepts that.
geolocator = Nominatim(user_agent=os.environ.get('email'))


def geolocate(country):
    """Return ``(latitude, longitude)`` for *country*, or ``np.nan`` on failure.

    ``np.nan`` is returned both when the geocoder raises (best-effort lookup)
    and when it finds no match for the query.
    """
    try:
        loc = geolocator.geocode(country)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return np.nan
    # ``geocode`` returns None when nothing matches the query.
    if loc is None:
        return np.nan
    return (loc.latitude, loc.longitude)
def aggregate(country):
    """Return the number of jobs and the mean USD salary for a company location.

    The mean is rounded to two decimal places; rows are selected by comparing
    `company_location` with *country*.
    """
    located_here = jobs['company_location'] == country
    salaries = jobs.loc[located_here, 'salary_in_usd']
    return len(salaries), round(salaries.mean(), 2)
# Find unique countries (company locations and employee residences combined)
countries = list(set(jobs['company_location']) | set(jobs['employee_residence']))
# Create separate df for country locations
locationsDf = pd.DataFrame(countries, columns=["Countries"])
# One colour per country. The palette is sized from the data instead of a
# hard-coded 59, so the column assignment below cannot fail with a length
# mismatch when the dataset contains a different number of countries.
# `pyplot.get_cmap` is used because `cm.get_cmap` is deprecated/removed in
# newer matplotlib releases.
cmap = matplotlib.pyplot.get_cmap('Spectral', len(countries))
colors = [matplotlib.colors.rgb2hex(cmap(i)) for i in range(cmap.N)]
locationsDf["colors"] = colors
locationsDf.head()
Countries | colors | |
---|---|---|
0 | MD | #9e0142 |
1 | IL | #a70c44 |
2 | PK | #b11646 |
3 | IR | #ba2149 |
4 | KE | #c42b4b |
# Fixed colour for each experience level.
colors = dict(
    Intermediate='#1f77b4',
    Senior='#ff7f0e',
    Junior='#2ca02c',
    Executive='#324d67',
)

# Box plot of USD salary grouped (and coloured) by experience level.
_salary_box = jobs.hvplot.box(
    'salary_in_usd',
    by=["experience_level"],
    c='experience_level',
    cmap=colors,
    width=600,
    height=350,
    legend=False,
    yformatter='%.0f',
)
box = _salary_box.opts(xlabel="Experience Level", ylabel="Salary (USD)")
box
def plot_bars1(year):
    """Return a bar chart of mean USD salary per experience level for *year*.

    Rows of `jobs` are filtered on `work_year == year`; the bars are aggregated
    with `np.mean` and coloured using the module-level `colors` mapping.
    """
    year_df = jobs[jobs['work_year'] == year]
    bars = year_df.hvplot.bar(
        'experience_level', 'salary_in_usd', c='experience_level',
        cmap=colors, height=350, width=600, legend=False,
        yformatter='%.0f',
    ).aggregate(function=np.mean)
    return bars.opts(
        xlabel="Experience Level",
        ylabel="Avg Salary (USD)",
        # The title follows the requested year (it used to be fixed to 2021).
        title=f"Average Salary by Experience Level in {year}",
    )


plot_bars1(2021)
# Re-initialise Panel with a fixed default sizing mode.
pn.extension(sizing_mode='fixed')

# Integer slider for picking a year between 2020 and 2022; both `value` and
# `value_throttled` start at 2020.
year = pn.widgets.IntSlider(
    name='Year Slider',
    start=2020,
    end=2022,
    step=1,
    value=2020,
    value_throttled=2020,
    width=200,
)
year
@pn.depends(year.param.value_throttled)
def year_selected(year):
    """Return a Markdown heading naming the year passed in."""
    return f'### Jobs in {year}'


pn.Row(year_selected)
# Country code -> hex colour lookup built from `locationsDf`.
country_colors_dict = dict(zip(locationsDf['Countries'], locationsDf['colors']))


@pn.depends(year.param.value_throttled)
def plot_bars2(year):
    """Plot the ten highest mean USD salaries by employee residence for *year*."""
    year_df = jobs[jobs['work_year'] == year]
    # Aggregate only the salary column: selecting the string grouping column
    # as well makes `.mean()` raise TypeError on recent pandas versions.
    df = (
        year_df.groupby('employee_residence')[['salary_in_usd']]
        .mean()
        .sort_values('salary_in_usd', ascending=False)
        .round(2)
        .head(10)
    )
    # Expose the group key as a regular column so it can be used for x and colour.
    df['employee_residence'] = df.index
    return df.hvplot.bar(
        x='employee_residence', y='salary_in_usd', c='employee_residence',
        cmap=country_colors_dict, min_height=250, min_width=400,
        legend=False, yformatter='%.0f', responsive=True,
    ).opts(xlabel="Employee Residence", ylabel="Avg Salary (USD)")


plot_bars2(2021)
@pn.depends(year.param.value_throttled)
def plot_bars3(year):
    """Plot the ten highest mean USD salaries by company location for *year*."""
    year_df = jobs[jobs['work_year'] == year]
    # Aggregate only the salary column: selecting the string grouping column
    # as well makes `.mean()` raise TypeError on recent pandas versions.
    df = (
        year_df.groupby('company_location')[['salary_in_usd']]
        .mean()
        .sort_values('salary_in_usd', ascending=False)
        .round(3)
        .head(10)
    )
    # Expose the group key as a regular column so it can be used for x and colour.
    df['company_location'] = df.index
    return df.hvplot.bar(
        x='company_location', y='salary_in_usd',
        c='company_location', cmap=country_colors_dict,
        min_height=250, min_width=400, legend=False, yformatter='%.0f',
        responsive=True,
    ).opts(xlabel="Company Location", ylabel="Avg Salary (USD)")


plot_bars3(2021)
@pn.depends(year.param.value_throttled)
def plot_bars4(year):
    """Plot the mean USD salary by company size for *year*, highest first."""
    year_df = jobs[jobs['work_year'] == year]
    # Aggregate only the salary column: selecting the string grouping column
    # as well makes `.mean()` raise TypeError on recent pandas versions.
    df = (
        year_df.groupby('company_size')[['salary_in_usd']]
        .mean()
        .sort_values('salary_in_usd', ascending=False)
        .round(2)
        .head(10)
    )
    # Expose the group key as a regular column so it can be used for x and colour.
    df['company_size'] = df.index
    # Pair each company size present with a colour from the Category10 cycle.
    colors_dict = dict(zip(df['company_size'], hv.Cycle('Category10').values))
    return df.hvplot.bar(
        x='company_size', y='salary_in_usd',
        c='company_size', cmap=colors_dict,
        min_height=250, min_width=400, legend=False, yformatter='%.0f',
        responsive=True,
    ).opts(xlabel="Company Size", ylabel="Avg Salary (USD)")


plot_bars4(2021)
# Fixed colour for each experience level.
colors = dict(
    Intermediate='#1f77b4',
    Senior='#ff7f0e',
    Junior='#2ca02c',
    Executive='#324d67',
)


@pn.depends(year.param.value_throttled)
def plot_bars1(year):
    """Return a responsive bar chart of mean USD salary per experience level.

    Only rows of `jobs` whose `work_year` equals *year* are plotted.
    """
    selected = jobs[jobs['work_year'] == year]
    raw_bars = selected.hvplot.bar(
        'experience_level',
        'salary_in_usd',
        c='experience_level',
        cmap=colors,
        responsive=True,
        min_height=250,
        min_width=400,
        legend=False,
        yformatter='%.0f',
    )
    mean_bars = raw_bars.aggregate(function=np.mean)
    return mean_bars.opts(xlabel="Experience Level", ylabel="Avg Salary (USD)")
# Top row: heading for the selected year next to the year slider.
_header_row = pn.Row(year_selected, year)
# Middle row: salary by experience level and by company size.
_experience_size_row = pn.Row(pn.bind(plot_bars1, year), pn.bind(plot_bars4, year))
# Bottom row: salary by employee residence and by company location.
_residence_location_row = pn.Row(pn.bind(plot_bars2, year), pn.bind(plot_bars3, year))

# Stack the three rows in a column and wrap it in a widget box.
plots_box = pn.WidgetBox(
    pn.Column(
        _header_row,
        _experience_size_row,
        _residence_location_row,
        align="start",
        sizing_mode="stretch_width",
    )
)
dashboard = pn.Row(plots_box, sizing_mode="stretch_width")
dashboard