#!/usr/bin/env python
# coding: utf-8
# # IBM Data Science Capstone Project
# ## New York City Current Job Postings Exploration
# ### by Nuttaphat Arunoprayoch
#
# ### Objectives
# Looking for a job is challenging and time consuming, and the search does not always end as well as expected. Therefore, in order to plan a career effectively, one must know which career path to take. This report analyses open job positions in New York City, USA, in the hope that it may be useful both individually and socially.
#
# ### Data
# The data was derived from https://www.kaggle.com/new-york-city/new-york-city-current-job-postings; its metadata can be found at the same link.
#
# ## Import Libraries and Dataset
# In[68]:
# Standard Libraries
import requests
import calendar
from dateutil import parser
import json  # requests is already imported above
from IPython.display import display, HTML, clear_output, IFrame
# Data Manipulation
from pandas.io.json import json_normalize  # available as pandas.json_normalize in newer pandas releases
import pandas as pd
import numpy as np
import pandas_profiling
# Data Visualization
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pywaffle import Waffle
mpl.style.use(['ggplot']) # optional: for ggplot-like style
sns.set(rc={'figure.figsize':(11.7,8.27)})
# In[69]:
# Hide code cells for clean presentation and import display table
HTML('''
''')
# In[2]:
# Geolocation Libraries
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import folium
# In[70]:
# Set Foursquare credentials (placeholders; replace with your own keys)
CLIENT_ID = '#' # your Foursquare ID
CLIENT_SECRET = '#' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
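# The credentials above are placeholders. One hedged alternative, assuming the keys are exported as environment variables named FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET (hypothetical names), is to read them at runtime so they never appear in the notebook:
# In[ ]:
import os
# Hypothetical variable names; fall back to the placeholders above if they are not set
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', CLIENT_ID)
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET)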
# In[4]:
df = pd.read_csv('dataset/new-york-city-current-job-postings/nyc-jobs.csv')
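# For reference, a sketch of how the CSV above could be fetched programmatically with the Kaggle API, assuming the kaggle package is installed and an API token is configured under ~/.kaggle/ (otherwise download it manually from the link in the Data section):
# In[ ]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()  # requires a valid ~/.kaggle/kaggle.json token
api.dataset_download_files('new-york-city/new-york-city-current-job-postings',
                           path='dataset/new-york-city-current-job-postings', unzip=True)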
# In[5]:
df.head()
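# pandas_profiling is imported above but otherwise unused; a quick automated overview of the raw table could be generated as sketched below (this assumes a pandas-profiling version that exposes ProfileReport, and the report can take a while on wide data):
# In[ ]:
profile = pandas_profiling.ProfileReport(df)  # one-line exploratory report (sketch)
profile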
#
#
# ## Data pre-processing and Data Analysis
# In[13]:
df.hist()  # quick histograms of the numeric columns
# In[7]:
headers = df.columns.tolist()
# headers
# In[8]:
# Select only Annual Paying
df = df[df['Salary Frequency'] == 'Annual']
# In[14]:
df_grp = df.groupby('Business Title')[['Salary Range From', 'Salary Range To']].mean()
df_grp['avg_salary'] = (df_grp['Salary Range From'] + df_grp['Salary Range To']) / 2
df_grp = df_grp.sort_values(by=['avg_salary'], ascending=False)
print(df_grp.head(10))
df_grp['avg_salary'].hist()
# As can be seen, most average annual salaries fall between 50,000 and 90,000 USD, and the most frequent annual salary is about 60,000 USD, which amounts to roughly 5,000 USD per month in New York City.
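# The figures quoted above can be checked directly against df_grp; a small sketch (the monthly figure simply divides an annual salary by 12):
# In[ ]:
salary_summary = df_grp['avg_salary'].describe()
print(salary_summary)
print('Approximate monthly equivalent of the median average salary:',
      round(df_grp['avg_salary'].median() / 12, 2), 'USD')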
# In[15]:
df_count = df.groupby('Business Title').size().reset_index(name='counts')
df_count_top_10 = df_count.sort_values(by=['counts'], ascending=False)[:10].reset_index(drop=True)
keys = [i for i in df_count_top_10['Business Title']]
values = [i for i in df_count_top_10['counts']]
waffle_data = dict(zip(keys, values))
fig = plt.figure(
    FigureClass=Waffle,
    rows=10,
    values=waffle_data,
    legend={'loc': 'upper left', 'bbox_to_anchor': (1.1, 1)}
)
plt.title('Top 10 Wanted Positions')
print(df_count_top_10)
# The waffle chart above indicates that the most wanted position is Project Manager, followed by Assistant Civil Engineer, Accountable Manager, and so on. Additionally, as can be observed, the positions range widely across professional areas.
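# To put these counts in perspective, each top-10 title's share of all annual-salary postings can be computed as follows (a small sketch using the data frames built above):
# In[ ]:
total_postings = df_count['counts'].sum()
df_count_top_10['share_%'] = (df_count_top_10['counts'] / total_postings * 100).round(2)
print(df_count_top_10[['Business Title', 'counts', 'share_%']])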
# In[16]:
df_job = df.groupby('Job Category').size().reset_index(name='counts')
df_job_top_10 = df_job.sort_values(by=['counts'], ascending=False)[:10].set_index('Job Category')
ax = df_job_top_10.plot(kind='barh')
ax.invert_yaxis()
# According to the collected data, the most wanted job category is Engineering, closely followed by Technology and data-related roles. Therefore, one should seriously consider a career path in Engineering and Technology.
# In[17]:
df_job_salary = df.groupby('Job Category')[['Salary Range From', 'Salary Range To']].mean()
df_job_salary['avg'] = (df_job_salary['Salary Range From'] + df_job_salary['Salary Range To']) / 2
df_job_salary = df_job_salary.sort_values(by=['avg'], ascending=False)[:5]
df_job_salary['new Job Category'] = ['Administration & Human Resources Finance',
                                     'Administration & Human Resources Constituent',
                                     'Communications & Intergovernmental Affairs',
                                     'Engineering, Architecture, & Planning Maintenance',
                                     'Engineering, Architecture, & Planning Finance']
df_job_salary = df_job_salary.set_index('new Job Category')
print(df_job_salary.head())
ax = df_job_salary['avg'].plot.bar()
# Furthermore, the best-paid job areas include Administration, HR, Accounting, Legal, Engineering, Architecture, etc.
# In[18]:
df_date = df[['Job Category', 'Posting Date']].dropna().reset_index(drop=True)
df_date['Posting Date'] = df_date['Posting Date'].apply(lambda s: parser.parse(s))
df_date['month'] = df_date['Posting Date'].apply(lambda t: calendar.month_name[t.month])
df_date['year'] = df_date['Posting Date'].apply(lambda t: t.year)
df_date_grp = df_date.groupby(['month', 'year'])['Job Category'].size().reset_index(name='counts')
df_date_grp = df_date_grp.pivot(index='month', columns='year', values='counts')
# reorder the rows chronologically (a plain sort would put the month names in alphabetical order)
month_order = list(calendar.month_name)[1:]  # January ... December
df_date_grp = df_date_grp.reindex(month_order)
sns.heatmap(df_date_grp, annot=True, fmt='g', linewidths=.5)
# Additionally, job seekers should bear in mind that positions are posted most heavily in May, June, and July, so this is a good time to apply for the desired positions; July 2019 in particular holds a significantly high number of postings.
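# As a quick cross-check of the May-July claim, the totals per calendar month (summed across years) can be listed in chronological order:
# In[ ]:
monthly_totals = df_date.groupby('month')['Job Category'].size()
monthly_totals = monthly_totals.reindex(list(calendar.month_name)[1:])  # chronological order
print(monthly_totals)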
# In[63]:
NY_area = list(set(df['Work Location'].tolist())) # Gather Work Locations
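# geopy's Nominatim (imported earlier) could be used to turn one of these work locations into coordinates; a hedged sketch, where the user_agent string is arbitrary and Nominatim may return None or rate-limit requests:
# In[ ]:
geolocator = Nominatim(user_agent='nyc_job_postings_capstone')  # arbitrary user_agent string
sample_location = geolocator.geocode('{}, New York, NY'.format(NY_area[0]))
if sample_location is not None:
    print(NY_area[0], '->', sample_location.latitude, sample_location.longitude)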
# In[64]:
latitude, longitude = 40.7128, -74.0060
search_query = 'Coffee'
radius = 500
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
results = requests.get(url).json()
# assign relevant part of JSON to venues
venues = results['response']['venues']
# transform venues into a dataframe
dataframe = json_normalize(venues)
# dataframe.head()
# In[65]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)
# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]
# dataframe_filtered
# And just in case you are in New York City looking for a job, I would recommend warming up your day with a good cup of coffee before your interview. Enjoy your day.
# In[66]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13)  # generate map centred around downtown Manhattan
# add a red circle marker to represent the map centre (downtown Manhattan)
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Downtown Manhattan',
    fill=True,
    fill_color='red',
    fill_opacity=0.6
).add_to(venues_map)
# add the nearby coffee venues as blue circle markers
for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.categories):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill=True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)
# display map
venues_map
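# The Folium map can also be written to a standalone HTML file for sharing (hypothetical file name):
# In[ ]:
venues_map.save('coffee_near_downtown_manhattan.html')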
#
#
# ## Summary
#
# Exploratory Data Analysis reveals various interesting points about the job market. It is clear that Project Managers are highly wanted. Furthermore, the areas of Engineering and Technology have consistently dominated the job market in New York City.
#
# In a nutshell, an Engineering or Technology career might give you an upper hand in job searching, and don't forget to apply during May, June, and July for a better chance of landing a job, given the high number of postings in those months. Good luck.