#!/usr/bin/env python
# coding: utf-8
# # IBM Data Science Capstone Project
# ## New York City Current Job Postings Exploration
# ### by Nuttaphat Arunoprayoch
#
# ### Objectives
# Looking for a job is challenging and time consuming, and the search does not always end as well as expected. Therefore, in order to plan a career effectively, one must know which career path to take. This report analyses open job positions in New York City, USA, in the hope that it may be useful both individually and socially.
#
# ### Data
# The data was derived from https://www.kaggle.com/new-york-city/new-york-city-current-job-postings; its metadata can be found at the same link.
#
# ## Import Libraries and Dataset
# In[68]:
# Standard Libraries
import requests
import calendar
from dateutil import parser
import json  # requests is already imported above
from IPython.display import display, HTML, clear_output, IFrame
# Data Manipulation
from pandas.io.json import json_normalize  # available as pandas.json_normalize in newer pandas releases
import pandas as pd
import numpy as np
import pandas_profiling
# Data Visualization
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pywaffle import Waffle
mpl.style.use(['ggplot']) # optional: for ggplot-like style
sns.set(rc={'figure.figsize':(11.7,8.27)})
# In[69]:
# Hide code cells for clean presentation and import display table
HTML('''
''')
# In[2]:
# Geolocation Libraries
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import folium
# In[70]:
# Set Foursquare credentials (placeholders; replace with your own keys)
CLIENT_ID = '#' # your Foursquare ID
CLIENT_SECRET = '#' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
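# The credentials above are placeholders. One hedged alternative, assuming the keys are exported as environment variables named FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET (hypothetical names), is to read them at runtime so they never appear in the notebook:
# In[ ]:
import os
# Hypothetical variable names; fall back to the placeholders above if they are not set
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', CLIENT_ID)
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET)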
# In[4]:
df = pd.read_csv('dataset/new-york-city-current-job-postings/nyc-jobs.csv')
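# For reference, a sketch of how the CSV above could be fetched programmatically with the Kaggle API, assuming the kaggle package is installed and an API token is configured under ~/.kaggle/ (otherwise download it manually from the link in the Data section):
# In[ ]:
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()  # requires a valid ~/.kaggle/kaggle.json token
api.dataset_download_files('new-york-city/new-york-city-current-job-postings',
                           path='dataset/new-york-city-current-job-postings', unzip=True)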
# In[5]:
df.head()
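# pandas_profiling is imported above but otherwise unused; a quick automated overview of the raw table could be generated as sketched below (this assumes a pandas-profiling version that exposes ProfileReport, and the report can take a while on wide data):
# In[ ]:
profile = pandas_profiling.ProfileReport(df)  # one-line exploratory report (sketch)
profile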
#
#
# ## Data pre-processing and Data Analysis
# In[13]:
df.hist()  # quick histograms of the numeric columns
# In[7]:
headers = df.columns.tolist()
# headers
# In[8]:
# Select only Annual Paying
df = df[df['Salary Frequency'] == 'Annual']
# In[14]:
df_grp = df.groupby('Business Title')[['Salary Range From', 'Salary Range To']].mean()
df_grp['avg_salary'] = (df_grp['Salary Range From'] + df_grp['Salary Range To']) / 2
df_grp = df_grp.sort_values(by=['avg_salary'], ascending=False)
print(df_grp.head(10))
df_grp['avg_salary'].hist()
# As can be seen, most average annual salaries fall between 50,000 and 90,000 USD, and the most frequent annual salary is about 60,000 USD, which amounts to roughly 5,000 USD per month in New York City.
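# The figures quoted above can be checked directly against df_grp; a small sketch (the monthly figure simply divides an annual salary by 12):
# In[ ]:
salary_summary = df_grp['avg_salary'].describe()
print(salary_summary)
print('Approximate monthly equivalent of the median average salary:',
      round(df_grp['avg_salary'].median() / 12, 2), 'USD')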
# In[15]:
df_count = df.groupby('Business Title').size().reset_index(name='counts')
df_count_top_10 = df_count.sort_values(by=['counts'], ascending=False)[:10].reset_index(drop=True)
keys = [i for i in df_count_top_10['Business Title']]
values = [i for i in df_count_top_10['counts']]
waffle_data = dict(zip(keys, values))
fig = plt.figure(
    FigureClass=Waffle,
    rows=10,
    values=waffle_data,
    legend={'loc': 'upper left', 'bbox_to_anchor': (1.1, 1)}
)
plt.title('Top 10 Wanted Positions')
print(df_count_top_10)
# The waffle chart above indicates that the most wanted position is Project Manager, followed by Assistant Civil Engineer, Accountable Manager, and so on. Additionally, as can be observed, the positions range widely across professional areas.
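# To put these counts in perspective, each top-10 title's share of all annual-salary postings can be computed as follows (a small sketch using the data frames built above):
# In[ ]:
total_postings = df_count['counts'].sum()
df_count_top_10['share_%'] = (df_count_top_10['counts'] / total_postings * 100).round(2)
print(df_count_top_10[['Business Title', 'counts', 'share_%']])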
# In[16]:
df_job = df.groupby('Job Category').size().reset_index(name='counts')
df_job_top_10 = df_job.sort_values(by=['counts'], ascending=False)[:10].set_index('Job Category')
ax = df_job_top_10.plot(kind='barh')
ax.invert_yaxis()
# According to the collected data, the most wanted job category is Engineering, closely followed by Technology and data-related roles. Therefore, one should seriously consider a career path in Engineering and Technology.
# In[17]:
df_job_salary = df.groupby('Job Category')[['Salary Range From', 'Salary Range To']].mean()
df_job_salary['avg'] = (df_job_salary['Salary Range From'] + df_job_salary['Salary Range To']) / 2
df_job_salary = df_job_salary.sort_values(by=['avg'], ascending=False)[:5]
df_job_salary['new Job Category'] = ['Administration & Human Resources Finance',
                                     'Administration & Human Resources Constituent',
                                     'Communications & Intergovernmental Affairs',
                                     'Engineering, Architecture, & Planning Maintenance',
                                     'Engineering, Architecture, & Planning Finance']
df_job_salary = df_job_salary.set_index('new Job Category')
print(df_job_salary.head())
ax = df_job_salary['avg'].plot.bar()
# Furthermore, the best-paid job areas include Administration, HR, Accounting, Legal, Engineering, Architecture, etc.
# In[18]:
df_date = df[['Job Category', 'Posting Date']].dropna().reset_index(drop=True)
df_date['Posting Date'] = df_date['Posting Date'].apply(lambda s: parser.parse(s))
df_date['month'] = df_date['Posting Date'].apply(lambda t: calendar.month_name[t.month])
df_date['year'] = df_date['Posting Date'].apply(lambda t: t.year)
df_date_grp = df_date.groupby(['month', 'year'])['Job Category'].size().reset_index(name='counts')
df_date_grp = df_date_grp.pivot(index='month', columns='year', values='counts')
# reorder the rows chronologically (a plain sort would put the month names in alphabetical order)
month_order = list(calendar.month_name)[1:]  # January ... December
df_date_grp = df_date_grp.reindex(month_order)
sns.heatmap(df_date_grp, annot=True, fmt='g', linewidths=.5)
# Additionally, job seekers should bear in mind that positions are posted most heavily in May, June, and July, so this is a good time to apply for the desired positions; July 2019 in particular holds a significantly high number of postings.
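# As a quick cross-check of the May-July claim, the totals per calendar month (summed across years) can be listed in chronological order:
# In[ ]:
monthly_totals = df_date.groupby('month')['Job Category'].size()
monthly_totals = monthly_totals.reindex(list(calendar.month_name)[1:])  # chronological order
print(monthly_totals)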
# In[63]:
NY_area = list(set(df['Work Location'].tolist())) # Gather Work Locations
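# geopy's Nominatim (imported earlier) could be used to turn one of these work locations into coordinates; a hedged sketch, where the user_agent string is arbitrary and Nominatim may return None or rate-limit requests:
# In[ ]:
geolocator = Nominatim(user_agent='nyc_job_postings_capstone')  # arbitrary user_agent string
sample_location = geolocator.geocode('{}, New York, NY'.format(NY_area[0]))
if sample_location is not None:
    print(NY_area[0], '->', sample_location.latitude, sample_location.longitude)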
# In[64]:
latitude, longitude = 40.7128, -74.0060
search_query = 'Coffee'
radius = 500
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
results = requests.get(url).json()
# assign relevant part of JSON to venues
venues = results['response']['venues']
# transform venues into a dataframe
dataframe = json_normalize(venues)
# dataframe.head()
# In[65]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)
# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]
# dataframe_filtered
# And just in case you are in New York City looking for a job, I would recommend warming up your day with a good cup of coffee before your interview. Enjoy your day.
# In[66]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13)  # generate map centred around downtown Manhattan
# add a red circle marker to represent the map centre (downtown Manhattan)
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Downtown Manhattan',
    fill=True,
    fill_color='red',
    fill_opacity=0.6
).add_to(venues_map)
# add the nearby coffee venues as blue circle markers
for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.categories):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill=True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)
# display map
venues_map
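# The Folium map can also be written to a standalone HTML file for sharing (hypothetical file name):
# In[ ]:
venues_map.save('coffee_near_downtown_manhattan.html')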
#
#
# ## Summary
#
# Exploratory Data Analysis reveals various interesting points about the job market. It is clear that Project Managers are highly wanted. Furthermore, the areas of Engineering and Technology have consistently dominated the job market in New York City.
#
# In a nutshell, an Engineering or Technology career might give you an upper hand in job searching, and don't forget to apply during May, June, and July for a better chance of landing a job, given the high number of postings in those months. Good luck.