import itertools
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import re
import sys
import locale
import numpy as np
# import geopy
import plotly
import math
import urllib2
import pickle
import IPython.core.display as di
from time import sleep
from nltk.corpus import stopwords
from collections import Counter
from scipy import stats, integrate
# from geopy.geocoders import Nominatim
from bs4 import BeautifulSoup
from urllib2 import urlopen
from plotly.graph_objs import *  # Scatter, Layout, Bar, Figure, etc.
import igraph as ig
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
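# Cached scrape results: loading these pickles lets the notebook be re-run without
# hitting Indeed / Stack Overflow again; the commented dump() lines show how each
# cache was written in the first place.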
city_data=pickle.load( open( "city_data.p", "rb" ) )
## pickle.dump( city_data, open( "city_data.p", "wb" ) )
df=pickle.load( open( "df.p", "rb" ) )
## pickle.dump( df, open( "df.p", "wb" ) )
python_query = pickle.load( open( "python_query.p", "rb" ) )
## pickle.dump( python_query, open( "python_query.p", "wb" ) )
r_query = pickle.load( open( "r_query.p", "rb" ) )
## pickle.dump( r_query, open( "r_query.p", "wb" ) )
data_frames = pickle.load( open( "data_frames.p", "rb" ) )
## pickle.dump( data_frames, open( "data_frames.p", "wb" ) )
plotly.offline.init_notebook_mode()
%matplotlib inline
def get_states_from_lat_lon(df):
    # Reverse-geocode each row's (lat, lon) pair to a US state name.
    # Needs geopy: uncomment the Nominatim import at the top of the notebook.
    geolocator = Nominatim()
    r_geo = lambda x: geolocator.reverse(str(float(x['lat']))+","+str(float(x['lon'])))
    geo_loc = df.apply(r_geo, axis=1)
    states = []
    for i in range(len(geo_loc)):
        try:
            # The state is usually the third-from-last field of the returned address.
            states.append((str(geo_loc[i]).split(',')[-3]).strip())
        except:
            states.append('')  # if the address could not be parsed, leave the state blank
    return states
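# Hedged usage sketch (assumes the cached df has 'lat' and 'lon' columns, as map_fig()
# below expects): this makes one Nominatim request per row and needs the geopy imports
# above uncommented, so it is left commented out here.
# df['state'] = get_states_from_lat_lon(df)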
def get_avg_salary(job="\"Data+Scientist\"", loc=["NY","NY"], radius=0, verbose=False):
locale.setlocale(locale.LC_ALL, 'us')
job = job.lower()
loc = [i.lower() for i in loc]
loc = '%2C+'.join(loc)
url = 'http://www.indeed.com/jobs?q=%s&l=%s&radius=%d' % (job, loc, radius)
soup = BeautifulSoup(urlopen(url).read(),"lxml")
    # Indeed's salary facet links have titles like "$100,000+ (120)": a salary
    # floor followed by the number of postings at or above that floor.
    links = soup.find_all('a', title=re.compile(r'\+ \('))
    salaries, counts = [], []
    salary_regex = r'(\d+,000)\+ \((\d+)\)'
for a in links:
title = a.get('title').encode('utf-8')
results = re.search(salary_regex, title).groups()
salaries.append( int(results[0].replace(',','')) )
counts.append( int(results[1]) )
    # A "Did you mean ..." suggestion or an auto-expanded search radius means the exact
    # query returned nothing useful, so treat those pages as zero results.
    d_y_m = soup.find_all('span', style="", string=re.compile('Did you mean'))
    o_r_r = soup.find_all('div', id=re.compile('original_radius_result'))
    if (len(counts)==0) or (len(o_r_r) > 0) or (len(d_y_m) > 0):
        num_jobs = 0
        avg_salary = 0
        jobs = []  # nothing to report in the verbose summary below
else:
left = []
right = []
jobs = []
        for i in range(len(counts)):
            if i == len(counts) - 1:
                # Highest bracket: no upper bound, so use its own floor for both ends.
                jobs.append(counts[i])
                for ii in range(counts[i]):
                    left.append(salaries[i])
                    right.append(salaries[i])
            else:
                # Counts are cumulative ("$X+"), so the jobs in exactly this bracket are
                # counts[i] - counts[i+1], bounded below by this floor and above by the next one.
                jobs.append(counts[i]-counts[i+1])
                for ii in range(counts[i]-counts[i+1]):
                    left.append(salaries[i])
                    right.append(salaries[i+1])
avg_salary = np.mean([np.mean(left),np.mean(right)])
        # The searchCount element reads e.g. "Jobs 1 to 10 of 1,234"; when the total has a
        # thousands separator it shows up as two numbers, so stitch them back together.
        num_jobs_area = soup.find(id='searchCount').string.encode('utf-8')
        job_numbers = re.findall(r'\d+', num_jobs_area)
        if len(job_numbers) > 3:
            num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
        else:
            num_jobs = int(job_numbers[2])
    if verbose:
        print 'Location: %s' % ", ".join(loc.split("%2C+"))
        print 'Job : %s' % " ".join(job.split("+"))
        print 'Salaries:'
        # Pair each bracket's job count with that bracket's lower and upper salary bounds.
        for j, lo, hi in zip(jobs, salaries, salaries[1:] + salaries[-1:]):
            print "%s jobs: $%d-$%d" % (j, lo, hi)
        print ''
        print "%d jobs in %s with a mean salary of: $%d" % (num_jobs, ", ".join(loc.split("%2C+")), avg_salary)
return avg_salary, num_jobs
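# Hedged usage sketch for get_avg_salary: it scrapes indeed.com live, so both the numbers
# and whether the salary-facet markup still matches the regex above can change between runs;
# left commented out so the cached pickles above stay the source of the figures.
# ny_salary, ny_jobs = get_avg_salary(job="\"Data+Scientist\"", loc=["New+York", "NY"],
#                                     radius=25, verbose=True)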
def text_cleaner(website):
    # Fetch one job posting and reduce it to a de-duplicated list of lower-case
    # words with English stop words removed.
    try:
        site = urllib2.urlopen(website).read()  # Connect to the job posting
    except:
        return  # broken or expired link: skip this posting
soup_obj = BeautifulSoup(site,"lxml")
if len(soup_obj) == 0:
soup_obj = BeautifulSoup(site, 'html5lib')
for script in soup_obj(["script", "style"]):
script.extract()
text = soup_obj.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = ''.join(chunk for chunk in chunks if chunk).encode('utf-8')
try:
text = text.decode('unicode_escape').encode('ascii', 'ignore')
except:
return
text = re.sub("[^a-zA-Z+3]"," ", text)
text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
text = text.lower().split()
stop_words = set(stopwords.words("english"))
text = [w for w in text if not w in stop_words]
text = list(set(text))
return text
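# Hedged example for text_cleaner; the URL below is only a placeholder, real posting
# links are discovered (and passed in) by skills_info() further down.
# words = text_cleaner('http://www.indeed.com/viewjob?jk=0123456789abcdef')
# print words[:20] if words else 'posting could not be cleaned'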
def skills_info(city = None, state = None, verbose=False):
final_job = 'data+scientist'
if city is not None:
final_city = city.split()
final_city = '+'.join(word for word in final_city)
final_state = state.split()
final_state = '+'.join(word for word in final_state)
final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
'%2C+', final_state]
else:
final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']
final_site = ''.join(final_site_list)
base_url = 'http://www.indeed.com'
    try:
        html = urlopen(final_site).read()
    except:
        print 'That city/state combination did not have any jobs. Exiting . . .'
        return
    soup = BeautifulSoup(html, "lxml")
num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8')
job_numbers = re.findall('\d+', num_jobs_area)
if len(job_numbers) > 3:
total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
else:
total_num_jobs = int(job_numbers[2])
city_title = city
if city is None:
city_title = 'Nationwide'
if verbose:
print 'There were', total_num_jobs, 'jobs found,', city_title
num_pages = total_num_jobs/10
job_descriptions = []
for i in xrange(1,num_pages+1):
if verbose:
print 'Getting page', i
start_num = str(i*10)
current_page = ''.join([final_site, '&start=', start_num])
        html_page = urllib2.urlopen(current_page).read()
        page_obj = BeautifulSoup(html_page, "lxml")
        job_link_area = page_obj.find(id='resultsCol')  # the column that holds the search results
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')]
        job_URLS = filter(lambda x: 'clk' in x, job_URLS)  # keep only links that point at job postings
for j in xrange(0,len(job_URLS)):
final_description = text_cleaner(job_URLS[j])
if final_description:
job_descriptions.append(final_description)
sleep(1)
if verbose:
print 'Done with collecting the job postings!'
print 'There were', len(job_descriptions), 'jobs successfully found.'
doc_frequency = Counter()
    for item in job_descriptions:
        doc_frequency.update(item)  # terms were de-duplicated per posting, so these are document frequencies
prog_lang_dict = Counter({'R':doc_frequency['r'], 'Python':doc_frequency['python'],
'Java':doc_frequency['java'], 'C++':doc_frequency['c++'],
'Ruby':doc_frequency['ruby'], 'Julia':doc_frequency['julia'],
'Perl':doc_frequency['perl'], 'Matlab':doc_frequency['matlab'],
'JavaScript':doc_frequency['javascript'], 'Scala': doc_frequency['scala']})
analysis_tool_dict = Counter({'Excel':doc_frequency['excel'], 'Tableau':doc_frequency['tableau'],
'D3.js':doc_frequency['d3.js'], 'SAS':doc_frequency['sas'],
'SPSS':doc_frequency['spss'], 'D3':doc_frequency['d3']})
hadoop_dict = Counter({'Hadoop':doc_frequency['hadoop'], 'MapReduce':doc_frequency['mapreduce'],
'Spark':doc_frequency['spark'], 'Pig':doc_frequency['pig'],
'Hive':doc_frequency['hive'], 'Shark':doc_frequency['shark'],
'Oozie':doc_frequency['oozie'], 'ZooKeeper':doc_frequency['zookeeper'],
'Flume':doc_frequency['flume'], 'Mahout':doc_frequency['mahout']})
database_dict = Counter({'SQL':doc_frequency['sql'], 'NoSQL':doc_frequency['nosql'],
'HBase':doc_frequency['hbase'], 'Cassandra':doc_frequency['cassandra'],
'MongoDB':doc_frequency['mongodb']})
overall_total_skills = prog_lang_dict + analysis_tool_dict + hadoop_dict + database_dict
final_frame = pd.DataFrame(overall_total_skills.items(), columns = ['Term', 'NumPostings'])
    # Convert counts to the percentage of postings that mention each term
    # (100.0 keeps Python 2 from truncating the division to an integer).
    final_frame.NumPostings = final_frame.NumPostings * 100.0 / len(job_descriptions)
    final_frame.sort_values(by='NumPostings', ascending=False, inplace=True)
data = Data([
Bar(x=list(final_frame.Term.values),
y=list(final_frame.NumPostings.values),
name='Data Science Skills')])
layout = Layout(height=480,
hovermode='closest',
margin=Margin(r=40,
t=50,
b=140,
l=40,
pad=2),
                title=(city + ', ' + state if city is not None else city_title) + ' Data Science Skills',
width=650,
xaxis=XAxis(nticks=200,
tickangle=-90,
tickfont=dict(size=7)))
fig = Figure(data=data, layout=layout)
return fig, final_frame # End of the function
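# Hedged sketch of how skills_info is called; a full run walks every result page on
# indeed.com (slow, and dependent on their markup), which is presumably how the cached
# city_data / data_frames pickles were built, so it stays commented out here.
# sea_fig, sea_frame = skills_info(city='Seattle', state='WA', verbose=True)
# plotly.offline.iplot(sea_fig)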
def create_graph(dict_of_lists):
    # Build an undirected networkx graph from a {question: [linked questions]} dictionary.
    G = nx.Graph()
    for i in dict_of_lists.keys():
        for ii in dict_of_lists[i]:
            G.add_edge(i, ii)
    return G
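# Quick, fully offline sanity check of create_graph on a toy adjacency dict; the URLs are
# made up, but the structure mirrors what stack_graph() returns.
toy_links = {'/questions/1/why-slow': ['/questions/2/vectorize', '/questions/3/apply'],
             '/questions/2/vectorize': ['/questions/3/apply']}
toy_G = create_graph(toy_links)
print toy_G.number_of_nodes(), toy_G.number_of_edges()  # expect 3 nodes, 3 edges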
def stack_graph(search_term = "r", pages=None):
# try:
list_of_links = {}
base_add= "http://stackoverflow.com"
headers = {'User-Agent' : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30"}
#-- get number of pages
def num_pages(link_url = "http://stackoverflow.com/questions/tagged/r?page=1&sort=frequent&pageSize=50"):
url_0 = link_url
req_0 = urllib2.Request(url_0, headers=headers)
con_0 = urllib2.urlopen(req_0)
soup_0 = BeautifulSoup(con_0.read(), "lxml")
counts_flag = soup_0.find_all('div', class_=re.compile('summarycount'))
questions_tagged = re.findall('\d+', str(counts_flag))
pages = int(''.join(questions_tagged))/50
pages = math.ceil(pages)
pages = int(pages)
return pages
#--
link_url_0 = "http://stackoverflow.com/questions/tagged/%s?page=%s&sort=frequent&pageSize=50" % (search_term,"1")
    pages_0s = 1
    pages_0 = num_pages(link_url=link_url_0)
    # `pages`, when supplied, is a pair: pages[0] is the first tag page to fetch,
    # pages[1] the first linked-questions page (see the matching check below).
    if pages is not None:
        pages_0s = pages[0]
for i in range(pages_0s,pages_0+1):
url = "http://stackoverflow.com/questions/tagged/%s?page=%s&sort=frequent&pageSize=50" % (search_term,str(i))
req = urllib2.Request(url, headers=headers)
con = urllib2.urlopen(req)
soup = BeautifulSoup(con.read(), "lxml")
links = soup.find_all('a', class_=re.compile('question-hyperlink'))
for ii in links:
question = base_add + ii['href']
question_num = question.split('/')[4]
link_url_1 = base_add + "/questions/linked/%s?pagesize=50&sort=hot" % (question_num)
pages_1s = 1
pages_1 = num_pages(link_url=link_url_1)
linked_urls = []
            if pages is not None: pages_1s = pages[1]
for iii in range(pages_1s,pages_1+1):
question_url = base_add + "/questions/linked/%s?page=%s&sort=hot&pagesize=50" % (question_num,str(iii))
request = urllib2.Request(question_url, headers=headers)
connection = urllib2.urlopen(request)
new_soup = BeautifulSoup(connection.read(), "lxml")
linked_qs = new_soup.find_all('a', class_=re.compile('question-hyperlink'))
for iiii in linked_qs:
linked_urls.append(iiii['href'])
list_of_links[ii['href']]=linked_urls
            print len(list_of_links)  # progress: number of tag questions processed so far
return list_of_links
# except:
# print "CONNECTION TIME OUT"
# print "query page: %s , link page: $s" % (page_0, page_1)
# return (list_of_links,page_0,page_1)
# # return create_graph(list_of_links)
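# Hedged usage sketch: a full crawl issues one request per tag question plus one per page
# of linked questions, which is presumably how the cached python_query / r_query
# dictionaries were generated, so the calls are left commented out.
# r_query = stack_graph(search_term="r")
# python_query = stack_graph(search_term="python")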
def map_fig(df, of='Salary'):
    # Bubble map of US cities; `of` picks whether the hover text reports 'Salary' or 'Jobs'.
    if of == 'Salary':
        df['text'] = df['city'] + '<br>' + of + ': ' + (df[of.lower()]/1e6).astype(str) + ' million'
        df.sort_values(by=of.lower(), ascending=False, inplace=True)
    elif of == 'Jobs':
        df['text'] = df['city'] + '<br>' + (df[of.lower()]).astype(str) + ' ' + of
        df.sort_values(by=of.lower(), ascending=False, inplace=True)
    # Row-index slices into the sorted frame: one trace (and one colour) per tier of cities.
    limits = [(50,3000),(20,50),(10,20),(3,10),(0,3)]
    colors = ['rgb(41, 128, 171)','rgb(104, 157, 46)','rgb(169, 140, 31)','rgb(178, 81, 28)','rgb(165, 28, 18)']
    cities = []
    scale = 25000
    start = 2  # grows by 50 each tier, shrinking the divisor scale/start so each later tier gets a larger size multiplier
for i in range(len(limits)):
lim = limits[i]
df_sub = df[lim[0]:lim[1]]
city = dict(type = 'scattergeo',
locationmode = 'USA-states',
lon = df_sub['lon'],
lat = df_sub['lat'],
text = df_sub['text'],
marker = dict(size = (df_sub['salary'])/(scale/start),
color = colors[i],
line = dict(width=0.5, color='rgb(255,255,255)'),
sizemode = 'area'),
name = '{0} - {1}'.format(lim[0],lim[1]) )
cities.append(city)
start+=50
layout = dict(title = '2016 Data Science '+of+' by City',
showlegend = True,
geo = dict(scope='usa',
projection=dict( type='albers usa' ),
showland = True,
landcolor = 'rgb(67, 67, 67)',
subunitwidth=1,
countrywidth=1,
subunitcolor="rgb(255, 255, 255)",
countrycolor="rgb(255, 255, 255)"),)
fig = dict( data=cities, layout=layout )
return fig
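# Hedged example (assumes the cached df carries 'city', 'lat', 'lon', 'salary' and 'jobs'
# columns, which is what map_fig indexes into):
# salary_map = map_fig(df, of='Salary')
# plotly.offline.iplot(salary_map)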
def bar_fig(city_data):
    # city_data is a list of (city_name, skills_frame, colour) triples; skills_frame is a
    # DataFrame with Term/NumPostings columns (e.g. the one returned by skills_info()).
    bars = {}
    for city, data, color in city_data:
        bars[city] = Bar(x=list(data.Term.values), y=list(data.NumPostings.values),
                         marker=Marker(color=color), name=city)
data = Data([bars[i] for i in bars.keys()])
layout = Layout(autosize=True, bargap=0.15, bargroupgap=0, barmode='group',
boxgap=0.3, boxgroupgap=0.3, boxmode='overlay', dragmode='zoom',
font=Font(color='#444', family="'Open sans', verdana, arial, sans-serif", size=12),
height=579, hidesources=False, hovermode='x',
legend=Legend(x=0.8986784140969163, y=0.8521303258145363,
bgcolor='#fff',bordercolor='#444',borderwidth=0,
font=Font(color='', family='', size=0),
traceorder='normal',xanchor='left',yanchor='top'),
margin=Margin(r=80,t=100,autoexpand=True,b=80,l=80,pad=0),
paper_bgcolor='#fff', plot_bgcolor='#fff', separators='.,', showlegend=True,
smith=False, title='Tools by job description',
titlefont=dict(color='',family='',size=0),width=1000,
xaxis=XAxis(anchor='y' ,autorange=True, autotick=True, domain=[0, 1],
dtick=1, exponentformat='B', gridcolor='#eee', gridwidth=1,
linecolor='#444', linewidth=1, mirror=False, nticks=0, overlaying=False,
position=0, range=[-0.5, 27.5], rangemode='normal', showexponent='all',
showgrid=False, showline=False, showticklabels=True, tick0=0, tickangle='auto',
tickcolor='#444', tickfont=dict(color='',family='',size=0),ticklen=5,
ticks='',tickwidth=1,title='<br>Language/tool',
titlefont=dict(color='',family='',size=0),type='category',zeroline=False,
zerolinecolor='#444',zerolinewidth=1),
yaxis=YAxis(anchor='x', autorange=True, autotick=True,
domain=[0, 1], dtick=10, exponentformat='B',gridcolor='#eee',gridwidth=1,
linecolor='#444',linewidth=1,mirror=False,nticks=0,overlaying=False,
position=0,range=[0, 54.90526315789474],rangemode='normal',showexponent='all',
showgrid=True,showline=False,showticklabels=True,tick0=0,tickangle='auto',
tickcolor='#444', tickfont=dict(color='',family='',size=0),
ticklen=5, ticks='', tickwidth=1, title='Percentage of posts',
titlefont=dict(color='', family='', size=0), type='linear',
zeroline=True,zerolinecolor='#444',zerolinewidth=1) )
fig = Figure(data=data, layout=layout)
return fig
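# Hedged example: grouped bars comparing skill mentions across cities, using the cached
# city_data list loaded above (assumed to hold (city, skills_frame, colour) triples).
# skills_bars = bar_fig(city_data)
# plotly.offline.iplot(skills_bars)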
def network_plot(list_of_edges=[("God","Bad"),("Bad","Michevious"),("Michevious","Good")],color=None, title="Earth"):
sets_of_edges = [set(i) for i in list_of_edges]
edge_numbering = {}
number = 0
for i in list(set.union(*sets_of_edges)):
edge_numbering[i]=number
number += 1
edges=[]
for i in list_of_edges:
edges.append((edge_numbering[i[0]],edge_numbering[i[1]]))
N = len(edge_numbering.keys())
text = list(set.union(*sets_of_edges))
iG=ig.Graph(edges, directed=False)
layt=iG.layout('kk', dim=3)
Xn=[layt[k][0] for k in range(N)]# x-coordinates of nodes
Yn=[layt[k][1] for k in range(N)]# y-coordinates
Zn=[layt[k][2] for k in range(N)]# z-coordinates
Xe=[]
Ye=[]
Ze=[]
for e in edges:
Xe+=[layt[e[0]][0],layt[e[1]][0], None]# x-coordinates of edge ends
Ye+=[layt[e[0]][1],layt[e[1]][1], None]
Ze+=[layt[e[0]][2],layt[e[1]][2], None]
t_1=Scatter3d(x=Xe, y=Ye, z=Ze, mode='lines',
line=Line(color='rgb(125,125,125)', width=1),hoverinfo='none')
t_2=Scatter3d(x=Xn, y=Yn, z=Zn, mode='markers', name='actors',
marker=Marker(symbol='dot', size=6, color=color,
line=Line(color='rgb(50,50,50)', width=0.5)),text=text,hoverinfo='text')
axis=dict(showbackground=False, showline=False, zeroline=False,
showgrid=False, showticklabels=False, title='')
layout = Layout(title=title, width=1000, height=1000, showlegend=False,
scene=Scene(xaxis=XAxis(axis), yaxis=YAxis(axis), zaxis=ZAxis(axis),),
margin=Margin(t=100), hovermode='closest', annotations=Annotations([
Annotation(showarrow=False,
text="Data source: <a href='http://stackoverflow.com'>[1]</a>",
xref='paper', yref='paper', x=0, y=0.1, xanchor='left',
yanchor='bottom', font=Font(size=14))]),)
data=Data([t_1, t_2])
fig=Figure(data=data, layout=layout)
return fig
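# Hedged, offline-runnable example with a made-up tag network; for the real figures the
# edge list presumably comes from the cached crawls, e.g. create_graph(python_query).edges().
toy_net_fig = network_plot(list_of_edges=[("r", "ggplot2"), ("r", "dplyr"), ("ggplot2", "dplyr")],
                           color='rgb(41, 128, 171)', title="Toy tag network")
# plotly.offline.iplot(toy_net_fig)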