Notebook

In [1]:

import itertools
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import re
import sys
import locale
import numpy as np
# import geopy
import plotly
import math
import urllib2
import pickle
import IPython.core.display as di
from time import sleep
from nltk.corpus import stopwords
from collections import Counter
from scipy import stats, integrate
# from geopy.geocoders import Nominatim
from bs4 import BeautifulSoup
from urllib2 import urlopen
from plotly.graph_objs import *#Scatter, Layout
import igraph as ig

# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

city_data=pickle.load( open( "city_data.p", "rb" ) )
## pickle.dump( city_data, open( "city_data.p", "wb" ) )

df=pickle.load( open( "df.p", "rb" ) )
## pickle.dump( df, open( "df.p", "wb" ) )

python_query = pickle.load( open( "python_query.p", "rb" ) )
## pickle.dump( python_query, open( "python_query.p", "wb" ) )

r_query = pickle.load( open( "r_query.p", "rb" ) )
## pickle.dump( r_query, open( "r_query.p", "wb" ) )

data_frames = pickle.load( open( "data_frames.p", "rb" ) )
## pickle.dump( data_frames, open( "data_frames.p", "wb" ) )


plotly.offline.init_notebook_mode()
%matplotlib inline


def get_states_from_lat_lon(df):
    geolocator = Nominatim()
    r_geo = lambda x:  geolocator.reverse(str(float(x['lat']))+","+str(float(x['lon'])))
    geo_loc = df.apply(r_geo, axis=1)
    states = []
    for i in range(len(geo_loc)):
        try:
            states.append((str(geo_loc[i]).split(',')[-3]).strip())
        except:
            states.append('') # if something goes wrong
    return states


def get_avg_salary(job="\"Data+Scientist\"", loc=["NY","NY"], radius=0, verbose=False):
    locale.setlocale(locale.LC_ALL, 'us')
    job = job.lower()
    loc = [i.lower() for i in loc]
    loc = '%2C+'.join(loc)
    url = 'http://www.indeed.com/jobs?q=%s&l=%s&radius=%d' % (job, loc, radius)
    soup = BeautifulSoup(urlopen(url).read(),"lxml")

    links = soup.find_all('a', title=re.compile('\+ \('))
    salaries, counts = [], []
    salary_regex = '(\d+,000)\+ \((\d+)\)'
    for a in links:
        title = a.get('title').encode('utf-8')
        results = re.search(salary_regex, title).groups()
        salaries.append( int(results[0].replace(',','')) )
        counts.append( int(results[1]) )

    d_y_m = soup.find_all('span', style="", string=re.compile('Did you mean'))
    o_r_r = soup.find_all('div', id=re.compile('original_radius_result'))
    
    if (len(counts)==0) or (len(o_r_r) > 0) or (len(d_y_m) > 0):
        num_jobs=0
        avg_salary=0
    else:
        left = []
        right = []
        jobs = []
        for i in range(len(counts)):
            if i==max(range(len(counts))):
                jobs.append(counts[i])
                for ii in range(counts[i]):
                    left.append(salaries[i])
                    right.append(salaries[i])
            else:
                jobs.append(counts[i]-counts[i+1])
                for ii in range(counts[i]-counts[i+1]):
                    left.append(salaries[i])
                    right.append(salaries[i+1])

        avg_salary = np.mean([np.mean(left),np.mean(right)])
        
        num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8')
        job_numbers = re.findall('\d+', num_jobs_area)

        if len(job_numbers) > 3: 
            num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
        else:
            num_jobs = int(job_numbers[2]) 
        
        if verbose==True:
            print 'Location: %s' % ", ".join(loc.split("%2C+"))
            print 'Job : %s' % " ".join(job.split("+"))
            print 'Salaries:'
            for j,l,r in zip(jobs,left,right):
                print "%s jobs: $%d-$%d" % (j,l,r)
            print ''
            print "%d jobs in %s with a mean salary of: $%d" % (num_jobs,",".join(loc.split("%2C+")),avg_salary)

    return avg_salary, num_jobs


def text_cleaner(website):
    try:
        site = urllib2.urlopen(website).read() # Connect to the job posting
    except: 
        return
    
    soup_obj = BeautifulSoup(site,"lxml")
    
    if len(soup_obj) == 0:
        soup_obj = BeautifulSoup(site, 'html5lib')
    
    for script in soup_obj(["script", "style"]):
        script.extract()

    text = soup_obj.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = ''.join(chunk for chunk in chunks if chunk).encode('utf-8')

    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore')
    except:
        return
   
    text = re.sub("[^a-zA-Z+3]"," ", text)
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    text = text.lower().split()
    stop_words = set(stopwords.words("english"))
    text = [w for w in text if not w in stop_words]
    text = list(set(text))
    
    return text


def skills_info(city = None, state = None, verbose=False):
    final_job = 'data+scientist' 
    
    if city is not None:
        final_city = city.split() 
        final_city = '+'.join(word for word in final_city)
        final_state = state.split() 
        final_state = '+'.join(word for word in final_state)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                   '%2C+', final_state] 
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']

    final_site = ''.join(final_site_list)
    base_url = 'http://www.indeed.com'

    try:
        html = urlopen(final_site).read()
    except:
        'That city/state combination did not have any jobs. Exiting . . .' 
        return
    
    soup = BeautifulSoup(html)
    num_jobs_area = soup.find(id = 'searchCount').string.encode('utf-8') 
    job_numbers = re.findall('\d+', num_jobs_area) 

    if len(job_numbers) > 3:
        total_num_jobs = (int(job_numbers[2])*1000) + int(job_numbers[3])
    else:
        total_num_jobs = int(job_numbers[2]) 

    city_title = city
    if city is None:
        city_title = 'Nationwide'
    
    if verbose:
        print 'There were', total_num_jobs, 'jobs found,', city_title

    num_pages = total_num_jobs/10
    job_descriptions = [] 

    for i in xrange(1,num_pages+1): 
        if verbose:
            print 'Getting page', i
        start_num = str(i*10)
        current_page = ''.join([final_site, '&start=', start_num])
        
        html_page = urllib2.urlopen(current_page).read()
        page_obj = BeautifulSoup(html_page) 
        job_link_area = page_obj.find(id = 'resultsCol') 
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')]
        job_URLS = filter(lambda x:'clk' in x, job_URLS)
        
        for j in xrange(0,len(job_URLS)):
            final_description = text_cleaner(job_URLS[j])
            if final_description:
                job_descriptions.append(final_description)
            sleep(1)
            
    if verbose:
        print 'Done with collecting the job postings!'    
        print 'There were', len(job_descriptions), 'jobs successfully found.'

    doc_frequency = Counter()
    [doc_frequency.update(item) for item in job_descriptions] 

    prog_lang_dict = Counter({'R':doc_frequency['r'], 'Python':doc_frequency['python'],
                    'Java':doc_frequency['java'], 'C++':doc_frequency['c++'], 
                   'Ruby':doc_frequency['ruby'], 'Julia':doc_frequency['julia'],
                  'Perl':doc_frequency['perl'], 'Matlab':doc_frequency['matlab'],
                  'JavaScript':doc_frequency['javascript'], 'Scala': doc_frequency['scala']})
                  
    analysis_tool_dict = Counter({'Excel':doc_frequency['excel'],  'Tableau':doc_frequency['tableau'],
                      'D3.js':doc_frequency['d3.js'], 'SAS':doc_frequency['sas'],
                      'SPSS':doc_frequency['spss'], 'D3':doc_frequency['d3']})  

    hadoop_dict = Counter({'Hadoop':doc_frequency['hadoop'], 'MapReduce':doc_frequency['mapreduce'],
               'Spark':doc_frequency['spark'], 'Pig':doc_frequency['pig'],
               'Hive':doc_frequency['hive'], 'Shark':doc_frequency['shark'],
               'Oozie':doc_frequency['oozie'], 'ZooKeeper':doc_frequency['zookeeper'],
               'Flume':doc_frequency['flume'], 'Mahout':doc_frequency['mahout']})
               
    database_dict = Counter({'SQL':doc_frequency['sql'], 'NoSQL':doc_frequency['nosql'],
                 'HBase':doc_frequency['hbase'], 'Cassandra':doc_frequency['cassandra'],
                 'MongoDB':doc_frequency['mongodb']})
                     
    overall_total_skills = prog_lang_dict + analysis_tool_dict + hadoop_dict + database_dict
    final_frame = pd.DataFrame(overall_total_skills.items(), columns = ['Term', 'NumPostings']) 
    final_frame.NumPostings = (final_frame.NumPostings)*100/len(job_descriptions) 
    final_frame.sort_values(by='NumPostings', ascending = False, inplace = True)
    
    data = Data([
        Bar(x=list(final_frame.Term.values),
            y=list(final_frame.NumPostings.values),
            name='Data Science Skills')])
    
    layout = Layout(height=480,
        hovermode='closest',
        margin=Margin(r=40,
            t=50,
            b=140,
            l=40,
            pad=2),
        title=city+','+state+' Data Science Skills',
        width=650,
        xaxis=XAxis(nticks=200,
            tickangle=-90,
            tickfont=dict(size=7)))
    
    fig = Figure(data=data, layout=layout)
    return fig, final_frame # End of the function


def create_graph(dict_of_lists):
    G=nx.Graph()
    for i in dict_of_lists.keys():
        for ii in dict_of_lists[i]:
            G.add_edges_from([(i,ii)])
    return G


def stack_graph(search_term = "r", pages=None):
#     try:
    list_of_links = {}
    base_add= "http://stackoverflow.com"
    headers = {'User-Agent' : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30"}

    #-- get number of pages
    def num_pages(link_url = "http://stackoverflow.com/questions/tagged/r?page=1&sort=frequent&pageSize=50"):
        url_0 = link_url
        req_0 = urllib2.Request(url_0, headers=headers)
        con_0 = urllib2.urlopen(req_0)
        soup_0 = BeautifulSoup(con_0.read(), "lxml")

        counts_flag = soup_0.find_all('div', class_=re.compile('summarycount'))
        questions_tagged = re.findall('\d+', str(counts_flag))  
        pages = int(''.join(questions_tagged))/50
        pages = math.ceil(pages)
        pages = int(pages)
        return pages
    #--

    link_url_0 = "http://stackoverflow.com/questions/tagged/%s?page=%s&sort=frequent&pageSize=50" % (search_term,"1")
    pages_0s = 1
    pages_0 = num_pages(link_url=link_url_0)

    if pages != None: pages_0s = pages[0]

    for i in range(pages_0s,pages_0+1):
        url = "http://stackoverflow.com/questions/tagged/%s?page=%s&sort=frequent&pageSize=50" % (search_term,str(i))
        req = urllib2.Request(url, headers=headers)
        con = urllib2.urlopen(req)
        soup = BeautifulSoup(con.read(), "lxml")
        links = soup.find_all('a', class_=re.compile('question-hyperlink'))

        for ii in links:
            question = base_add + ii['href']
            question_num = question.split('/')[4]

            link_url_1 = base_add + "/questions/linked/%s?pagesize=50&sort=hot" % (question_num)
            pages_1s = 1
            pages_1 = num_pages(link_url=link_url_1)
            linked_urls = []

            if pages != None: pages_1s = pages[1]

            for iii in range(pages_1s,pages_1+1):
                question_url = base_add + "/questions/linked/%s?page=%s&sort=hot&pagesize=50" % (question_num,str(iii))
                request = urllib2.Request(question_url, headers=headers)
                connection = urllib2.urlopen(request)
                new_soup = BeautifulSoup(connection.read(), "lxml")
                linked_qs = new_soup.find_all('a', class_=re.compile('question-hyperlink'))


                for iiii in linked_qs:
                    linked_urls.append(iiii['href'])

            list_of_links[ii['href']]=linked_urls

        print len(list_of_links.keys())

    return list_of_links
#     except:
#         print "CONNECTION TIME OUT"
#         print "query page: %s , link page: $s" % (page_0, page_1)
#         return (list_of_links,page_0,page_1)
#     #     return create_graph(list_of_links)


def map_fig(df,of='Salary'):
    if of=='Salary':
        df['text'] = df['city'] + '<br> ' +of+ (df[of.lower()]/1e6).astype(str)+' million'
        df.sort_values(by=of.lower(), ascending = False, inplace = True)
    elif of=='Jobs':
        df['text'] = df['city'] + '<br> ' + (df[of.lower()]).astype(str)+' '+ of
        df.sort_values(by=of.lower(), ascending = False, inplace = True)
    limits = [(50,3000),(20,50),(10,20),(3,10),(0,3)]
    colors = ['rgb(41, 128, 171)','rgb(104, 157, 46)','rgb(169, 140, 31)','rgb(178, 81, 28)','rgb(165, 28, 18)']
    cities = []
    scale = 25000
    start = 2
    for i in range(len(limits)):
        lim = limits[i]
        df_sub = df[lim[0]:lim[1]]
        city = dict(type = 'scattergeo',
            locationmode = 'USA-states',
            lon = df_sub['lon'],
            lat = df_sub['lat'],
            text = df_sub['text'],
            marker = dict(size = (df_sub['salary'])/(scale/start),
                color = colors[i],
                line = dict(width=0.5, color='rgb(255,255,255)'),
                sizemode = 'area'),
            name = '{0} - {1}'.format(lim[0],lim[1]) )
        cities.append(city)
        start+=50

    layout = dict(title = '2016 Data Science '+of+' by City',
            showlegend = True,
            geo = dict(scope='usa',
                projection=dict( type='albers usa' ),
                showland = True,
                landcolor = 'rgb(67, 67, 67)',
                subunitwidth=1,
                countrywidth=1,
                subunitcolor="rgb(255, 255, 255)",
                countrycolor="rgb(255, 255, 255)"),)

    fig = dict( data=cities, layout=layout )
    return fig


def bar_fig(city_data):
    bars = {}
    for i in city_data:
        city = i[0]
        data = i[1]
        color = i[2]
        bars[city] = Bar(x=list(data.Term.values), y=list(data.NumPostings.values),
                        marker=Marker(color=color), name=city)    

    data = Data([bars[i] for i in bars.keys()])

    layout = Layout(autosize=True, bargap=0.15, bargroupgap=0, barmode='group',
        boxgap=0.3, boxgroupgap=0.3, boxmode='overlay', dragmode='zoom',
        font=Font(color='#444', family="'Open sans', verdana, arial, sans-serif", size=12),
        height=579, hidesources=False, hovermode='x',
        legend=Legend(x=0.8986784140969163, y=0.8521303258145363,
            bgcolor='#fff',bordercolor='#444',borderwidth=0,
            font=Font(color='', family='', size=0),
            traceorder='normal',xanchor='left',yanchor='top'),
        margin=Margin(r=80,t=100,autoexpand=True,b=80,l=80,pad=0),
        paper_bgcolor='#fff', plot_bgcolor='#fff', separators='.,', showlegend=True,
        smith=False, title='Tools by job description',
        titlefont=dict(color='',family='',size=0),width=1000,
        xaxis=XAxis(anchor='y' ,autorange=True, autotick=True, domain=[0, 1],
            dtick=1, exponentformat='B', gridcolor='#eee', gridwidth=1,
            linecolor='#444', linewidth=1, mirror=False, nticks=0, overlaying=False,
            position=0, range=[-0.5, 27.5], rangemode='normal', showexponent='all',
            showgrid=False, showline=False, showticklabels=True, tick0=0, tickangle='auto',
            tickcolor='#444', tickfont=dict(color='',family='',size=0),ticklen=5,
            ticks='',tickwidth=1,title='<br>Language/tool',
            titlefont=dict(color='',family='',size=0),type='category',zeroline=False,
            zerolinecolor='#444',zerolinewidth=1),
        yaxis=YAxis(anchor='x', autorange=True, autotick=True,
            domain=[0, 1], dtick=10, exponentformat='B',gridcolor='#eee',gridwidth=1,
            linecolor='#444',linewidth=1,mirror=False,nticks=0,overlaying=False,
            position=0,range=[0, 54.90526315789474],rangemode='normal',showexponent='all',
            showgrid=True,showline=False,showticklabels=True,tick0=0,tickangle='auto',
            tickcolor='#444', tickfont=dict(color='',family='',size=0),
            ticklen=5, ticks='', tickwidth=1, title='Percentage of posts',
            titlefont=dict(color='', family='', size=0), type='linear',
            zeroline=True,zerolinecolor='#444',zerolinewidth=1) )

    fig = Figure(data=data, layout=layout)
    return fig

def network_plot(list_of_edges=[("God","Bad"),("Bad","Michevious"),("Michevious","Good")],color=None, title="Earth"):
    sets_of_edges = [set(i) for i in list_of_edges]

    edge_numbering = {}
    number = 0
    for i in list(set.union(*sets_of_edges)):
        edge_numbering[i]=number
        number += 1

    edges=[]
    for i in list_of_edges:
        edges.append((edge_numbering[i[0]],edge_numbering[i[1]]))

    N = len(edge_numbering.keys())
    
    text = list(set.union(*sets_of_edges))
    iG=ig.Graph(edges, directed=False)
    layt=iG.layout('kk', dim=3)
    
    Xn=[layt[k][0] for k in range(N)]# x-coordinates of nodes
    Yn=[layt[k][1] for k in range(N)]# y-coordinates
    Zn=[layt[k][2] for k in range(N)]# z-coordinates
    Xe=[]
    Ye=[]
    Ze=[]
    for e in edges:
        Xe+=[layt[e[0]][0],layt[e[1]][0], None]# x-coordinates of edge ends
        Ye+=[layt[e[0]][1],layt[e[1]][1], None]  
        Ze+=[layt[e[0]][2],layt[e[1]][2], None]

    t_1=Scatter3d(x=Xe, y=Ye, z=Ze, mode='lines', 
        line=Line(color='rgb(125,125,125)', width=1),hoverinfo='none')

    t_2=Scatter3d(x=Xn, y=Yn, z=Zn, mode='markers', name='actors',
        marker=Marker(symbol='dot', size=6, color=color,
            line=Line(color='rgb(50,50,50)', width=0.5)),text=text,hoverinfo='text')

    axis=dict(showbackground=False, showline=False, zeroline=False,
        showgrid=False, showticklabels=False, title='')

    layout = Layout(title=title, width=1000, height=1000, showlegend=False,
        scene=Scene(xaxis=XAxis(axis), yaxis=YAxis(axis), zaxis=ZAxis(axis),),
        margin=Margin(t=100), hovermode='closest', annotations=Annotations([
            Annotation(showarrow=False, 
                text="Data source: <a href='http://stackoverflow.com'>[1]</a>",
                xref='paper', yref='paper', x=0, y=0.1, xanchor='left',   
                yanchor='bottom', font=Font(size=14))]),)

    data=Data([t_1, t_2])
    fig=Figure(data=data, layout=layout)
    return fig

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 7 days

In [ ]:

##---------------##
##--- Do work ---##
##---------------##

# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_us_cities.csv')
# df['states'] = get_states_from_lat_lon(df)
# df['city'] = [i.rstrip() for i in list(df['name'].values)]

df1 = pd.read_csv('/Users/jazon/Downloads/US_1/US.txt',sep="\t",header=None) # http://download.geonames.org/export/zip/US.zip
df1.columns=['country','postal','city','state','state_abr','county','county_num','adminname3','admincode3','lat','lon','accuracy']

df1.drop_duplicates(subset=['city','state'],inplace=True)
df1.drop(['country','postal','state_abr','county','county_num','adminname3','admincode3','accuracy'], axis=1, inplace=True)

data_frames = {}
ticker = int(round(30661/20))
begin = 0
for i in range(0,20):
    end = begin+ticker
    data_frames[i] = df1.iloc[begin:end,:]
    begin+=ticker
    
mu_sals = lambda x: get_avg_salary(loc=['+'.join(str(x['city']).split(' ')),'+'.join(str(x['state']).split(' '))])
    
for i in data_frames.keys():
    data_frames[i]['salary'],data_frames[i]['jobs'] = zip(*data_frames[i].apply(mu_sals,axis=1).tolist())
    print "finished", i
    
df = pd.concat([data_frames[i] for i in data_frames.keys()]).sort_values(by="jobs".lower(), ascending = False)
df = df.drop(df.index[[0]])

In [221]:

cols = ['rgb(230, 145, 56)','rgb(241, 194, 50)',  'rgb(106, 168, 79)','rgb(69, 129, 142)', 'rgb(7, 55, 99)']

top_5 = zip(
    df[0:5].apply(lambda x: x.loc["state"],axis=1).tolist(), 
    df[0:5].apply(lambda x: x.loc["city"],axis=1).tolist(),
    cols
   )

city_data = []
for i,j,k in top_5:
    city_data.append((j,skills_info(city=j,state=i)[1],k))

In [ ]:

DS_info = skills_info()
plotly.offline.iplot( DS_info[0], validate=False )

In [ ]:

r_query = stack_graph(search_term="r")
print len(r_query.keys())

In [ ]:

python_query = stack_graph(search_term="python")
print len(python_query.keys())

In [4]:

r_graph = create_graph(r_query)
r_graph_to_prune = nx.isolates(r_graph) 
r_graph.remove_nodes_from(r_graph_to_prune)

python_graph = create_graph(python_query)
python_graph_to_prune = nx.isolates(python_graph) 
python_graph.remove_nodes_from(python_graph_to_prune)

In [5]:

print len(nx.degree(r_graph).values())
print len(nx.degree(python_graph).values())

print np.mean(nx.degree(r_graph).values())
print np.mean(nx.degree(python_graph).values())

38987
8990
1.72785800395
2.05205784205

In [6]:

r_k_clique = list(nx.k_clique_communities(r_graph, 3))
r_communities = [list(i) for i in list(r_k_clique)]

python_k_clique = list(nx.k_clique_communities(python_graph, 3))
python_communities = [list(i) for i in list(python_k_clique)]

In [7]:

r_community_graph_edges = []
color_clock = 0
for i in r_communities:
    for ii in i:
        for iii in list(nx.all_neighbors(r_graph,ii)):
            r_community_graph_edges.append((ii,iii))

    
r_community_graph = nx.Graph()
r_community_graph.add_edges_from(r_community_graph_edges)
r_graph_fig = network_plot(r_community_graph.edges(),
                           color='rgb(223, 192, 173)',
                           title="R network modules on Stackoverflow")


python_community_graph_edges = []
for i in python_communities:
    for ii in i:
        for iii in list(nx.all_neighbors(python_graph,ii)):
            python_community_graph_edges.append((ii,iii))
 
    
python_community_graph = nx.Graph()
python_community_graph.add_edges_from(python_community_graph_edges)
python_graph_fig = network_plot(python_community_graph.edges(),
                                color='rgb(147, 199, 141)',
                                title="Python network modules on Stackoverflow")

Top data science tools and languages¶

Data science is an interdisciplinary endeavour, and it serves the purpose of extracting knowledge or insight from varying sources of information. It is not surprising, then, that the tools used by data scientists are myriad, but what are the most valued and popular programming languages in a data scientists tool-box? A 2014 KD nuggets poll¹ suggest R, Python, SAS and SQL are among the top contenders, and indeed it is generally the case that almost every posting on the topic will mention at least 3 of these tools. There even seems to be a not-so-silent² competition between the R and Python community in terms crowning either as 'best' for performing data science tasks.

A nice post by Martijn Theuwissen³ in 2015 summarizes the current state of the art in R and Python comparisons. So, is R or Python better for data science? Well, that depends, Martijn suggests. This is, of course, exactly correct. R and Python are defined by their differences, and both have unique advantages.

R¶

R started in many academic circles as a free alternative to Matlab. Due to the ability of academics to freely create, distribute and use packages developed by the community, R took off as the lingua franca of data analysis. The size of the R community has exploded and CRAN, the repository for R packages, has swelled. Almost every analysis task has a specific R package devoted to performing it. The native plotting libraries are easy to use, which enables quick data visualization and exploration and packages like ggplot2 as well as others give R users powerful tools for more effective and aesthetic portrayals of their analysis results.

Some notable recent additions to the R ecosystem are Rstudio and the Rshiny web-app framework. Rstudio enables quick, easy and interactive data analysis and exploration whose reproducible results can be saved and shared among colleagues and the community. Rshiny enables easy R-powered web-app development for data-analysis dashboards or reporting tools for quantitative managers.

However, if you are starting off as a programmer, then R is relatively difficult to learn. Quirks that many come to love in R are the bane of new users. R is considered slow by some standards, but that view is changing due to developments like (now) Microsofts Revolution R, and APIs for distributed computing libraries such as SparkR.

R is an analysis language, and as such it is rarely used for anything else. Indeed, Stackoverflow questions are highly linked and demonstrate a tight-nit user and development community.

In [8]:

plotly.offline.iplot(r_graph_fig)

Python¶

Python is a quick and multi-use language that is reletively easy to learn. In terms of its data-crunching ecosystem, Python has seen rapid growth. Noteable is the rich suite of user-facing machine learning packages like Scikit-learn, XGBoost, and others. Pandas has brought data analysis in Python to a new high in term of making data analysis as naural and easy as it is in R. Python is host to a number of rich web-development frameworks that are used not only for building data science dash boards, but also for full-scale web-apps. The pluggability of web-app development in Python has enabled the fully expressive development of integrative and powerful web-apps. Flask and Django lead the way in terms of the Python web-app development landscape, but Bottle and Pyramid are also quite popular.

Like R, Python is also considered slow in comparison to the likes of C or C++. Unlike R, the Python data science ecosystem lacks a star IDE like Rstudio. Both of these outside criticisms are not unmet, however, as several easy-to-use tutorials describe development with Cython, which taps into the speed of C using slightly different C-like syntax. In addition, Yhat has released a very nice Python IDE, Rodeo, that mimics the look and feel of Rstudio, although it is still in early release stages. The Beaker notebook, although not specific to Python, is also a nice IDE for data science and has labeled itself the "data scientists laboratory", as it allows for development in a number of data science related languages.

Unlike R, Python is used for a myriad of general purpose programming and piping tasks. Although a powerful language for data analysis, Python is a general programming language that is not limited to its number-crunching abilities. Indeed, Stackoverflow question links and their communities are focused around general development or language-specific usage topics.

In [9]:

plotly.offline.iplot(python_graph_fig)

Query Languages¶

Most businesses house their data in structured databases, such as MySQL, PostgreSQL, other SQL-accessible systems or NoSQL. As such, being able to interact with these systems by writing SQL-based queries to extract data is an important and incredibly valuable skill for most analysts and data scientists.

Distributed query¶

As data sets become vastly larger, deep knowledge of distributed storage, computation and querying is becoming among the most valuable skillsets available for data scientists. Many of the interfaces to these systems are SQL-like, and Big Data architectures are rapidly becoming the norm for business storage solutions. Among the more popular interfaces to Big Data architectures are Pig, Hive, and SparkSQL, but there are some interesting new developments like Apache Drill, which promises "Schema-free SQL Query Engine for Hadoop, NoSQL and Cloud Storage".

Julia¶

Although new, the idea of using Julia as a common language for data science tasks has grown in popularity. Julia is a new programming language marketed as having the strengths of both R and Python, but without any of the weaknesses. It is too early to say how many data scientist will add Julia to their toolkit, but it has thus-far demonstrated itself as an exciting and powerful new data science-oriented language.

But what some of the other tools or programming languages valued by the data science community and, perhaps more importantly, companies looking to hire data scientists? Some go by github rankings or other program language usage/popularity indices⁴, but it is a difficult task to differentiate between data science users and general software development for general use-cases (particularly for Python, which can be used for non-data science related tasks). However, if we take a more practical approach, we can look directly at where the data science jobs are, and even ask what companies in the most job-rich areas are looking for.

Where are all the data science jobs?¶

Indeed.com is one of the more popular websites to list public job openings, so we looked to see which US cities were posting the most data science jobs. Cities with job postings matching "data scientist" were queried; the results are shown below.

In [183]:

plotly.offline.iplot(map_fig(df,of='Jobs'), validate=False)

As expected, both coasts are rich in data science jobs, with New York City, NY, San Francisco, CA, Seattle, WA, and Chicago, IL taking the top positions, with Boston, Redmond and Washington DC not too far behind. That is not to say that there aren't data science positions all across the country. Notable are the several listings in in Denver, CO, Atlanta, GA, St. Louis and elsewhere. We expect this map to grow and light up over the coming years as data science takes strides into mainstream business practice. If you are currently looking for a position as a data scientist, there is no lack of opportunity.

But what are companies looking for?¶

We came across a very nice posting by Jesse Steinweg-Woods⁵, which demonstrates how to use Beautiful Soup and other python tools to scrape Indeed.com for data science job postings. To determine what companies are generally looking for in a data scientist, we used this approach to look at the top 5 cities for data science jobs we tallied how often a particular language or tool was mentioned across job posts.

In [229]:

plotly.offline.iplot(bar_fig(city_data), validate=False )

In agreement with common wisdom, R and Python are reliably the top two programming languages that companies want their data scientists to know. SQL, comes in a clear third, with Java, Hadoop, Spark, Pig and Hive trailing behind. Surprising to us was the prevalence of Excel, Matlab, SAS, SPSS, Tableu, which are not always thought of as the most popular of toolsets among data scientists. Julia has a surprise showing. It is still a young in its development cycle, but Julia seems to be an increasingly popular language for use in data science tasks.

Summary¶

The top tools and languages for data science are rather consistent if not slightly varying across region, and knowledge in R, Python and SQL are in general the most sought-after skills. What is clear is that an understanding of and skills in the use of languages that interact with Big Data storage and compute architecture is quickly becoming a must for practicing data scientists. Julia is now on the job-market map, and it will be interesting to see how quickly Julia will spread in the data sciences.

Links to cited pages¶

¹ http://www.kdnuggets.com/2014/08/four-main-languages-analytics-data-mining-data-science.html
² http://www.infoworld.com/article/2951779/application-development/in-data-science-the-r-language-is-swallowing-python.html
³ http://www.kdnuggets.com/2015/05/r-vs-python-data-science.html
⁴ http://spectrum.ieee.org/computing/software/the-2015-top-ten-programming-languages
⁵ https://jessesw.com/Data-Science-Skills/

Links to popular data science tools/languages¶

Python: https://www.python.org/
Scikit-learn: http://scikit-learn.org/stable/
Pandas: http://pandas.pydata.org/
XGBoost: https://github.com/dmlc/xgboost
Django: https://www.djangoproject.com/
Flask: http://flask.pocoo.org/
Bottle: http://bottlepy.org/docs/dev/index.html
Pyrmid: http://www.pylonsproject.org/
Cython: http://cython.org/
Beautiful Soup: http://www.crummy.com/software/BeautifulSoup/
R: https://www.r-project.org/
Revolution R: http://www.revolutionanalytics.com/revolution-r-enterprise
Rstuido: https://www.rstudio.com/home/
Rshiny: http://shiny.rstudio.com/
Java: https://www.java.com/
SQL: https://en.wikipedia.org/wiki/SQL
PostgreSQL: http://www.postgresql.org/
NoSQL: https://en.wikipedia.org/wiki/NoSQL
MySQL: https://www.mysql.com/
Drill: https://drill.apache.org/
Hadoop: http://hadoop.apache.org/
Hive: https://hive.apache.org/
Pig: https://pig.apache.org/
Spark: http://spark.apache.org/
PySpark:https://spark.apache.org/docs/0.9.0/python-programming-guide.html
SparkR: https://amplab-extras.github.io/SparkR-pkg/
C: https://en.wikipedia.org/wiki/C_(programming_language)
C++: https://en.wikipedia.org/wiki/C%2B%2B
Matlab: http://www.mathworks.com/products/matlab/
SAS: https://www.sas.com/en_us/home.html
Scala: http://www.scala-lang.org/
Perl: https://www.perl.org/
Ruby: https://www.ruby-lang.org/en/
MS Excel: https://products.office.com/en-us/excel
SPSS: http://www-01.ibm.com/software/analytics/spss/
Tableau: http://www.tableau.com/
HBase: https://hbase.apache.org/
SparkSQL: https://spark.apache.org/sql/
Javascript: https://en.wikipedia.org/wiki/JavaScript
Cassandra: http://cassandra.apache.org/
Mahout: http://mahout.apache.org/
MapReduce: https://hadoop.apache.org/docs/r1.2.1/mapred_tutorial.html
D3: https://d3js.org/
Oozie: https://oozie.apache.org/
Julia: http://julialang.org/
MongoDB: https://www.mongodb.org/
Zookeeper: https://zookeeper.apache.org/
Flume: https://flume.apache.org/
Beaker Notebook: http://beakernotebook.com/