In [2]:

%matplotlib inline 
import matplotlib.pyplot as plt 
import pandas 
import numpy as np

import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in('erikrood','3eqsrype8v')

In [26]:

NIPE = pandas.read_csv('/Users/erikrood/desktop/ipython_datasets/NIPE_main_cleaned.csv') 

In [144]:

NIPE.head(5)

Out[144]:

	Company_name	NIPE_MM	Sector	Industry	Exchange	Market_Cap	Employees_Full_Time	Net_income_MM	Clean_market_cap
0	Host Hotels & Resorts Inc	2675	Financials	Real Estate Investment Trusts (REITs)	NYSE	$11.8B	240	$642,000	12
1	Gilead Sciences Inc	2168	Health Care	Biotechnology	NASDAQ	$118.9B	8000	$17,341,000	119
2	Realty Income Corp.	1967	Financials	Real Estate Investment Trusts (REITs)	NYSE	$14.9B	132	$259,665	15
3	Welltower Inc	1719	Financials	Real Estate Investment Trusts (REITs)	NYSE	$24.6B	476	$818,344	25
4	Equity Residential	1221	Financials	Real Estate Investment Trusts (REITs)	NYSE	$24.9B	3500	$4,272,041	25

In [29]:

df = NIPE [[3,1]]
df[['NIPE_MM']] = df[['NIPE_MM']].astype(float)
df.head()

Out[29]:

	Industry	NIPE_MM
0	Real Estate Investment Trusts (REITs)	2675.0
1	Biotechnology	2168.0
2	Real Estate Investment Trusts (REITs)	1967.0
3	Real Estate Investment Trusts (REITs)	1719.0
4	Real Estate Investment Trusts (REITs)	1221.0

In [73]:

average_NIPE_industry = df.groupby('Industry').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
average_NIPE_industry_top7 = average_NIPE_industry.head(7)
average_NIPE_industry_top7.head(7)

Out[73]:

	Industry	NIPE_MM
0	Tobacco	609.0
1	Real Estate Investment Trusts (REITs)	487.0
2	Biotechnology	293.0
3	Diversified Financial Services	184.0
4	Road & Rail	130.0
5	Consumer Finance	128.0
6	Semiconductors & Semiconductor Equipment	106.0

In [75]:

average_NIPE_industry_bottom7 = average_NIPE_industry.tail(7)
average_NIPE_industry_bottom7.head(7)

Out[75]:

	Industry	NIPE_MM
53	Food & Staples Retailing	10.0
54	Auto Components	8.0
55	Containers & Packaging	8.0
56	Multiline Retail	7.0
57	Metals & Mining	-20.0
58	Energy Equipment & Services	-21.0
59	Oil, Gas & Consumable Fuels	-421.0

In [82]:

#too lazy to create function, but dropping everything less the top 7 and bottom 7 companies by NIPE
average_NIPE_industry.drop([7,8,9,10,11,12,13,14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52], inplace = True)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-82-358bd5250706> in <module>()
----> 1 average_NIPE_industry.drop([7,8,9,10,11,12,13,14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52], inplace = True)
      2 average_NIPE_industry.head()

/Users/erikrood/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in drop(self, labels, axis, level, inplace, errors)
   1871                 new_axis = axis.drop(labels, level=level, errors=errors)
   1872             else:
-> 1873                 new_axis = axis.drop(labels, errors=errors)
   1874             dropped = self.reindex(**{axis_name: new_axis})
   1875             try:

/Users/erikrood/anaconda/lib/python2.7/site-packages/pandas/indexes/base.pyc in drop(self, labels, errors)
   2964             if errors != 'ignore':
   2965                 raise ValueError('labels %s not contained in axis' %
-> 2966                                  labels[mask])
   2967             indexer = indexer[~mask]
   2968         return self.delete(indexer)

ValueError: labels [ 7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52] not contained in axis

In [84]:

average_NIPE_industry.head(14)

Out[84]:

	Industry	NIPE_MM
0	Tobacco	609.0
1	Real Estate Investment Trusts (REITs)	487.0
2	Biotechnology	293.0
3	Diversified Financial Services	184.0
4	Road & Rail	130.0
5	Consumer Finance	128.0
6	Semiconductors & Semiconductor Equipment	106.0
53	Food & Staples Retailing	10.0
54	Auto Components	8.0
55	Containers & Packaging	8.0
56	Multiline Retail	7.0
57	Metals & Mining	-20.0
58	Energy Equipment & Services	-21.0
59	Oil, Gas & Consumable Fuels	-421.0

In [136]:

#bar chart


x = average_NIPE_industry['Industry']
y = average_NIPE_industry['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(255, 217, 102)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by industry [top 7/bottom 7 industries]',
    xaxis=dict(
        title='industry',
        tickangle = 47,
    ),
    yaxis=dict(
        title='avg. NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry')

Out[136]:

In [111]:

#scatter of employees vs. NI,(could drill into sectors/industries)
df4 = NIPE [[6,7]]
df4[['Employees_Full_Time']] = df4[['Employees_Full_Time']].astype(int)
df4.head()
df5 = df4.groupby('Net_income_MM').sum().round(decimals=0).reset_index()
df5.head()

Out[111]:

	Net_income_MM	Employees_Full_Time
0	$1,000,000	6700
1	$1,002,100	47145
2	$1,010,551	39556
3	$1,020,661	77800
4	$1,028,380	180835

In [115]:

#Creating a better looking scatterplot of Cereal vs. Year

_x = df5['Employees_Full_Time']
_y = df5['Net_income_MM']

# Create a trace
trace = go.Scatter(
    x = _x,
    y = _y,

    mode = 'markers',
        marker = dict(
        size = 6,
        color = 'rgb(156, 231, 193)',

    )
       
)

data = [trace]

layout =  dict(title='net income vs. number of employees',
    hovermode='closest',
    xaxis=dict(
        title='Net Income (MM)',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
               yaxis=dict(
        title='# of employees',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
)


# Plot and embed in ipython notebook
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='NIPE_vs_Employees')

Out[115]:

In [123]:

#rank top/bottom companies
df3 = NIPE [[0,1]]
#top 10
df6 = df3.head(10)
#bottom 10
df7 = df3.tail(10)

In [135]:

#bar chart

x = df6['Company_name']
y = df6['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(247, 135, 22)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by company [top 10]',
    xaxis=dict(
        title='company',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_top10')

Out[135]:

In [134]:

#bar chart, bottom 10

x = df7['Company_name']
y = df7['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(84, 226, 129)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by company [bottom 10]',
    xaxis=dict(
        title='company',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_bottom10')

Out[134]:

In [137]:

#splitting out IT companies
IT_companies = ['Information Technology']
NIPE_IT = NIPE[NIPE['Sector'].isin(IT_companies)]

In [138]:

NIPE_IT.head()

Out[138]:

	Company_name	NIPE_MM	Sector	Industry	Exchange	Market_Cap	Employees_Full_Time	Net_income_MM	Clean_market_cap
12	Visa Inc	607	Information Technology	IT Services	NYSE	$184.2B	11300	$6,857,000	184
20	Apple Inc	461	Information Technology	Technology Hardware, Storage & Peripherals	NASDAQ	$513.5B	110000	$50,678,000	514
23	Facebook Inc	369	Information Technology	Internet Software & Services	NASDAQ	$336.3B	12691	$4,686,000	336
27	MasterCard Inc	332	Information Technology	IT Services	NYSE	$106.9B	11300	$3,747,000	107
29	Alibaba Group Holding Ltd	305	Information Technology	Internet Software & Services	NYSE	$192B	34985	$10,673,110	192

In [140]:

df8 = NIPE_IT[[0,1]]
df8.head()

Out[140]:

	Company_name	NIPE_MM
12	Visa Inc	607
20	Apple Inc	461
23	Facebook Inc	369
27	MasterCard Inc	332
29	Alibaba Group Holding Ltd	305

In [143]:

IT_NIPE_cleaned = df8.groupby('Company_name').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
IT_NIPE_cleaned.head(5)

Out[143]:

	Company_name	NIPE_MM
0	Visa Inc	607
1	Apple Inc	461
2	Facebook Inc	369
3	MasterCard Inc	332
4	Alibaba Group Holding Ltd	305

In [153]:

#standard deviation in NIPE by industry
df9 = NIPE [[3,1]]
#converting NIPE to float/#
df9[['NIPE_MM']] = df9[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df10 = df9.groupby('Industry', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)

In [157]:

#cleaning column names
df10.columns = ['Industry', 'NIPE_Std_dev'] #renaming columns
df10.head(5)

Out[157]:

	Industry	NIPE_Std_dev
44	Oil, Gas & Consumable Fuels	1092.0
48	Real Estate Investment Trusts (REITs)	989.0
7	Biotechnology	596.0
56	Tobacco	427.0
41	Metals & Mining	331.0

In [158]:

#standard deviation in NIPE by sector
df11 = NIPE [[2,1]]
#converting NIPE to float/#
df11[['NIPE_MM']] = df11[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df12 = df11.groupby('Sector', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)

In [160]:

df12.columns = ['Sector', 'NIPE_Std_dev'] #renaming columns
df12.head(10)

Out[160]:

	Sector	NIPE_Std_dev
2	Energy	1058.0
3	Financials	510.0
4	Health Care	281.0
7	Materials	204.0
1	Consumer Staples	182.0
6	Information Technology	135.0
8	Telecommunication Services	73.0
0	Consumer Discretionary	56.0
5	Industrials	48.0
9	Utilities	40.0

In [162]:

#avg NIPE by sector
df13 = NIPE [[2,1]]
df14 = df13.groupby('Sector').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
df14.columns = ['Sector', 'Avg_NIPE'] #renaming columns
df14.head(5)

Out[162]:

	Sector	Avg_NIPE
0	Financials	185.0
1	Health Care	92.0
2	Consumer Staples	86.0
3	Utilities	83.0
4	Information Technology	74.0

In [164]:

#joining avg_NIPE w/ standard deviation NIPE table
j1 = df14.merge(df12, how='left', on="Sector")
j1.head(10)

Out[164]:

	Sector	Avg_NIPE	NIPE_Std_dev
0	Financials	185.0	510.0
1	Health Care	92.0	281.0
2	Consumer Staples	86.0	182.0
3	Utilities	83.0	40.0
4	Information Technology	74.0	135.0
5	Industrials	43.0	48.0
6	Telecommunication Services	40.0	73.0
7	Consumer Discretionary	38.0	56.0
8	Materials	35.0	204.0
9	Energy	-393.0	1058.0

In [169]:

#bar chart

x = j1['Sector']
y1 = j1['Avg_NIPE']
y2 = j1['NIPE_Std_dev']


trace0 = go.Bar(
        x=x,
        y=y1,
        name='Average NIPE',
        marker=dict(
            color='rgb(255, 217, 102)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )

trace1 = go.Bar(
        x=x,
        y=y2,
        name='Standard deviation',
        marker=dict(
            color='rgb(84, 226, 129)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
        
data = [trace0, trace1]
    
layout = go.Layout(
      title='mean NIPE by sector, with standard deviation',
    xaxis=dict(
        title='industry',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_sector_stddev')

Out[169]:

In [231]:

#setting up y values for box plot (step 0), loading in the data I want
sector_nipe = NIPE[['Sector', 'NIPE_MM']]
sector_nipe[['NIPE_MM']] = sector_nipe[['NIPE_MM']].astype(float)

In [222]:

#setting up x variables for box plot
x_data1 = {'Consumer Discretionary','Consumer Staples','Energy',\
         'Financials', 'Health Care','Industrials',\
          'Information Technology','Materials','Telecommunication Services','Utilities'}
x_data1

Out[222]:

{'Consumer Discretionary',
 'Consumer Staples',
 'Energy',
 'Financials',
 'Health Care',
 'Industrials',
 'Information Technology',
 'Materials',
 'Telecommunication Services',
 'Utilities'}

In [224]:

#setting up y values for box plot (step 1)
sector_CD = sector_nipe[sector_nipe['Sector'] == 'Consumer Discretionary'].copy()
ConsumerDiscretionary = sector_CD[['NIPE_MM']].copy()

sector_CS = sector_nipe[sector_nipe['Sector']== 'Consumer Staples'].copy()
ConsumerStaples = sector_CS[['NIPE_MM']].copy()

sector_energy = sector_nipe[sector_nipe['Sector'] == 'Energy'].copy()
Energy = sector_energy[['NIPE_MM']].copy()

sector_financials = sector_nipe[sector_nipe['Sector'] == 'Financials'].copy()
Financials = sector_financials[['NIPE_MM']].copy()

sector_healthcare = sector_nipe[sector_nipe['Sector'] == 'Health Care'].copy()
Healthcare = sector_healthcare[['NIPE_MM']].copy()

sector_industrials = sector_nipe[sector_nipe['Sector'] == 'Industrials'].copy()
Industrials = sector_industrials[['NIPE_MM']].copy()

sector_IT = sector_nipe[sector_nipe['Sector'] == 'Information Technology'].copy()
InformationTechnology = sector_IT[['NIPE_MM']].copy()

sector_materials = sector_nipe[sector_nipe['Sector'] == 'Materials'].copy()
Materials = sector_materials[['NIPE_MM']].copy()

sector_TS = sector_nipe[sector_nipe['Sector'] == 'Telecommunication Services'].copy()
TelecommunicationServices = sector_TS[['NIPE_MM']].copy()

sector_utilities = sector_nipe[sector_nipe['Sector'] == 'Utilities'].copy()
Utilities = sector_utilities[['NIPE_MM']].copy()

#setting up y values for box plot (step 2)
y_data = [
    Industrials.values,
    Energy.values,
    Utilities.values,
    ConsumerStaples.values,
    Healthcare.values,
    Materials.values,
    TelecommunicationServices.values,
    Financials.values,
    ConsumerDiscretionary.values,
    InformationTechnology.values
]

In [234]:

#putting together the box plot
traces = []
for xd, yd in zip(x_data1, y_data):
        traces.append(go.Box(
            y=yd,
            name=xd,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            marker=dict(
                size=2,
            ),
            line=dict(width=1),
        ))

layout = go.Layout(
    title='NIPE distribution by sector',
    yaxis=dict(
        range=[-300, 350],
        showgrid=True,
        zeroline=True,
        #dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=go.Margin(
        l=50,
        r=90,
        b=280,
        t=60,
        pad=4
    ),
    width=800,
    height=650,
    showlegend=False
)

    
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename = 'NIPE_distribution_sector')

Out[234]:

In [ ]: