%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in('erikrood','3eqsrype8v')
NIPE = pandas.read_csv('/Users/erikrood/desktop/ipython_datasets/NIPE_main_cleaned.csv')
NIPE.head(5)
Company_name | NIPE_MM | Sector | Industry | Exchange | Market_Cap | Employees_Full_Time | Net_income_MM | Clean_market_cap | |
---|---|---|---|---|---|---|---|---|---|
0 | Host Hotels & Resorts Inc | 2675 | Financials | Real Estate Investment Trusts (REITs) | NYSE | $11.8B | 240 | $642,000 | 12 |
1 | Gilead Sciences Inc | 2168 | Health Care | Biotechnology | NASDAQ | $118.9B | 8000 | $17,341,000 | 119 |
2 | Realty Income Corp. | 1967 | Financials | Real Estate Investment Trusts (REITs) | NYSE | $14.9B | 132 | $259,665 | 15 |
3 | Welltower Inc | 1719 | Financials | Real Estate Investment Trusts (REITs) | NYSE | $24.6B | 476 | $818,344 | 25 |
4 | Equity Residential | 1221 | Financials | Real Estate Investment Trusts (REITs) | NYSE | $24.9B | 3500 | $4,272,041 | 25 |
df = NIPE [[3,1]]
df[['NIPE_MM']] = df[['NIPE_MM']].astype(float)
df.head()
Industry | NIPE_MM | |
---|---|---|
0 | Real Estate Investment Trusts (REITs) | 2675.0 |
1 | Biotechnology | 2168.0 |
2 | Real Estate Investment Trusts (REITs) | 1967.0 |
3 | Real Estate Investment Trusts (REITs) | 1719.0 |
4 | Real Estate Investment Trusts (REITs) | 1221.0 |
average_NIPE_industry = df.groupby('Industry').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
average_NIPE_industry_top7 = average_NIPE_industry.head(7)
average_NIPE_industry_top7.head(7)
Industry | NIPE_MM | |
---|---|---|
0 | Tobacco | 609.0 |
1 | Real Estate Investment Trusts (REITs) | 487.0 |
2 | Biotechnology | 293.0 |
3 | Diversified Financial Services | 184.0 |
4 | Road & Rail | 130.0 |
5 | Consumer Finance | 128.0 |
6 | Semiconductors & Semiconductor Equipment | 106.0 |
average_NIPE_industry_bottom7 = average_NIPE_industry.tail(7)
average_NIPE_industry_bottom7.head(7)
Industry | NIPE_MM | |
---|---|---|
53 | Food & Staples Retailing | 10.0 |
54 | Auto Components | 8.0 |
55 | Containers & Packaging | 8.0 |
56 | Multiline Retail | 7.0 |
57 | Metals & Mining | -20.0 |
58 | Energy Equipment & Services | -21.0 |
59 | Oil, Gas & Consumable Fuels | -421.0 |
#too lazy to create function, but dropping everything less the top 7 and bottom 7 companies by NIPE
average_NIPE_industry.drop([7,8,9,10,11,12,13,14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52], inplace = True)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-82-358bd5250706> in <module>() ----> 1 average_NIPE_industry.drop([7,8,9,10,11,12,13,14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52], inplace = True) 2 average_NIPE_industry.head() /Users/erikrood/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in drop(self, labels, axis, level, inplace, errors) 1871 new_axis = axis.drop(labels, level=level, errors=errors) 1872 else: -> 1873 new_axis = axis.drop(labels, errors=errors) 1874 dropped = self.reindex(**{axis_name: new_axis}) 1875 try: /Users/erikrood/anaconda/lib/python2.7/site-packages/pandas/indexes/base.pyc in drop(self, labels, errors) 2964 if errors != 'ignore': 2965 raise ValueError('labels %s not contained in axis' % -> 2966 labels[mask]) 2967 indexer = indexer[~mask] 2968 return self.delete(indexer) ValueError: labels [ 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52] not contained in axis
average_NIPE_industry.head(14)
Industry | NIPE_MM | |
---|---|---|
0 | Tobacco | 609.0 |
1 | Real Estate Investment Trusts (REITs) | 487.0 |
2 | Biotechnology | 293.0 |
3 | Diversified Financial Services | 184.0 |
4 | Road & Rail | 130.0 |
5 | Consumer Finance | 128.0 |
6 | Semiconductors & Semiconductor Equipment | 106.0 |
53 | Food & Staples Retailing | 10.0 |
54 | Auto Components | 8.0 |
55 | Containers & Packaging | 8.0 |
56 | Multiline Retail | 7.0 |
57 | Metals & Mining | -20.0 |
58 | Energy Equipment & Services | -21.0 |
59 | Oil, Gas & Consumable Fuels | -421.0 |
#bar chart
x = average_NIPE_industry['Industry']
y = average_NIPE_industry['NIPE_MM']
data = [
go.Bar(
x=x,
y=y,
marker=dict(
color='rgb(255, 217, 102)',
line=dict(
color='rgb(8,48,107)',
width=1.5
),
),
opacity=0.6
)
]
layout = go.Layout(
title='net income per employee by industry [top 7/bottom 7 industries]',
xaxis=dict(
title='industry',
tickangle = 47,
),
yaxis=dict(
title='avg. NIPE (MM)'
),
margin=go.Margin(
b = 220
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry')
#scatter of employees vs. NI,(could drill into sectors/industries)
df4 = NIPE [[6,7]]
df4[['Employees_Full_Time']] = df4[['Employees_Full_Time']].astype(int)
df4.head()
df5 = df4.groupby('Net_income_MM').sum().round(decimals=0).reset_index()
df5.head()
Net_income_MM | Employees_Full_Time | |
---|---|---|
0 | $1,000,000 | 6700 |
1 | $1,002,100 | 47145 |
2 | $1,010,551 | 39556 |
3 | $1,020,661 | 77800 |
4 | $1,028,380 | 180835 |
#Creating a better looking scatterplot of Cereal vs. Year
_x = df5['Employees_Full_Time']
_y = df5['Net_income_MM']
# Create a trace
trace = go.Scatter(
x = _x,
y = _y,
mode = 'markers',
marker = dict(
size = 6,
color = 'rgb(156, 231, 193)',
)
)
data = [trace]
layout = dict(title='net income vs. number of employees',
hovermode='closest',
xaxis=dict(
title='Net Income (MM)',
ticklen=5,
zeroline=False,
gridwidth=2,
),
yaxis=dict(
title='# of employees',
ticklen=5,
zeroline=False,
gridwidth=2,
),
)
# Plot and embed in ipython notebook
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='NIPE_vs_Employees')
#rank top/bottom companies
df3 = NIPE [[0,1]]
#top 10
df6 = df3.head(10)
#bottom 10
df7 = df3.tail(10)
#bar chart
x = df6['Company_name']
y = df6['NIPE_MM']
data = [
go.Bar(
x=x,
y=y,
marker=dict(
color='rgb(247, 135, 22)',
line=dict(
color='rgb(8,48,107)',
width=1.5
),
),
opacity=0.6
)
]
layout = go.Layout(
title='net income per employee by company [top 10]',
xaxis=dict(
title='company',
tickangle = 47,
),
yaxis=dict(
title='NIPE (MM)'
),
margin=go.Margin(
b = 220
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_top10')
#bar chart, bottom 10
x = df7['Company_name']
y = df7['NIPE_MM']
data = [
go.Bar(
x=x,
y=y,
marker=dict(
color='rgb(84, 226, 129)',
line=dict(
color='rgb(8,48,107)',
width=1.5
),
),
opacity=0.6
)
]
layout = go.Layout(
title='net income per employee by company [bottom 10]',
xaxis=dict(
title='company',
tickangle = 47,
),
yaxis=dict(
title='NIPE (MM)'
),
margin=go.Margin(
b = 220
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_bottom10')
#splitting out IT companies
IT_companies = ['Information Technology']
NIPE_IT = NIPE[NIPE['Sector'].isin(IT_companies)]
NIPE_IT.head()
Company_name | NIPE_MM | Sector | Industry | Exchange | Market_Cap | Employees_Full_Time | Net_income_MM | Clean_market_cap | |
---|---|---|---|---|---|---|---|---|---|
12 | Visa Inc | 607 | Information Technology | IT Services | NYSE | $184.2B | 11300 | $6,857,000 | 184 |
20 | Apple Inc | 461 | Information Technology | Technology Hardware, Storage & Peripherals | NASDAQ | $513.5B | 110000 | $50,678,000 | 514 |
23 | Facebook Inc | 369 | Information Technology | Internet Software & Services | NASDAQ | $336.3B | 12691 | $4,686,000 | 336 |
27 | MasterCard Inc | 332 | Information Technology | IT Services | NYSE | $106.9B | 11300 | $3,747,000 | 107 |
29 | Alibaba Group Holding Ltd | 305 | Information Technology | Internet Software & Services | NYSE | $192B | 34985 | $10,673,110 | 192 |
df8 = NIPE_IT[[0,1]]
df8.head()
Company_name | NIPE_MM | |
---|---|---|
12 | Visa Inc | 607 |
20 | Apple Inc | 461 |
23 | Facebook Inc | 369 |
27 | MasterCard Inc | 332 |
29 | Alibaba Group Holding Ltd | 305 |
IT_NIPE_cleaned = df8.groupby('Company_name').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
IT_NIPE_cleaned.head(5)
Company_name | NIPE_MM | |
---|---|---|
0 | Visa Inc | 607 |
1 | Apple Inc | 461 |
2 | Facebook Inc | 369 |
3 | MasterCard Inc | 332 |
4 | Alibaba Group Holding Ltd | 305 |
#standard deviation in NIPE by industry
df9 = NIPE [[3,1]]
#converting NIPE to float/#
df9[['NIPE_MM']] = df9[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df10 = df9.groupby('Industry', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)
#cleaning column names
df10.columns = ['Industry', 'NIPE_Std_dev'] #renaming columns
df10.head(5)
Industry | NIPE_Std_dev | |
---|---|---|
44 | Oil, Gas & Consumable Fuels | 1092.0 |
48 | Real Estate Investment Trusts (REITs) | 989.0 |
7 | Biotechnology | 596.0 |
56 | Tobacco | 427.0 |
41 | Metals & Mining | 331.0 |
#standard deviation in NIPE by sector
df11 = NIPE [[2,1]]
#converting NIPE to float/#
df11[['NIPE_MM']] = df11[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df12 = df11.groupby('Sector', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)
df12.columns = ['Sector', 'NIPE_Std_dev'] #renaming columns
df12.head(10)
Sector | NIPE_Std_dev | |
---|---|---|
2 | Energy | 1058.0 |
3 | Financials | 510.0 |
4 | Health Care | 281.0 |
7 | Materials | 204.0 |
1 | Consumer Staples | 182.0 |
6 | Information Technology | 135.0 |
8 | Telecommunication Services | 73.0 |
0 | Consumer Discretionary | 56.0 |
5 | Industrials | 48.0 |
9 | Utilities | 40.0 |
#avg NIPE by sector
df13 = NIPE [[2,1]]
df14 = df13.groupby('Sector').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
df14.columns = ['Sector', 'Avg_NIPE'] #renaming columns
df14.head(5)
Sector | Avg_NIPE | |
---|---|---|
0 | Financials | 185.0 |
1 | Health Care | 92.0 |
2 | Consumer Staples | 86.0 |
3 | Utilities | 83.0 |
4 | Information Technology | 74.0 |
#joining avg_NIPE w/ standard deviation NIPE table
j1 = df14.merge(df12, how='left', on="Sector")
j1.head(10)
Sector | Avg_NIPE | NIPE_Std_dev | |
---|---|---|---|
0 | Financials | 185.0 | 510.0 |
1 | Health Care | 92.0 | 281.0 |
2 | Consumer Staples | 86.0 | 182.0 |
3 | Utilities | 83.0 | 40.0 |
4 | Information Technology | 74.0 | 135.0 |
5 | Industrials | 43.0 | 48.0 |
6 | Telecommunication Services | 40.0 | 73.0 |
7 | Consumer Discretionary | 38.0 | 56.0 |
8 | Materials | 35.0 | 204.0 |
9 | Energy | -393.0 | 1058.0 |
#bar chart
x = j1['Sector']
y1 = j1['Avg_NIPE']
y2 = j1['NIPE_Std_dev']
trace0 = go.Bar(
x=x,
y=y1,
name='Average NIPE',
marker=dict(
color='rgb(255, 217, 102)',
line=dict(
color='rgb(8,48,107)',
width=1.5
),
),
opacity=0.6
)
trace1 = go.Bar(
x=x,
y=y2,
name='Standard deviation',
marker=dict(
color='rgb(84, 226, 129)',
line=dict(
color='rgb(8,48,107)',
width=1.5
),
),
opacity=0.6
)
data = [trace0, trace1]
layout = go.Layout(
title='mean NIPE by sector, with standard deviation',
xaxis=dict(
title='industry',
tickangle = 47,
),
yaxis=dict(
title='NIPE (MM)'
),
margin=go.Margin(
b = 220
)
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_sector_stddev')
#setting up y values for box plot (step 0), loading in the data I want
sector_nipe = NIPE[['Sector', 'NIPE_MM']]
sector_nipe[['NIPE_MM']] = sector_nipe[['NIPE_MM']].astype(float)
#setting up x variables for box plot
x_data1 = {'Consumer Discretionary','Consumer Staples','Energy',\
'Financials', 'Health Care','Industrials',\
'Information Technology','Materials','Telecommunication Services','Utilities'}
x_data1
{'Consumer Discretionary', 'Consumer Staples', 'Energy', 'Financials', 'Health Care', 'Industrials', 'Information Technology', 'Materials', 'Telecommunication Services', 'Utilities'}
#setting up y values for box plot (step 1)
sector_CD = sector_nipe[sector_nipe['Sector'] == 'Consumer Discretionary'].copy()
ConsumerDiscretionary = sector_CD[['NIPE_MM']].copy()
sector_CS = sector_nipe[sector_nipe['Sector']== 'Consumer Staples'].copy()
ConsumerStaples = sector_CS[['NIPE_MM']].copy()
sector_energy = sector_nipe[sector_nipe['Sector'] == 'Energy'].copy()
Energy = sector_energy[['NIPE_MM']].copy()
sector_financials = sector_nipe[sector_nipe['Sector'] == 'Financials'].copy()
Financials = sector_financials[['NIPE_MM']].copy()
sector_healthcare = sector_nipe[sector_nipe['Sector'] == 'Health Care'].copy()
Healthcare = sector_healthcare[['NIPE_MM']].copy()
sector_industrials = sector_nipe[sector_nipe['Sector'] == 'Industrials'].copy()
Industrials = sector_industrials[['NIPE_MM']].copy()
sector_IT = sector_nipe[sector_nipe['Sector'] == 'Information Technology'].copy()
InformationTechnology = sector_IT[['NIPE_MM']].copy()
sector_materials = sector_nipe[sector_nipe['Sector'] == 'Materials'].copy()
Materials = sector_materials[['NIPE_MM']].copy()
sector_TS = sector_nipe[sector_nipe['Sector'] == 'Telecommunication Services'].copy()
TelecommunicationServices = sector_TS[['NIPE_MM']].copy()
sector_utilities = sector_nipe[sector_nipe['Sector'] == 'Utilities'].copy()
Utilities = sector_utilities[['NIPE_MM']].copy()
#setting up y values for box plot (step 2)
y_data = [
Industrials.values,
Energy.values,
Utilities.values,
ConsumerStaples.values,
Healthcare.values,
Materials.values,
TelecommunicationServices.values,
Financials.values,
ConsumerDiscretionary.values,
InformationTechnology.values
]
#putting together the box plot
traces = []
for xd, yd in zip(x_data1, y_data):
traces.append(go.Box(
y=yd,
name=xd,
boxpoints='all',
jitter=0.5,
whiskerwidth=0.2,
marker=dict(
size=2,
),
line=dict(width=1),
))
layout = go.Layout(
title='NIPE distribution by sector',
yaxis=dict(
range=[-300, 350],
showgrid=True,
zeroline=True,
#dtick=5,
gridcolor='rgb(255, 255, 255)',
gridwidth=1,
zerolinecolor='rgb(255, 255, 255)',
zerolinewidth=2,
),
margin=go.Margin(
l=50,
r=90,
b=280,
t=60,
pad=4
),
width=800,
height=650,
showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename = 'NIPE_distribution_sector')