import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline
# Make the course data folder the working directory so the CSV below can be
# opened by its bare file name.
os.chdir(r"C:\Users\Gram\Desktop\아시아경제 수업자료\01 Python 분석 기초 - 실습\data")

# Load the coffee-shop table; the first row supplies the column names
# (read_csv's default header handling).
# NOTE(review): 'latin1' decodes any byte sequence without raising, so text
# stored in another encoding would be silently garbled -- confirm the file's
# real encoding.
df0 = pd.read_csv(
    'data_coffee.csv',
    encoding='latin1',
)

# Bare expression: (rows, columns) of the loaded frame, echoed by the notebook.
df0.shape
(46832, 23)
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage.
df0.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 46832 entries, 0 to 46831 Data columns (total 23 columns): number 46832 non-null int64 companyName 46832 non-null object adress 46600 non-null object adressBystreet 39548 non-null object dateOflicensing 46832 non-null int64 stateOfbusiness 46832 non-null object dateOfclosure 13245 non-null float64 startdateOfcessation 0 non-null float64 duedateOfcessation 0 non-null float64 dateOfreOpen 0 non-null float64 areaOfsite 0 non-null float64 zip 0 non-null float64 waterwork 29610 non-null object numOfmenWorker 5918 non-null float64 yearOfStart 46813 non-null float64 multipleUse 46813 non-null object grade 3775 non-null object sizeOfsite 46813 non-null float64 numOfwomenWorker 7042 non-null float64 vicintyOfsite 12894 non-null object sanitaryName 46813 non-null object businessCondition 46813 non-null object totalOfworker 5537 non-null float64 dtypes: float64(11), int64(2), object(10) memory usage: 8.2+ MB
# Keep the frame's column labels (a pandas Index) under `header`, then echo
# them as the cell result.
header = df0.columns
header
Index(['number', 'companyName', 'adress', 'adressBystreet', 'dateOflicensing', 'stateOfbusiness', 'dateOfclosure', 'startdateOfcessation', 'duedateOfcessation', 'dateOfreOpen', 'areaOfsite', 'zip', 'waterwork', 'numOfmenWorker', 'yearOfStart', 'multipleUse', 'grade', 'sizeOfsite', 'numOfwomenWorker', 'vicintyOfsite', 'sanitaryName', 'businessCondition', 'totalOfworker'], dtype='object')
# Number of missing values in each column (column-wise sum of the NaN mask).
df0.isna().sum()
number 0 companyName 0 adress 232 adressBystreet 7284 dateOflicensing 0 stateOfbusiness 0 dateOfclosure 33587 startdateOfcessation 46832 duedateOfcessation 46832 dateOfreOpen 46832 areaOfsite 46832 zip 46832 waterwork 17222 numOfmenWorker 40914 yearOfStart 19 multipleUse 19 grade 43057 sizeOfsite 19 numOfwomenWorker 39790 vicintyOfsite 33938 sanitaryName 19 businessCondition 19 totalOfworker 41295 dtype: int64
# Fraction of missing values in each column (column-wise mean of the NaN mask).
df0.isna().mean()
number 0.000000 companyName 0.000000 adress 0.004954 adressBystreet 0.155535 dateOflicensing 0.000000 stateOfbusiness 0.000000 dateOfclosure 0.717181 startdateOfcessation 1.000000 duedateOfcessation 1.000000 dateOfreOpen 1.000000 areaOfsite 1.000000 zip 1.000000 waterwork 0.367740 numOfmenWorker 0.873633 yearOfStart 0.000406 multipleUse 0.000406 grade 0.919393 sizeOfsite 0.000406 numOfwomenWorker 0.849633 vicintyOfsite 0.724675 sanitaryName 0.000406 businessCondition 0.000406 totalOfworker 0.881769 dtype: float64
# Take the 'sizeOfsite' column as a Series and discard its missing entries.
x0 = df0['sizeOfsite'].dropna()

# Bare expression: number of values left after dropping NaNs.
x0.shape
(46813,)
# Density-normalised histogram of every non-missing site size.
plt.hist(
    x0,
    density=True,
    bins=50,
    color='green',
)
plt.show()

# Keep only the values below 500 and draw the same kind of histogram again,
# so the plotted range is limited to that subset.
x = x0.loc[x0 < 500]
plt.hist(
    x,
    density=True,
    bins=50,
    color='blue',
)
plt.show()