# Load the dataset
import pandas as pd
# Location of the Cars93 CSV, relative to the current working directory.
filename = "../data/cars93.csv"
df = pd.read_csv(filename)
# Dimensions of the loaded frame as (rows, columns).
df.shape
(93, 27)
# Inspect the column labels of the loaded frame.
df.columns
Index(['Manufacturer', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price', 'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders', 'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail', 'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width', 'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight', 'Origin', 'Make'], dtype='object')
# Preview the first eight rows of the frame.
df.head(n=8)
Manufacturer | Model | Type | Min.Price | Price | Max.Price | MPG.city | MPG.highway | AirBags | DriveTrain | ... | Passengers | Length | Wheelbase | Width | Turn.circle | Rear.seat.room | Luggage.room | Weight | Origin | Make | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Acura | Integra | Small | 12.9 | 15.9 | 18.8 | 25 | 31 | None | Front | ... | 5 | 177 | 102 | 68 | 37 | 26.5 | 11.0 | 2705 | non-USA | Acura Integra |
1 | Acura | Legend | Midsize | 29.2 | 33.9 | 38.7 | 18 | 25 | Driver & Passenger | Front | ... | 5 | 195 | 115 | 71 | 38 | 30.0 | 15.0 | 3560 | non-USA | Acura Legend |
2 | Audi | 90 | Compact | 25.9 | 29.1 | 32.3 | 20 | 26 | Driver only | Front | ... | 5 | 180 | 102 | 67 | 37 | 28.0 | 14.0 | 3375 | non-USA | Audi 90 |
3 | Audi | 100 | Midsize | 30.8 | 37.7 | 44.6 | 19 | 26 | Driver & Passenger | Front | ... | 6 | 193 | 106 | 70 | 37 | 31.0 | 17.0 | 3405 | non-USA | Audi 100 |
4 | BMW | 535i | Midsize | 23.7 | 30.0 | 36.2 | 22 | 30 | Driver only | Rear | ... | 4 | 186 | 109 | 69 | 39 | 27.0 | 13.0 | 3640 | non-USA | BMW 535i |
5 | Buick | Century | Midsize | 14.2 | 15.7 | 17.3 | 22 | 31 | Driver only | Front | ... | 6 | 189 | 105 | 69 | 41 | 28.0 | 16.0 | 2880 | USA | Buick Century |
6 | Buick | LeSabre | Large | 19.9 | 20.8 | 21.7 | 19 | 28 | Driver only | Front | ... | 6 | 200 | 111 | 74 | 42 | 30.5 | 17.0 | 3470 | USA | Buick LeSabre |
7 | Buick | Roadmaster | Large | 22.6 | 23.7 | 24.9 | 16 | 25 | Driver only | Rear | ... | 6 | 216 | 116 | 78 | 45 | 30.5 | 21.0 | 4105 | USA | Buick Roadmaster |
8 rows × 27 columns
# Select the 'MPG.city' column. Bracket access is needed because the label
# contains a dot, so attribute-style access (df.MPG.city) would not resolve it.
# NOTE(review): no output is recorded for this expression in the export.
df['MPG.city']
# Show the dtype of every column.
df.dtypes
Manufacturer object Model object Type object Min.Price float64 Price float64 Max.Price float64 MPG.city int64 MPG.highway int64 AirBags object DriveTrain object Cylinders object EngineSize float64 Horsepower int64 RPM int64 Rev.per.mile int64 Man.trans.avail object Fuel.tank.capacity float64 Passengers int64 Length int64 Wheelbase int64 Width int64 Turn.circle int64 Rear.seat.room float64 Luggage.room float64 Weight int64 Origin object Make object dtype: object
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of engine horsepower, split into 10 bins.
df['Horsepower'].hist(bins=10)
<matplotlib.axes._subplots.AxesSubplot at 0x10de4de10>
# Bracket-style column access: preview the first rows of 'Manufacturer'.
df['Manufacturer'].head()
0 Acura 1 Acura 2 Audi 3 Audi 4 BMW Name: Manufacturer, dtype: object
# Attribute-style access to the 'Manufacturer' column. This only works when
# the column label is a valid Python identifier.
df.Manufacturer.head()
0 Acura 1 Acura 2 Audi 3 Audi 4 BMW Name: Manufacturer, dtype: object
# Summary statistics (count, mean, std, min, quartiles, max) for 'MPG.city'.
df['MPG.city'].describe()
count 93.000000 mean 22.365591 std 5.619812 min 15.000000 25% 18.000000 50% 21.000000 75% 25.000000 max 46.000000 Name: MPG.city, dtype: float64
# Pairwise correlation matrix of the numeric columns.
# The frame also holds string columns (Manufacturer, Model, Type, ...).
# Older pandas dropped them silently; pandas >= 2.0 raises when corr() meets
# non-numeric data. Selecting the numeric columns explicitly works on both.
df.select_dtypes(include="number").corr()
Min.Price | Price | Max.Price | MPG.city | MPG.highway | EngineSize | Horsepower | RPM | Rev.per.mile | Fuel.tank.capacity | Passengers | Length | Wheelbase | Width | Turn.circle | Rear.seat.room | Luggage.room | Weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min.Price | 1.000000 | 0.970601 | 0.906756 | -0.622875 | -0.579966 | 0.645488 | 0.802444 | -0.042598 | -0.470395 | 0.635369 | 0.061236 | 0.553859 | 0.516758 | 0.492878 | 0.428603 | 0.376642 | 0.413485 | 0.666554 |
Price | 0.970601 | 1.000000 | 0.981580 | -0.594562 | -0.560680 | 0.597425 | 0.788218 | -0.004955 | -0.426395 | 0.619480 | 0.057860 | 0.503628 | 0.500864 | 0.456028 | 0.392590 | 0.311499 | 0.366569 | 0.647179 |
Max.Price | 0.906756 | 0.981580 | 1.000000 | -0.547811 | -0.522561 | 0.535012 | 0.744445 | 0.025015 | -0.374024 | 0.581294 | 0.053216 | 0.442933 | 0.467501 | 0.408414 | 0.347785 | 0.247260 | 0.315315 | 0.605142 |
MPG.city | -0.622875 | -0.594562 | -0.547811 | 1.000000 | 0.943936 | -0.710003 | -0.672636 | 0.363045 | 0.695857 | -0.813144 | -0.416856 | -0.666239 | -0.667108 | -0.720534 | -0.666389 | -0.384347 | -0.494894 | -0.843139 |
MPG.highway | -0.579966 | -0.560680 | -0.522561 | 0.943936 | 1.000000 | -0.626795 | -0.619044 | 0.313469 | 0.587497 | -0.786039 | -0.466386 | -0.542897 | -0.615384 | -0.640359 | -0.593683 | -0.366684 | -0.371629 | -0.810658 |
EngineSize | 0.645488 | 0.597425 | 0.535012 | -0.710003 | -0.626795 | 1.000000 | 0.732120 | -0.547898 | -0.824009 | 0.759306 | 0.372721 | 0.780283 | 0.732484 | 0.867110 | 0.778464 | 0.502750 | 0.680827 | 0.845075 |
Horsepower | 0.802444 | 0.788218 | 0.744445 | -0.672636 | -0.619044 | 0.732120 | 1.000000 | 0.036688 | -0.600314 | 0.711790 | 0.009264 | 0.550865 | 0.486854 | 0.644413 | 0.561216 | 0.256732 | 0.359217 | 0.738798 |
RPM | -0.042598 | -0.004955 | 0.025015 | 0.363045 | 0.313469 | -0.547898 | 0.036688 | 1.000000 | 0.494764 | -0.333345 | -0.467138 | -0.441249 | -0.467812 | -0.539721 | -0.505651 | -0.342175 | -0.524845 | -0.427931 |
Rev.per.mile | -0.470395 | -0.426395 | -0.374024 | 0.695857 | 0.587497 | -0.824009 | -0.600314 | 0.494764 | 1.000000 | -0.609710 | -0.334976 | -0.690233 | -0.636824 | -0.780460 | -0.733160 | -0.377010 | -0.592792 | -0.735264 |
Fuel.tank.capacity | 0.635369 | 0.619480 | 0.581294 | -0.813144 | -0.786039 | 0.759306 | 0.711790 | -0.333345 | -0.609710 | 1.000000 | 0.472095 | 0.690461 | 0.757674 | 0.798719 | 0.671343 | 0.509689 | 0.613437 | 0.894018 |
Passengers | 0.061236 | 0.057860 | 0.053216 | -0.416856 | -0.466386 | 0.372721 | 0.009264 | -0.467138 | -0.334976 | 0.472095 | 1.000000 | 0.485294 | 0.694054 | 0.489979 | 0.449025 | 0.694134 | 0.653317 | 0.553273 |
Length | 0.553859 | 0.503628 | 0.442933 | -0.666239 | -0.542897 | 0.780283 | 0.550865 | -0.441249 | -0.690233 | 0.690461 | 0.485294 | 1.000000 | 0.823650 | 0.822148 | 0.738955 | 0.549958 | 0.712962 | 0.806274 |
Wheelbase | 0.516758 | 0.500864 | 0.467501 | -0.667108 | -0.615384 | 0.732484 | 0.486854 | -0.467812 | -0.636824 | 0.757674 | 0.694054 | 0.823650 | 1.000000 | 0.807213 | 0.723324 | 0.667259 | 0.734127 | 0.871895 |
Width | 0.492878 | 0.456028 | 0.408414 | -0.720534 | -0.640359 | 0.867110 | 0.644413 | -0.539721 | -0.780460 | 0.798719 | 0.489979 | 0.822148 | 0.807213 | 1.000000 | 0.817854 | 0.465618 | 0.673490 | 0.874961 |
Turn.circle | 0.428603 | 0.392590 | 0.347785 | -0.666389 | -0.593683 | 0.778464 | 0.561216 | -0.505651 | -0.733160 | 0.671343 | 0.449025 | 0.738955 | 0.723324 | 0.817854 | 1.000000 | 0.466328 | 0.585018 | 0.778043 |
Rear.seat.room | 0.376642 | 0.311499 | 0.247260 | -0.384347 | -0.366684 | 0.502750 | 0.256732 | -0.342175 | -0.377010 | 0.509689 | 0.694134 | 0.549958 | 0.667259 | 0.465618 | 0.466328 | 1.000000 | 0.651968 | 0.526250 |
Luggage.room | 0.413485 | 0.366569 | 0.315315 | -0.494894 | -0.371629 | 0.680827 | 0.359217 | -0.524845 | -0.592792 | 0.613437 | 0.653317 | 0.712962 | 0.734127 | 0.673490 | 0.585018 | 0.651968 | 1.000000 | 0.637226 |
Weight | 0.666554 | 0.647179 | 0.605142 | -0.843139 | -0.810658 | 0.845075 | 0.738798 | -0.427931 | -0.735264 | 0.894018 | 0.553273 | 0.806274 | 0.871895 | 0.874961 | 0.778043 | 0.526250 | 0.637226 | 1.000000 |
import seaborn as sns
# Correlation matrix restricted to numeric columns. String columns make
# DataFrame.corr() raise on pandas >= 2.0, so they are filtered out up front.
corr = df.select_dtypes(include="number").corr()

# One 9x9 inch figure. The axes object is passed to seaborn explicitly so the
# heatmap is drawn on this figure rather than on whatever axes is current.
fig, ax = plt.subplots(1, 1, figsize=(9, 9))
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    ax=ax,
)
<matplotlib.axes._subplots.AxesSubplot at 0x10f2e47b8>
# Summary statistics for the numeric columns. In the recorded output the
# 'count' row is below 93 for Rear.seat.room (91) and Luggage.room (82),
# which points to missing values in those two columns.
df.describe()
Min.Price | Price | Max.Price | MPG.city | MPG.highway | EngineSize | Horsepower | RPM | Rev.per.mile | Fuel.tank.capacity | Passengers | Length | Wheelbase | Width | Turn.circle | Rear.seat.room | Luggage.room | Weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 93.000000 | 91.000000 | 82.000000 | 93.000000 |
mean | 17.125806 | 19.509677 | 21.898925 | 22.365591 | 29.086022 | 2.667742 | 143.827957 | 5280.645161 | 2332.204301 | 16.664516 | 5.086022 | 183.204301 | 103.946237 | 69.376344 | 38.956989 | 27.829670 | 13.890244 | 3072.903226 |
std | 8.746029 | 9.659430 | 11.030457 | 5.619812 | 5.331726 | 1.037363 | 52.374410 | 596.731690 | 496.506525 | 3.279370 | 1.038979 | 14.602382 | 6.819674 | 3.778986 | 3.223265 | 2.989072 | 2.997967 | 589.896510 |
min | 6.700000 | 7.400000 | 7.900000 | 15.000000 | 20.000000 | 1.000000 | 55.000000 | 3800.000000 | 1320.000000 | 9.200000 | 2.000000 | 141.000000 | 90.000000 | 60.000000 | 32.000000 | 19.000000 | 6.000000 | 1695.000000 |
25% | 10.800000 | 12.200000 | 14.700000 | 18.000000 | 26.000000 | 1.800000 | 103.000000 | 4800.000000 | 1985.000000 | 14.500000 | 4.000000 | 174.000000 | 98.000000 | 67.000000 | 37.000000 | 26.000000 | 12.000000 | 2620.000000 |
50% | 14.700000 | 17.700000 | 19.600000 | 21.000000 | 28.000000 | 2.400000 | 140.000000 | 5200.000000 | 2340.000000 | 16.400000 | 5.000000 | 183.000000 | 103.000000 | 69.000000 | 39.000000 | 27.500000 | 14.000000 | 3040.000000 |
75% | 20.300000 | 23.300000 | 25.300000 | 25.000000 | 31.000000 | 3.300000 | 170.000000 | 5750.000000 | 2565.000000 | 18.800000 | 6.000000 | 192.000000 | 110.000000 | 72.000000 | 41.000000 | 30.000000 | 15.000000 | 3525.000000 |
max | 45.400000 | 61.900000 | 80.000000 | 46.000000 | 50.000000 | 5.700000 | 300.000000 | 6500.000000 | 3755.000000 | 27.000000 | 8.000000 | 219.000000 | 119.000000 | 78.000000 | 45.000000 | 36.000000 | 22.000000 | 4105.000000 |
# Show the column dtypes again, to tell categorical (object) columns apart
# from numeric ones.
df.dtypes
Manufacturer object Model object Type object Min.Price float64 Price float64 Max.Price float64 MPG.city int64 MPG.highway int64 AirBags object DriveTrain object Cylinders object EngineSize float64 Horsepower int64 RPM int64 Rev.per.mile int64 Man.trans.avail object Fuel.tank.capacity float64 Passengers int64 Length int64 Wheelbase int64 Width int64 Turn.circle int64 Rear.seat.room float64 Luggage.room float64 Weight int64 Origin object Make object dtype: object
# Number of cars in each 'Type' category, most frequent first.
df['Type'].value_counts()
Midsize 22 Small 21 Compact 16 Sporty 14 Large 11 Van 9 Name: Type, dtype: int64
# Number of cars per 'Origin' value (USA vs non-USA).
df['Origin'].value_counts()
USA 48 non-USA 45 Name: Origin, dtype: int64
# Boolean mask selecting midsize cars whose origin is non-USA.
condition = (df['Type'] == 'Midsize') & (df['Origin'] == 'non-USA')
condition
# Keep only the rows where the mask is True and report their (rows, columns).
df.loc[condition].shape
(10, 27)
# Count the rows in which the chosen column is missing.
# NOTE(review): the recorded output (4, 27) disagrees with the 'Origin' value
# counts recorded earlier (48 + 45 = 93 rows, i.e. no missing values). It was
# probably produced with a different `col`; re-run to confirm.
col = 'Origin'
condition = df[col].isna()
df.loc[condition].shape
(4, 27)
import numpy as np
# Log-transform the weight as log(1 + w). np.log1p is applied to the whole
# column in one vectorised call instead of a Python lambda per row.
df['log_weight'] = np.log1p(df['Weight'])
# Distribution of the log-transformed weight.
df['log_weight'].hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x10e0c16a0>
# Distribution of the raw weight, for comparison with the log-scaled version.
df.Weight.hist(bins=100)
<matplotlib.axes._subplots.AxesSubplot at 0x10e203630>