# import
import graphlab as gl
from matplotlib import pyplot as plt
# render matplotlib figures inline in the notebook (IPython magic)
%matplotlib inline
# send GraphLab Canvas output to the web browser; port=None is passed explicitly
# (presumably lets GraphLab pick the port -- confirm against the Canvas docs)
gl.canvas.set_target("browser", port=None) # use "ipynb" for inline visualization
# load the house-sales data from the saved SFrame at "home_data.gl/"
data = gl.SFrame("home_data.gl/")
# echo the SFrame so the notebook displays a preview of its rows
data
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront |
---|---|---|---|---|---|---|---|---|
7129300520 | 2014-10-13 00:00:00+00:00 | 221900 | 3 | 1 | 1180 | 5650 | 1 | 0 |
6414100192 | 2014-12-09 00:00:00+00:00 | 538000 | 3 | 2.25 | 2570 | 7242 | 2 | 0 |
5631500400 | 2015-02-25 00:00:00+00:00 | 180000 | 2 | 1 | 770 | 10000 | 1 | 0 |
2487200875 | 2014-12-09 00:00:00+00:00 | 604000 | 4 | 3 | 1960 | 5000 | 1 | 0 |
1954400510 | 2015-02-18 00:00:00+00:00 | 510000 | 3 | 2 | 1680 | 8080 | 1 | 0 |
7237550310 | 2014-05-12 00:00:00+00:00 | 1225000 | 4 | 4.5 | 5420 | 101930 | 1 | 0 |
1321400060 | 2014-06-27 00:00:00+00:00 | 257500 | 3 | 2.25 | 1715 | 6819 | 2 | 0 |
2008000270 | 2015-01-15 00:00:00+00:00 | 291850 | 3 | 1.5 | 1060 | 9711 | 1 | 0 |
2414600126 | 2015-04-15 00:00:00+00:00 | 229500 | 3 | 1 | 1780 | 7470 | 1 | 0 |
3793500160 | 2015-03-12 00:00:00+00:00 | 323000 | 3 | 2.5 | 1890 | 6560 | 2 | 0 |
view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat |
---|---|---|---|---|---|---|---|---|
0 | 3 | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.51123398 |
0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.72102274 |
0 | 3 | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.73792661 |
0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.52082 |
0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.61681228 |
0 | 3 | 11 | 3890 | 1530 | 2001 | 0 | 98053 | 47.65611835 |
0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 47.30972002 |
0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 47.40949984 |
0 | 3 | 7 | 1050 | 730 | 1960 | 0 | 98146 | 47.51229381 |
0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 47.36840673 |
long | sqft_living15 | sqft_lot15 |
---|---|---|
-122.25677536 | 1340.0 | 5650.0 |
-122.3188624 | 1690.0 | 7639.0 |
-122.23319601 | 2720.0 | 8062.0 |
-122.39318505 | 1360.0 | 5000.0 |
-122.04490059 | 1800.0 | 7503.0 |
-122.00528655 | 4760.0 | 101930.0 |
-122.32704857 | 2238.0 | 6819.0 |
-122.31457273 | 1650.0 | 9711.0 |
-122.33659507 | 1780.0 | 8113.0 |
-122.0308176 | 2390.0 | 7570.0 |
# open the whole SFrame in GraphLab Canvas (the target is the browser at this point)
data.show()
Canvas is accessible via web browser at the URL: http://localhost:21695/index.html Opening Canvas in default web browser.
# switch Canvas output from the browser to the notebook ("ipynb" target)
gl.canvas.set_target("ipynb")
# scatter plot view: price against living area
data.show(view="Scatter Plot", x="sqft_living", y="price")
# splitting the data: 80% for training, the remainder for testing (seed fixed at 0)
train_data, test_data = data.random_split(fraction=0.8, seed=0)
# fit a linear regression that predicts price from sqft_living alone
clf = gl.linear_regression.create(
    train_data,
    target="price",
    features=["sqft_living"],
)
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while. You can set ``validation_set=None`` to disable validation tracking.
Linear regression:
--------------------------------------------------------
Number of examples : 16474
Number of features : 1
Number of unpacked features : 1
Number of coefficients : 2
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
| Iteration | Passes | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
+-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
| 1 | 2 | 1.098370 | 4336865.368528 | 1673632.160487 | 263345.228506 | 255619.740349 |
+-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
SUCCESS: Optimal solution found.
# Evaluate the simple model
# start by previewing the first four rows of the test data
test_data[:4]
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront |
---|---|---|---|---|---|---|---|---|
0114101516 | 2014-05-28 00:00:00+00:00 | 310000 | 3 | 1 | 1430 | 19901 | 1.5 | 0 |
9297300055 | 2015-01-24 00:00:00+00:00 | 650000 | 4 | 3 | 2950 | 5000 | 2 | 0 |
1202000200 | 2014-11-03 00:00:00+00:00 | 233000 | 3 | 2 | 1710 | 4697 | 1.5 | 0 |
8562750320 | 2014-11-10 00:00:00+00:00 | 580500 | 3 | 2.5 | 2320 | 3980 | 2 | 0 |
view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat |
---|---|---|---|---|---|---|---|---|
0 | 4 | 7 | 1430 | 0 | 1927 | 0 | 98028 | 47.75584254 |
3 | 3 | 9 | 1980 | 970 | 1979 | 0 | 98126 | 47.57136955 |
0 | 5 | 6 | 1710 | 0 | 1941 | 0 | 98002 | 47.30482931 |
0 | 3 | 8 | 2320 | 0 | 2003 | 0 | 98027 | 47.5391103 |
long | sqft_living15 | sqft_lot15 |
---|---|---|
-122.22874498 | 1780.0 | 12697.0 |
-122.37541218 | 2140.0 | 4000.0 |
-122.21774909 | 1030.0 | 4705.0 |
-122.06971484 | 2580.0 | 3980.0 |
# mean price over the test set
test_data['price'].mean()
543054.0425632534
# evaluate the single-feature model on the test set
# (the result reports max_error and rmse)
clf.evaluate(test_data)
{'max_error': 4133533.0970407226, 'rmse': 255214.18419781374}
# plot the test data: actual prices as dots and the model's predictions as a line,
# both against sqft_living
plt.plot(test_data["sqft_living"], test_data["price"], ".",
test_data["sqft_living"], clf.predict(test_data), '-')
[<matplotlib.lines.Line2D at 0x242175c0>, <matplotlib.lines.Line2D at 0x24217668>]
# inspect the fitted coefficients (intercept and the sqft_living weight)
clf.get("coefficients")
name | index | value | stderr |
---|---|---|---|
(intercept) | None | -49178.3910177 | 5063.19949956 |
sqft_living | None | 283.179503941 | 2.22580563108 |
# Exploring some more features
my_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "zipcode",
]
# look at just those columns in Canvas
data[my_features].show()
# box-and-whisker view of price for each zipcode
data.show(view="BoxWhisker Plot", x="zipcode", y="price")
# creating another model on the wider feature set, with validation tracking disabled
clf_mine = gl.linear_regression.create(
    train_data,
    target="price",
    features=my_features,
    validation_set=None,
)
Linear regression:
--------------------------------------------------------
Number of examples : 17384
Number of features : 6
Number of unpacked features : 6
Number of coefficients : 115
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+--------------------+---------------+
| Iteration | Passes | Elapsed Time | Training-max_error | Training-rmse |
+-----------+----------+--------------+--------------------+---------------+
| 1 | 2 | 0.025016 | 3763208.270523 | 181908.848367 |
+-----------+----------+--------------+--------------------+---------------+
SUCCESS: Optimal solution found.
# evaluate on the test set: the single-feature model (Model 1)
# against the my_features model (Model 2)
print "Model 1 :", clf.evaluate(test_data)
print "Model 2 :",clf_mine.evaluate(test_data)
Model 1 : {'max_error': 4133533.0970407226, 'rmse': 255214.18419781374} Model 2 : {'max_error': 3486584.509381705, 'rmse': 179542.4333126903}
# predicting some house prices
# select the row(s) whose id equals '5309101200' (id is compared as a string)
house1 = data[data['id']=='5309101200']
house1
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront |
---|---|---|---|---|---|---|---|---|
5309101200 | 2014-06-05 00:00:00+00:00 | 620000 | 4 | 2.25 | 2400 | 5350 | 1.5 | 0 |
view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat |
---|---|---|---|---|---|---|---|---|
0 | 4 | 7 | 1460 | 940 | 1929 | 0 | 98117 | 47.67632376 |
long | sqft_living15 | sqft_lot15 |
---|---|---|
-122.37010126 | 1250.0 | 4880.0 |
# predicted price for house1: first from the my_features model,
# then from the single-feature model
print clf_mine.predict(house1)
print clf.predict(house1)
[721918.9333272863] [630452.4184407898]
# the my_features columns plus further attributes of the house and its surroundings
advanced_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat',            # the lat-long of the parcel
    'long',
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]
# fit a third model on the full feature list, again with validation tracking disabled
clf_adv = gl.linear_regression.create(
    train_data,
    target="price",
    features=advanced_features,
    validation_set=None,
)
Linear regression:
--------------------------------------------------------
Number of examples : 17384
Number of features : 18
Number of unpacked features : 18
Number of coefficients : 127
Starting Newton Method
--------------------------------------------------------
+-----------+----------+--------------+--------------------+---------------+
| Iteration | Passes | Elapsed Time | Training-max_error | Training-rmse |
+-----------+----------+--------------+--------------------+---------------+
| 1 | 2 | 0.056044 | 3469012.450686 | 154580.940736 |
+-----------+----------+--------------+--------------------+---------------+
SUCCESS: Optimal solution found.
# evaluate on the test set: the my_features model (Model 2)
# against the advanced_features model (Model 3)
print "Model 2 :", clf_mine.evaluate(test_data)
print "Model 3 :",clf_adv.evaluate(test_data)
Model 2 : {'max_error': 3486584.509381705, 'rmse': 179542.4333126903} Model 3 : {'max_error': 3556849.413858208, 'rmse': 156831.1168021901}
# Selection and summary statistics: We found the zip code with the highest average house price.
# What is the average house price of that zip code?
# (rows are selected by comparing zipcode with the string "98039")
data[data["zipcode"]=="98039"]["price"].mean()
2160606.5999999996
# Filtering data: What fraction of the houses have living space between 2000 sq.ft. and 4000 sq.ft.?
# first keep the houses with at least 2000 sq.ft. of living space ...
data1 = data[data["sqft_living"]>=2000]
# ... then, of those, keep the ones with at most 4000 sq.ft. (both bounds inclusive)
data2 = data1[data1["sqft_living"]<=4000]
9221
# total number of rows (houses) in the full data set
data.num_rows()
21613
# number of rows left after the 2000-4000 sq.ft. filter
data2.num_rows()
9221
# fraction of houses inside the 2000-4000 sq.ft. range, computed from the
# frames themselves instead of hand-copied row counts so it cannot go stale;
# float() keeps the division from truncating under Python 2
float(data2.num_rows()) / data.num_rows()
0.4266413732475825