Hypothesis Function 정의
임의의 $n$-벡터 $x^i =\{x_1, x_2,...,x_n\}$에 대해 Hypothesis Function $h_{\theta}(x^i)$ 는 다음과 같이 정의된다. $$h_{\theta}(x^i) = \theta_0 + \theta_1 x_1 + \theta_2 x_2 + ... + \theta_n x_n$$
계수 벡터 $\theta$를 구하는 수학적 모델
$$J(\theta) = \dfrac{1}{2m} \sum_{i=1}^m \big( h_\theta(x^i) - y^i \big)^2$$
$$\newcommand{\argmin}{\arg\!\min} \hat{\theta} = \argmin_\theta J(\theta)$$
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
%matplotlib inline
# Toy data set: one row per observed temperature with the matching number
# of units sold. Both columns are evenly spaced, so the relationship is
# perfectly linear.
data = {
    'Temperature': list(range(26, 34)),
    'Number of Sells': list(range(270, 350, 10)),
}
df = pd.DataFrame(data)

# Notebook-style display expressions: the frame, then each column as an array.
df
df['Temperature'].values
df['Number of Sells'].values
# Scatter plot of "Number of Sells" (y) against "Temperature" (x).
df.plot(kind="scatter", x="Temperature", y="Number of Sells")
# Least-squares fit of "Number of Sells" as a linear function of
# "Temperature". The five results stay as module-level names.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    df['Temperature'].values, df['Number of Sells'].values)

# Right-aligned 40-character label followed by a 10-decimal value.
# Named `fmt` so the builtin `format` is not shadowed.
fmt = "%40s: %12.10f"
report = [
    ("slope", slope),
    ("intercept", intercept),
    ("r_value (Correlation Coefficient)", r_value),
    ("r-squared (Coefficient of Determination)", r_value**2),
    ("p_value (Hyperthesis Testing)", p_value),
    ("std_err (Standard Error)", std_err),
]
for label, value in report:
    # Single-argument print(...) behaves the same as the print statement.
    print(fmt % (label, value))
회귀식: $y = intercept + slope \times x$
질문: 온도가 34일 때 예상 에어컨 판매량은? $10 + 10 \times 34 = 350$
r-value (Pearson correlation coefficient): $$ r = \frac{\mathrm{cov}(x, y)}{\sigma_x \sigma_y} $$
where
def cov(a, b):
    """Return the sample covariance (n - 1 denominator) of two sequences.

    Args:
        a: Sequence of numbers (list, NumPy array, pandas Series, ...).
        b: Sequence of numbers with the same length as ``a``.

    Returns:
        The sample covariance as a float. ``None`` if the two sequences
        differ in length (kept for backward compatibility). ``nan`` if
        fewer than two observations are given, because the n - 1
        denominator is then undefined.
    """
    if len(a) != len(b):
        return None

    n = len(a)
    if n < 2:
        # Avoids the division by zero (n == 1) and the meaningless 0 / -1
        # (n == 0) that the plain formula would produce.
        return float('nan')

    # Deviations from the mean, computed in one vectorized step.
    a_dev = np.asarray(a, dtype=float) - np.mean(a)
    b_dev = np.asarray(b, dtype=float) - np.mean(b)
    return np.dot(a_dev, b_dev) / (n - 1)
# Reproduce Pearson's r step by step: sample covariance divided by the
# product of the two sample standard deviations (ddof=1 throughout).
temperature = df['Temperature'].values
sells = df['Number of Sells'].values

# Off-diagonal entry of the 2x2 covariance matrix is cov(temperature, sells).
a = np.cov(temperature, sells, ddof=1)[0, 1]
print(a)
b = np.std(temperature, ddof=1)
print(b)
c = np.std(sells, ddof=1)
print(c)
print(a / (b * c))

# Cross-check against NumPy's own correlation coefficient. The `ddof`
# argument is omitted: for corrcoef it is deprecated and has no effect,
# because the normalisation cancels in the ratio.
np.corrcoef(temperature, sells)[0, 1]
References
The death rate is to be represented as a function of other variables.
import urllib2
import json
from contextlib import closing

path = 'https://raw.githubusercontent.com/bluebibi/LINK_ML_BIG_DATA/master/death_rate.csv'

# closing() guarantees the HTTP response is released once the CSV has been
# parsed, even if read_csv raises. `raw_csv` stays bound (closed) afterwards.
with closing(urllib2.urlopen(path)) as raw_csv:
    df = pd.read_csv(raw_csv)

df.head()
print(df['A1'].values)
print(df['D'].values)
# Pearson correlation of every predictor column A1..A15 with column D,
# keyed by the predictor number. `ddof` is not passed to corrcoef: it is
# deprecated there and never affected the result.
target = df['D'].values
corr_dic = {
    i: np.corrcoef(df['A' + str(i)].values, target)[0, 1]
    for i in range(1, 16)
}
print(corr_dic)
print('')

# (predictor number, correlation) pairs, largest correlation first.
sorted_corr_dic = sorted(corr_dic.items(), key=lambda item: item[1], reverse=True)
print(sorted_corr_dic)
# Keep only the three predictors examined below plus the target column D.
df_sub = df[['A9', 'A1', 'A6', 'D']]
df_sub.head()

fig = plt.figure(figsize=(17, 6))

# (subplot position, predictor column, title). The titles name the y axis
# "Death Rate": column D is the death rate, not a data rate.
panels = [
    (131, 'A9', "Death Rate vs. Size of the nonwhite population"),
    (132, 'A1', "Death Rate vs. Average annual precipitation"),
    (133, 'A6', "Death Rate vs. Number of years of schooling for persons over 22"),
]
axes = []
for position, column, title in panels:
    panel = fig.add_subplot(position)
    panel.scatter(df_sub[column], df_sub['D'])
    panel.set_title(title)
    axes.append(panel)

# Preserve the individual axis names for any later cell that uses them.
ax1, ax2, ax3 = axes
# Least-squares fit of D as a linear function of A9. The five results
# stay as module-level names.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    df_sub['A9'].values, df_sub['D'].values)

# Right-aligned 40-character label followed by a 10-decimal value.
# Named `fmt` so the builtin `format` is not shadowed.
fmt = "%40s: %12.10f"
report = [
    ("slope", slope),
    ("intercept", intercept),
    ("r_value (Correlation Coefficient)", r_value),
    ("r-squared (Coefficient of Determination)", r_value**2),
    ("p_value (Hyperthesis Testing)", p_value),
    ("std_err (Standard Error)", std_err),
]
for label, value in report:
    # Single-argument print(...) behaves the same as the print statement.
    print(fmt % (label, value))
# Simple linear regression of D on each predictor A1..A15. Each entry is a
# Series holding the six statistics in a fixed order.
predicator_analysis = {}
stat_names = ['slope', 'intercept', 'r_value', 'r_squared', 'p_value', 'std_err']
for i in range(1, 16):
    fit_slope, fit_intercept, fit_r, fit_p, fit_err = stats.linregress(
        df['A' + str(i)].values, df['D'].values)
    # Build the Series directly from the values instead of filling an
    # uninitialised np.empty buffer through integer keys: integer keys on a
    # string-labelled Series only work via a deprecated positional fallback.
    predicator_analysis[i] = Series(
        [fit_slope, fit_intercept, fit_r, fit_r ** 2, fit_p, fit_err],
        index=stat_names)
# Header row and data row templates: a 3-wide index column followed by six
# 15-wide columns.
format1 = "%3s %15s %15s %15s %15s %15s %15s"
format2 = "%3d %15f %15f %15f %15f %15f %15f"
print(format1 % ('No.', 'slope', 'intercept', 'r_value', 'r_squared', 'p_value', 'std_err'))
for i in range(1, 16):
    # .values yields the six statistics in stored order without relying on
    # integer keys being treated as positions on a string-labelled Series.
    print(format2 % ((i,) + tuple(predicator_analysis[i].values)))
# One panel per predictor: raw scatter of D against the predictor, with the
# fitted least-squares line drawn over the predictor's observed range.
fig = plt.figure(figsize=(17, 6))

axes = []
grids = []
for position, column in zip((131, 132, 133), ('A9', 'A1', 'A6')):
    panel = fig.add_subplot(position)
    panel.scatter(df_sub[column], df_sub['D'])

    # Ten evenly spaced x positions between the smallest and largest value.
    xs = np.linspace(df_sub[column].min(), df_sub[column].max(), 10)
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        df_sub[column].values, df_sub['D'].values)
    panel.plot(xs, intercept + slope * xs)

    axes.append(panel)
    grids.append(xs)

# Same module-level names as before; slope/intercept/... end up holding the
# A6 fit, the last one computed.
ax1, ax2, ax3 = axes
line_plot_x1, line_plot_x2, line_plot_x3 = grids
from sklearn import linear_model
# Multiple linear regression of D on the two predictors A9 and A1.
regr = linear_model.LinearRegression()
df[['A9', 'A1']].head()

# One (A9, A1) pair per sample. list() keeps X a concrete sequence even on
# interpreters where zip() is lazy.
X = list(zip(df['A9'], df['A1']))
print(X)
y = df['D'].values
regr = regr.fit(X, y)
print('Coefficients: %s' % (regr.coef_,))
print('Intercept: %s' % (regr.intercept_,))

test_x = [36, 12]
# predict() takes a 2-D (n_samples, n_features) input, so the single sample
# is wrapped in a list.
print(regr.predict([test_x]))
# Manual evaluation of the regression equation for comparison.
# NOTE(review): the constants are presumably rounded copies of the fitted
# intercept and coefficients printed above -- confirm against that output.
print(0.8280 + 0.0036 * test_x[0] + 0.0018 * test_x[1])
# Plot outputs
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(17, 7))

# Left panel: raw 3D scatter of (A9, A1, D).
ax1 = fig.add_subplot(121, projection='3d')
ax1.scatter(df['A9'], df['A1'], df['D'])

# Right panel: the same scatter with the regression plane drawn through it.
ax2 = fig.add_subplot(122, projection='3d')
ax2.scatter(df['A9'], df['A1'], df['D'])

# Integer grid over the observed range of both predictors. The upper bound
# is inclusive (+ 1) so the plane reaches the largest observed values
# instead of stopping one unit short.
xx, yy = np.meshgrid(
    np.arange(int(df['A9'].min()), int(df['A9'].max()) + 1),
    np.arange(int(df['A1'].min()), int(df['A1'].max()) + 1))

# Height of the plane at every grid point.
# NOTE(review): the constants are presumably the rounded intercept and
# coefficients from the earlier sklearn fit -- confirm.
z = 0.8280 + 0.0036 * xx + 0.0018 * yy
ax2.plot_surface(xx, yy, z, rstride=1, cstride=1, linewidth=0, color="yellow", shade=False)
import tensorflow as tf
import numpy as np
# Fill a 2x100 array with uniform random "fake" data, stored as float32.
x_data = np.random.rand(2, 100).astype(np.float32)

# Training labels (targets) follow the known linear rule
# y = 0.1 * x[0] + 0.2 * x[1] + 0.3, i.e. W = [0.1, 0.2] and b = 0.3.
true_weights = [0.100, 0.200]
true_bias = 0.300
y_data = np.dot(true_weights, x_data) + true_bias

print('%s %s' % (type(x_data), x_data.shape))
print('%s %s' % (type(y_data), y_data.shape))
import tensorflow as tf
import numpy as np
# Features as a 2 x n_samples float32 array (one row per predictor, hence
# the transpose) and the target column D as a float32 vector.
# `.values` replaces `.as_matrix()`, which is deprecated and absent from
# newer pandas releases; both return the same NumPy array.
x_data = df[['A9', 'A1']].T.values.astype('float32')
y_data = df['D'].values.astype('float32')

print('%s %s' % (type(x_data), x_data.shape))
print('%s %s' % (type(y_data), y_data.shape))
# Bias term, initialised to zero.
b = tf.Variable(tf.zeros([1]))

# Weight variable of shape 1x2, initialised to zero.
W = tf.Variable(tf.zeros([1, 2]))

# Linear model: W times the feature matrix, plus the bias.
weighted_inputs = tf.matmul(W, x_data)
y = weighted_inputs + b

print(W.get_shape())
print(y.get_shape())
# Loss: mean of the squared differences between predictions and targets.
loss = tf.reduce_mean(tf.square(y - y_data))

# Minimise the loss with plain gradient descent (learning rate 0.0005).
optimizer = tf.train.GradientDescentOptimizer(0.0005)

# Op that performs one optimisation step when run.
train = optimizer.minimize(loss)

# Op that initialises all variables. Prefer the current initializer and
# fall back to the deprecated name only on TensorFlow builds that lack it.
if hasattr(tf, 'global_variables_initializer'):
    init = tf.global_variables_initializer()
else:
    init = tf.initialize_all_variables()
# Start a session and initialise the variables.
sess = tf.Session()
sess.run(init)

# Run 200001 training steps (step 0 through 200000 inclusive) and report
# the current weights and bias every 10000 steps, including the last one.
for step in xrange(200001):
    sess.run(train)
    if step % 10000 != 0:
        continue
    current_w = sess.run(W)
    current_b = sess.run(b)
    print('%s %s %s' % (step, current_w, current_b))