A summary of the lecture "Introduction to Computational Thinking and Data Science", via MITx 6.00.2x (edX)
Note: "Field of study that gives computers the ability to learn without being explicitly programmed" - Arthur Samuel
- Modern statistics meets optimization
from lecture12_segment2 import *

# Feature vectors: four binary traits plus the number of legs (last element)
cobra = Animal('cobra', [1,1,1,1,0])
rattlesnake = Animal('rattlesnake', [1,1,1,1,0])
boa = Animal('boa\nconstrictor', [0,1,0,1,0])
chicken = Animal('chicken', [1,1,0,1,2])
alligator = Animal('alligator', [1,1,0,1,4])
dartFrog = Animal('dart frog', [1,0,1,0,4])
zebra = Animal('zebra', [0,0,0,0,4])
python = Animal('python', [1,1,0,1,0])
guppy = Animal('guppy', [0,1,0,0,0])
animals = [cobra, rattlesnake, boa, chicken, guppy,
           dartFrog, zebra, python, alligator]
compareAnimals(animals, 3)  # k = 3
# Applying scaling: rescale the number-of-legs feature so it no longer
# dominates the Euclidean distance (e.g., alligator: 4 legs -> 1)
cobra = Animal('cobra', [1,1,1,1,0])
rattlesnake = Animal('rattlesnake', [1,1,1,1,0])
boa = Animal('boa\nconstrictor', [0,1,0,1,0])
chicken = Animal('chicken', [1,1,0,1,2])
alligator = Animal('alligator', [1,1,0,1,1])
dartFrog = Animal('dart frog', [1,0,1,0,1])
zebra = Animal('zebra', [0,0,0,0,1])
python = Animal('python', [1,1,0,1,0])
guppy = Animal('guppy', [0,1,0,0,0])
animals = [cobra, rattlesnake, boa, chicken, guppy,
           dartFrog, zebra, python, alligator]
compareAnimals(animals, 3)  # k = 3
import numpy as np

def zScaleFeatures(vals):
    """Assumes vals is a sequence of floats.
       Returns vals shifted to mean 0 and scaled to standard
       deviation 1 (z-scaling)."""
    result = np.array(vals)
    mean = np.mean(vals)
    result = result - mean
    return result/np.std(result)

def iScaleFeatures(vals):
    """Assumes vals is a sequence of floats.
       Returns vals linearly mapped onto the interval [0, 1]
       (interpolation scaling)."""
    minVal, maxVal = min(vals), max(vals)
    fit = np.polyfit([minVal, maxVal], [0, 1], 1)
    return np.polyval(fit, vals)
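As a quick illustration, applying both scalings to the number-of-legs feature of the animals above (the variable name legs is mine, not from the lecture):

legs = [0, 0, 0, 2, 4, 4, 4, 0, 0]   # leg counts: cobra ... guppy
print(zScaleFeatures(legs))          # centered on 0, standard deviation 1
print(iScaleFeatures(legs))          # mapped onto [0, 1]; 2 legs -> 0.5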
- Why not divide variability by the size of the cluster? Because a big, bad cluster is worse than a small, bad one; leaving variability unnormalized penalizes large incoherent clusters more heavily (see the definitions after this list)
- Is the optimization problem simply finding a $C$ that minimizes $dissimilarity(C)$?
- No; otherwise we could put each example in its own cluster and drive the dissimilarity to zero
- Constraints are needed, e.g.
    - Minimum distance between clusters
    - Number of clusters
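For concreteness, the quantities the list refers to, restating the lecture's working definitions (with $mean(c)$ the centroid of cluster $c$):

$$variability(c) = \sum_{e \in c} distance(mean(c), e)^2$$

$$dissimilarity(C) = \sum_{c \in C} variability(c)$$

Note that $variability$ is not divided by the cluster size, which is exactly the point of the first bullet above.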
randomly choose k examples as initial centroids
while true:
    create k clusters by assigning each example to the closest centroid
    compute k new centroids by averaging the examples in each cluster
    if centroids don't change:
        break
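A minimal, self-contained NumPy rendering of this pseudocode (my own illustrative sketch, not the course's lecture12_segment3 implementation; the name kmeans_sketch and its signature are assumptions):

import numpy as np

def kmeans_sketch(points, k, seed=0, maxIters=100):
    """points: (n, d) array; returns (centroids, labels).
       Bare-bones k-means; assumes every cluster stays non-empty."""
    rng = np.random.default_rng(seed)
    # randomly choose k examples as the initial centroids
    centroids = points[rng.choice(len(points), k, replace=False)]
    for _ in range(maxIters):
        # assign each example to its closest centroid
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # average the members of each cluster to get new centroids
        newCentroids = np.array([points[labels == j].mean(axis=0)
                                 for j in range(k)])
        if np.allclose(newCentroids, centroids):  # centroids didn't change
            break
        centroids = newCentroids
    return centroids, labels

# e.g. kmeans_sketch(np.array([[0., 0.], [0., 1.], [9., 9.], [9., 8.]]), 2)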
from lecture12_segment3 import *
import random, pylab

# Generate 5 noisy examples around each of four true centers
centers = [(2, 3), (4, 6), (7, 4), (7, 7)]
examples = []
random.seed(0)
for c in centers:
    for i in range(5):
        xVal = c[0] + random.gauss(0, 0.5)
        yVal = c[1] + random.gauss(0, 0.5)
        name = str(c) + '-' + str(i)
        example = Example(name, pylab.array([xVal, yVal]))
        examples.append(example)
xVals, yVals = [], []
for e in examples:
    xVals.append(e.getFeatures()[0])
    yVals.append(e.getFeatures()[1])
random.seed(2)  # reseed so the initial centroids are reproducible
kmeans(examples, 4, True)  # k = 4, verbose output
Iteration #1
Cluster color = 0
Cluster with centroid [1.66014278 3.18525178] contains: (2, 3)-1
Cluster color = 1
Cluster with centroid [1.8494407 2.7367613] contains: (2, 3)-0, (2, 3)-2, (2, 3)-3, (2, 3)-4
Cluster color = 2
Cluster with centroid [5.57612073 6.33385138] contains: (4, 6)-0, (4, 6)-1, (4, 6)-2, (4, 6)-3, (4, 6)-4, (7, 7)-0, (7, 7)-1, (7, 7)-2, (7, 7)-3, (7, 7)-4
Cluster color = 3
Cluster with centroid [7.11402489 3.98797723] contains: (7, 4)-0, (7, 4)-1, (7, 4)-2, (7, 4)-3, (7, 4)-4
Iteration #2
Cluster color = 0
Cluster with centroid [1.49914988 3.08204521] contains: (2, 3)-1, (2, 3)-2, (2, 3)-4
Cluster color = 1
Cluster with centroid [2.28022797 2.44308067] contains: (2, 3)-0, (2, 3)-3
Cluster color = 2
Cluster with centroid [5.57612073 6.33385138] contains: (4, 6)-0, (4, 6)-1, (4, 6)-2, (4, 6)-3, (4, 6)-4, (7, 7)-0, (7, 7)-1, (7, 7)-2, (7, 7)-3, (7, 7)-4
Cluster color = 3
Cluster with centroid [7.11402489 3.98797723] contains: (7, 4)-0, (7, 4)-1, (7, 4)-2, (7, 4)-3, (7, 4)-4
Iteration #3
Cluster color = 0
Cluster with centroid [1.49914988 3.08204521] contains: (2, 3)-1, (2, 3)-2, (2, 3)-4
Cluster color = 1
Cluster with centroid [2.28022797 2.44308067] contains: (2, 3)-0, (2, 3)-3
Cluster color = 2
Cluster with centroid [5.57612073 6.33385138] contains: (4, 6)-0, (4, 6)-1, (4, 6)-2, (4, 6)-3, (4, 6)-4, (7, 7)-0, (7, 7)-1, (7, 7)-2, (7, 7)-3, (7, 7)-4
Cluster color = 3
Cluster with centroid [7.11402489 3.98797723] contains: (7, 4)-0, (7, 4)-1, (7, 4)-2, (7, 4)-3, (7, 4)-4
[<lecture12_segment3.Cluster at 0x227612ac8c8>, <lecture12_segment3.Cluster at 0x227612ac408>, <lecture12_segment3.Cluster at 0x227612ac588>, <lecture12_segment3.Cluster at 0x227612ac6c8>]
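Because k-means' output depends on the randomly chosen initial centroids, a standard mitigation, used below, is to run it several times and keep the clustering with the lowest dissimilarity.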
def tryKMeans(points, numTrials):  # function wrapper assumed; the slide shows only the body
    """Runs kMeans numTrials times and returns the clustering with the
       lowest dissimilarity. Assumes kMeans and dissimilarity are in
       scope (e.g., from the lecture code)."""
    best = kMeans(points)
    for t in range(numTrials):
        C = kMeans(points)
        if dissimilarity(C) < dissimilarity(best):
            best = C
    return best
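Repeated random restarts only mitigate k-means' sensitivity to initialization: the result is the best clustering found across the trials, not a guaranteed global minimum of dissimilarity.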